#!/usr/bin/env python
# Copyright (C) 2015 University of Southern California and
# Nan Hua
#
# Authors: Nan Hua
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function
__author__ = "Nan Hua"
__license__ = "GPL"
__version__ = "0.0.1"
__email__ = "nhua@usc.edu"
import numpy as np
import os.path
import h5py
import cPickle as pickle
import matrix
import utils
import files
import copy
import plots
[docs]class structuresummary(object):
"""
This class offers a series of methods to study the model structure populations.
Parameters
----------
target : str
the output directory for population structures, containing copy\*.hms files
or can be seen as summary file \*.hss
usegrp : str
the probablility key used in modeling, e.g. p005j
nstruct : int
number of structures to read
"""
def __init__(self,target,usegrp=None,nstruct=10000,pid=10,**kwargs):
if os.path.isdir(target):
#the target is a valid population structure directory
if usegrp==None:
raise RuntimeError, "group key is not specified!"
firststr = files.modelstructures(os.path.join(target,"copy0.hms"),[usegrp])
self.idx = firststr.idx
self.genome = firststr.genome
self.usegrp = usegrp
self.nbead = len(self.idx)
self.nstruct = nstruct
self.radius = firststr[-1].r
print("reading %d structures info in %s,%s"%(nstruct,target,usegrp))
while nstruct%pid != 0:
pid = int(pid/2)
self._readStructures(target,pid,**kwargs)
elif os.path.isfile(target) and os.path.splitext(target)[1] == '.hss':
h5f = h5py.File(target,'r')
self.idx = h5f['idx'][:]
self.genome = pickle.loads(h5f['genome'].value)
self.usegrp = pickle.loads(h5f['usegrp'].value)
self.nbead = pickle.loads(h5f['nbead'].value)
self.nstruct = pickle.loads(h5f['nstruct'].value)
self.radius = h5f['radius'][:]
self.coordinates = h5f['coordinates'][:]
self.score = h5f['score'][:]
self.consecutiveViolations = h5f['consecutiveViolations'][:]
self.contactViolations = h5f['contactViolations'][:]
self.intraRestraints = h5f['intraRestraints'][:]
self.interRestraints = h5f['interRestraints'][:]
else:
raise RuntimeError, "Invalid filename or file directory!"
#-
return None
#==============================reading
def _readStructures(self,target,pid,silence=True):
"""
starting to read structures
"""
import multiprocessing
coor_shared = multiprocessing.Array('d',self.nstruct*self.nbead*2*3) #10000 * 4640 * 3
consecVio_shared = multiprocessing.Array('d',self.nstruct)
contactVio_shared = multiprocessing.Array('d',self.nstruct)
score_shared = multiprocessing.Array('d',self.nstruct)
intrares_shared = multiprocessing.Array('d',self.nstruct)
interres_shared = multiprocessing.Array('d',self.nstruct)
readpool = []
for k in range(pid):
start = k*(self.nstruct/pid)
end = (k+1)*(self.nstruct/pid)
process = multiprocessing.Process(target=self._readInfo,
args=(target,start,end,self.usegrp,
coor_shared,
consecVio_shared,
contactVio_shared,
score_shared,
intrares_shared,
interres_shared,silence))
process.start()
readpool.append(process)
for process in readpool:
process.join()
self.coordinates = np.frombuffer(coor_shared.get_obj()).reshape((self.nstruct,self.nbead*2,3))
self.score = np.frombuffer(score_shared.get_obj())
self.consecutiveViolations = np.frombuffer(consecVio_shared.get_obj())
self.contactViolations = np.frombuffer(contactVio_shared.get_obj())
self.intraRestraints = np.frombuffer(intrares_shared.get_obj())
self.interRestraints = np.frombuffer(interres_shared.get_obj())
return 0
def _readInfo(self,target,start,end,usegrp,
coor_shared,
consecVio_shared,contactVio_shared,
score_shared,
intrares_shared,interres_shared,silence=True):
"""
Read in all information
"""
arr = np.frombuffer(coor_shared.get_obj())
modelcoor = arr.reshape((self.nstruct,self.nbead*2,3)) #10000 * 4640 * 3
for i in range(start,end):
try:
if not silence:
print(i)
sts = files.modelstructures(os.path.join(target,'copy'+str(i)+'.hms'),[usegrp])
modelcoor[i][:] = sts[0].xyz
score_shared[i] = sts[0].score
try:
consecVio_shared[i] = sts[0].consecutiveViolations
contactVio_shared[i] = sts[0].contactViolations
except:
consecVio_shared[i] = np.nan
contactVio_shared[i] = np.nan
try:
intrares_shared[i] = sts[0].intraRestraints
interres_shared[i] = sts[0].interRestraints
except:
intrares_shared[i] = np.nan
interres_shared[i] = np.nan
except RuntimeError:
print("Can't find result for copy %s , %s at %s" %(str(i),usegrp,os.path.join(target,'copy'+str(i)+'.hms')))
return 0
#----------------------------
[docs] def findBinIndex(self,chrom,start,end):
"""
To find bead indexes given a chromosome region
Parameters
----------
chrom : str
chromosome, should match the .idx representation
start,end : int
location range
Returns
-------
Bin indexes array, or None if there is no valid ones
"""
return utils.intersectMatrixIndex(self.idx,chrom,start,end)
@property
def totalRestraints(self):
"""
Returns
-------
numpy array : all restraints for each structure
"""
return self.interRestraints + self.intraRestraints
@property
def totalViolations(self):
"""
Returns
-------
numpy array : all violations for each structure
"""
return self.consecutiveViolations + self.contactViolations
@property
def violationPercentage(self):
"""
Returns
-------
numpy array : violation percentage for each structure
"""
return self.totalViolations / self.totalRestraints
[docs] def getPairDistance(self,bead1,bead2):
"""
Calculate pairwise distance across all structures
Parameters
----------
bead1,bead2 : int
two bead in a pair, input to calculate distance
Returns
-------
numpy array that has distance for the bead pair.
"""
return np.linalg.norm(self.coordinates[:,bead1,:] - self.coordinates[:,bead2,:],axis=1)
[docs] def getAveragePairwiseDistance(self,form='list'):
"""
Calculate pairwise distance mean for each pair of beads in the structure population
Parameters
----------
form : str
the return form of the function
'list' return the list form
'matrix' return the matrix form
"""
pdistMean = np.zeros((2*self.nbead,2*self.nbead))
for i in range(2*self.nbead):
for j in range(i+1,2*self.nbead):
dist = self.getPairDistance(i,j)
pdistMean[i,j] = dist.mean()
pdistMean[j,i] = pdistMean[i,j]
if form == 'list':
return pdistMean[np.triu_indices(2*self.nbead,1)]
else:
return pdistMean
[docs] def getBeadRadialPosition(self,beads,nucleusRadius=5000.0):
"""
Calculate radial position for every bead in the input list beads
Parameters
----------
beads : array-like
list of all beads to calculate
nucleusRadius : float
radius of nucleus, default 5000(nm)
Returns
-------
M*N matrix : radial position for all beads in the input and all structures in population
M = len(beads)
N = number of structures in population
"""
allrp = []
for i in np.array(beads):
rp = np.linalg.norm(self.coordinates[:,i,:],axis=1) / nucleusRadius
allrp.append(rp)
return np.array(allrp)
[docs] def getABCopyMeanBeadRadialPosition(self,nucleusRadius=5000.0):
"""
Calculate mean radial position for every bead in structures, and differentiate diploid copy into A/B by inner or outer radial position
Parameters
----------
nucleusRadius : float
radius of nucleus, default 5000(nm)
Returns
-------
Two 1-D array : mean radian position for all beads, and first array contains the inner bead in the diploid genome.
"""
rpA = self.getBeadRadialPosition(range(self.nbead),nucleusRadius)
rpB = self.getBeadRadialPosition(range(self.nbead,self.nbead*2),nucleusRadius)
rp = np.array([rpA,rpB])
rp.sort(axis=0) #sort
rpmean = rp.mean(axis=2)
return rpmean[0],rpmean[1]
[docs] def getChromosomeRadialPosition(self,chrom,nucleusRadius=5000.0):
"""
Calculate radial position for the chrom
Parameters
----------
chrom : str
the chromosome to calculate
nucleusRadius : float
radius of nucleus, default 5000(nm)
Returns
-------
2N*1 vector : radial position for the chromosome in all structures in population
"""
Aids = np.flatnonzero(self.idx['chrom'] == chrom)
Bids = Aids + self.nbead
radial = np.zeros(self.nstruct*2)
for i in range(self.nstruct):
xyzA = self.coordinates[i][Aids]
xyzB = self.coordinates[i][Bids]
comA = utils.centerOfMass(xyzA,self.radius[Aids])
comB = utils.centerOfMass(xyzB,self.radius[Bids])
radial[i*2] = np.linalg.norm(comA)/nucleusRadius
radial[i*2+1] = np.linalg.norm(comB)/nucleusRadius
#-
return radial
[docs] def plotRadialPosition(self,figurename,chrom,format='pdf',color='dodgerblue',nucleusRadius=5000.0):
"""
Plot Radial Position of beads for given chromosome
Parameters
----------
figurename : str
name of the figure
chrom : str
given chromosome name
color : str
given the color for ploting
"""
Aids = np.flatnonzero(self.idx['chrom'] == chrom)
Bids = Aids + self.nbead
#calculate average radial position
radial = (self.getBeadRadialPosition(Aids,nucleusRadius).mean(axis=1) + self.getBeadRadialPosition(Bids,nucleusRadius).mean(axis=1))/2
#calculate midpos
mega = 1.e-6
midpos = mega*(self.idx[Aids]['end'] + self.idx[Aids]['start'])/2
plots.plotxy(figurename,x = midpos,y = radial,format=format,
color = 'dodgerblue',
linewidth=2,
points=True,
xlim=(0,np.ceil(self.idx[Aids[-1]]['end']*mega)),
ylim=(0.1,0.9),
xlab='Genome Position(Mb)',
ylab='Radial Position',
title=chrom,
grid=True)
#==========================saving
[docs] def save(self,filename):
"""
save all info into disk
"""
if os.path.splitext(filename)[1] != '.hss':
filename += '.hss'
h5f = h5py.File(filename,'w')
h5f.create_dataset('idx',data=self.idx,compression = 'gzip',compression_opts=9)
h5f.create_dataset('genome',data=pickle.dumps(self.genome))
h5f.create_dataset('usegrp',data=pickle.dumps(self.usegrp))
h5f.create_dataset('nbead',data=pickle.dumps(self.nbead))
h5f.create_dataset('nstruct',data=pickle.dumps(self.nstruct))
h5f.create_dataset('radius',data=self.radius,compression = 'gzip',compression_opts=9)
h5f.create_dataset('coordinates',data=self.coordinates,compression = 'gzip',compression_opts=9)
h5f.create_dataset('score',data=self.score,compression = 'gzip',compression_opts=9)
h5f.create_dataset('consecutiveViolations',data=self.consecutiveViolations,compression = 'gzip',compression_opts=9)
h5f.create_dataset('contactViolations',data=self.contactViolations,compression = 'gzip',compression_opts=9)
h5f.create_dataset('intraRestraints',data=self.intraRestraints,compression = 'gzip',compression_opts=9)
h5f.create_dataset('interRestraints',data=self.interRestraints,compression = 'gzip',compression_opts=9)
h5f.close()
return 0
if __name__=='__main__':
pass
# s = structuresummary("structures/model0/" , usegrp="p005j" , nstruct=10000)
# s.save('summary.hss')
# #violations
# print 'Violation Ratio:',s.totalViolations.mean()/s.totalRestraints.mean()
# #heatmap
# m = s.getContactMap()
# m.plot('xx.pdf',format='pdf')
# m.makeIntraMatrix('chr1').plot('xx.png')
# #radial position
# rp = s.getBeadRadialPosition(beads=range(len(s.idx)*2))