Source code for alab.analysis

#!/usr/bin/env python

# Copyright (C) 2015 University of Southern California and
#                          Nan Hua
# 
# Authors: Nan Hua
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function
__author__  = "Nan Hua"

__license__ = "GPL"
__version__ = "0.0.1"
__email__   = "nhua@usc.edu"


import numpy as np
import os.path
import h5py
import cPickle as pickle
import matrix
import utils
import files
import copy
import plots

[docs]class structuresummary(object):
    """
    This class offers a series of methods to study the model structure populations.
    
    Parameters
    ----------
    
    target : str
        the output directory for population structures, containing copy\*.hms files
        or can be seen as summary file \*.hss
    usegrp : str
        the probablility key used in modeling, e.g. p005j
    nstruct : int 
        number of structures to read
    """
    def __init__(self,target,usegrp=None,nstruct=10000,pid=10,**kwargs):
        if os.path.isdir(target):
            #the target is a valid population structure directory
            if usegrp==None:
                raise RuntimeError, "group key is not specified!"
            firststr    = files.modelstructures(os.path.join(target,"copy0.hms"),[usegrp])
            self.idx    = firststr.idx
            self.genome = firststr.genome
            self.usegrp = usegrp
            self.nbead  = len(self.idx)
            self.nstruct = nstruct
            self.radius  = firststr[-1].r
            print("reading %d structures info in %s,%s"%(nstruct,target,usegrp))
            while nstruct%pid != 0:
                pid = int(pid/2)
            self._readStructures(target,pid,**kwargs)
        elif os.path.isfile(target) and os.path.splitext(target)[1] == '.hss':
            h5f = h5py.File(target,'r')
            self.idx                   = h5f['idx'][:]
            self.genome                = pickle.loads(h5f['genome'].value)
            self.usegrp                = pickle.loads(h5f['usegrp'].value)
            self.nbead                 = pickle.loads(h5f['nbead'].value)
            self.nstruct               = pickle.loads(h5f['nstruct'].value)
            self.radius                = h5f['radius'][:]
            self.coordinates           = h5f['coordinates'][:]
            self.score                 = h5f['score'][:]
            self.consecutiveViolations = h5f['consecutiveViolations'][:]
            self.contactViolations     = h5f['contactViolations'][:]
            self.intraRestraints       = h5f['intraRestraints'][:]
            self.interRestraints       = h5f['interRestraints'][:]
        else:
            raise RuntimeError, "Invalid filename or file directory!"
        #-
        return None
    #==============================reading
    def _readStructures(self,target,pid,silence=True):
        """
        starting to read structures
        """
        import multiprocessing
        coor_shared       = multiprocessing.Array('d',self.nstruct*self.nbead*2*3) #10000 * 4640 * 3
        consecVio_shared  = multiprocessing.Array('d',self.nstruct)
        contactVio_shared = multiprocessing.Array('d',self.nstruct)
        score_shared      = multiprocessing.Array('d',self.nstruct)
        intrares_shared   = multiprocessing.Array('d',self.nstruct)
        interres_shared   = multiprocessing.Array('d',self.nstruct)
        readpool = []
        for k in range(pid):
            start = k*(self.nstruct/pid)
            end   = (k+1)*(self.nstruct/pid)
            process = multiprocessing.Process(target=self._readInfo,
                                              args=(target,start,end,self.usegrp,
                                                    coor_shared,
                                                    consecVio_shared,
                                                    contactVio_shared,
                                                    score_shared,
                                                    intrares_shared,
                                                    interres_shared,silence))
            process.start()
            readpool.append(process)
        
        for process in readpool:
            process.join()
        self.coordinates = np.frombuffer(coor_shared.get_obj()).reshape((self.nstruct,self.nbead*2,3))
        self.score       = np.frombuffer(score_shared.get_obj())
        self.consecutiveViolations = np.frombuffer(consecVio_shared.get_obj())
        self.contactViolations     = np.frombuffer(contactVio_shared.get_obj())
        self.intraRestraints = np.frombuffer(intrares_shared.get_obj())
        self.interRestraints = np.frombuffer(interres_shared.get_obj())
        return 0
    
    def _readInfo(self,target,start,end,usegrp,
                  coor_shared,
                  consecVio_shared,contactVio_shared,
                  score_shared,
                  intrares_shared,interres_shared,silence=True):
        """
        Read in all information
        """
        arr = np.frombuffer(coor_shared.get_obj())
        modelcoor = arr.reshape((self.nstruct,self.nbead*2,3)) #10000 * 4640 * 3
        for i in range(start,end):
            try:
                if not silence:
                    print(i)
                sts = files.modelstructures(os.path.join(target,'copy'+str(i)+'.hms'),[usegrp])
                
                modelcoor[i][:] = sts[0].xyz
                score_shared[i] = sts[0].score
                
                try:
                    consecVio_shared[i]  = sts[0].consecutiveViolations
                    contactVio_shared[i] = sts[0].contactViolations
                except:
                    consecVio_shared[i]  = np.nan
                    contactVio_shared[i] = np.nan
                try:
                    intrares_shared[i] = sts[0].intraRestraints
                    interres_shared[i] = sts[0].interRestraints
                except:
                    intrares_shared[i] = np.nan
                    interres_shared[i] = np.nan
                
            except RuntimeError:
                print("Can't find result for copy %s , %s at %s" %(str(i),usegrp,os.path.join(target,'copy'+str(i)+'.hms')))
        return 0
    #----------------------------

[docs]    def findBinIndex(self,chrom,start,end):
        """
        To find bead indexes given a chromosome region
        
        Parameters
        ----------
        
        chrom : str
            chromosome, should match the .idx representation
        start,end : int
            location range
        
        Returns
        -------
        Bin indexes array, or None if there is no valid ones
        """
        return utils.intersectMatrixIndex(self.idx,chrom,start,end)
    
    @property
    def totalRestraints(self):
        """
        Returns
        -------
        numpy array : all restraints for each structure
        """
        return self.interRestraints + self.intraRestraints
    
    @property
    def totalViolations(self):
        """
        Returns
        -------
        numpy array : all violations for each structure
        """
        return self.consecutiveViolations + self.contactViolations
    
    @property
    def violationPercentage(self):
        """
        Returns
        -------
        numpy array : violation percentage for each structure
        """
        return self.totalViolations / self.totalRestraints
    
[docs]    def getContactMap(self,contactRange=1):
        """
        Return contact matrix format contact heatmap
        """
        from scipy.spatial import distance
        modelmap        = matrix.contactmatrix(self.nbead)
        modelmap.idx    = copy.deepcopy(self.idx)
        modelmap.genome = copy.deepcopy(self.genome)
        modelmap.resolution = None
        n    = self.nbead*2
        dcap = distance.cdist(self.radius,-self.radius)*(contactRange+1)
        cmap = np.zeros((n,n))
        for i in range(self.nstruct):
            if (i+1)*10.0 % self.nstruct == 0:
                print("%.3f %%"%((i+1)*100.0 / self.nstruct))
            #print(i)
            dist   = distance.cdist(self.coordinates[i],self.coordinates[i],'euclidean')
            submap = dist <= dcap
            cmap  += submap
            
        modelmap.matrix = cmap[:n/2,:n/2]+cmap[:n/2,n/2:]+cmap[n/2:,:n/2]+cmap[n/2:,n/2:]
        modelmap.matrix = modelmap.matrix/self.nstruct/2
        
        return modelmap
    
[docs]    def getPairDistance(self,bead1,bead2):
        """
        Calculate pairwise distance across all structures
        
        Parameters
        ----------
        bead1,bead2 : int
            two bead in a pair, input to calculate distance
        
        Returns
        -------
        numpy array that has distance for the bead pair.
        """
        
        return np.linalg.norm(self.coordinates[:,bead1,:] - self.coordinates[:,bead2,:],axis=1)
    
[docs]    def getAveragePairwiseDistance(self,form='list'):
        """
        Calculate pairwise distance mean for each pair of beads in the structure population
        
        Parameters
        ----------
        
        form : str
            the return form of the function
            'list' return the list form
            'matrix' return the matrix form
        """
        pdistMean = np.zeros((2*self.nbead,2*self.nbead))
        for i in range(2*self.nbead):
            for j in range(i+1,2*self.nbead):
                dist = self.getPairDistance(i,j)
                pdistMean[i,j] = dist.mean()
                pdistMean[j,i] = pdistMean[i,j]
        
        if form == 'list':
            return pdistMean[np.triu_indices(2*self.nbead,1)]
        else:
            return pdistMean
    
[docs]    def getBeadRadialPosition(self,beads,nucleusRadius=5000.0):
        """
        Calculate radial position for every bead in the input list beads
        
        Parameters
        ----------
        beads : array-like
            list of all beads to calculate
        nucleusRadius : float
            radius of nucleus, default 5000(nm)
        
        Returns
        -------
        M*N matrix : radial position for all beads in the input and all structures in population
            M = len(beads)
            N = number of structures in population
        """
        allrp = []
        for i in np.array(beads):
            rp = np.linalg.norm(self.coordinates[:,i,:],axis=1) / nucleusRadius
            allrp.append(rp)
        return np.array(allrp)
    
[docs]    def getABCopyMeanBeadRadialPosition(self,nucleusRadius=5000.0):
        """
        Calculate mean radial position for every bead in structures, and differentiate diploid copy into A/B by inner or outer radial position
        
        Parameters
        ----------
        nucleusRadius : float
            radius of nucleus, default 5000(nm)
        
        Returns
        -------
        Two 1-D array : mean radian position for all beads, and first array contains the inner bead in the diploid genome.
        """
        
        rpA = self.getBeadRadialPosition(range(self.nbead),nucleusRadius)
        rpB = self.getBeadRadialPosition(range(self.nbead,self.nbead*2),nucleusRadius)
        
        rp = np.array([rpA,rpB])
        rp.sort(axis=0) #sort 
        rpmean = rp.mean(axis=2)
        
        return rpmean[0],rpmean[1]
            
[docs]    def getChromosomeRadialPosition(self,chrom,nucleusRadius=5000.0):
        """
        Calculate radial position for the chrom
        
        Parameters
        ----------
        chrom : str    
            the chromosome to calculate
        nucleusRadius : float
            radius of nucleus, default 5000(nm)
        
        Returns
        -------
        2N*1 vector : radial position for the chromosome in all structures in population
        """
        
        Aids = np.flatnonzero(self.idx['chrom'] == chrom)
        Bids = Aids + self.nbead
        
        radial = np.zeros(self.nstruct*2)
        for i in range(self.nstruct):
            xyzA = self.coordinates[i][Aids]
            xyzB = self.coordinates[i][Bids]
            comA = utils.centerOfMass(xyzA,self.radius[Aids])
            comB = utils.centerOfMass(xyzB,self.radius[Bids])
            
            radial[i*2]   = np.linalg.norm(comA)/nucleusRadius
            radial[i*2+1] = np.linalg.norm(comB)/nucleusRadius
        #-
        return radial
      
[docs]    def plotRadialPosition(self,figurename,chrom,format='pdf',color='dodgerblue',nucleusRadius=5000.0):
        """
        Plot Radial Position of beads for given chromosome
        
        Parameters
        ----------
        figurename : str
            name of the figure
        chrom : str
            given chromosome name
        color : str
            given the color for ploting 
        """
        Aids = np.flatnonzero(self.idx['chrom'] == chrom)
        Bids = Aids + self.nbead
        #calculate average radial position
        radial = (self.getBeadRadialPosition(Aids,nucleusRadius).mean(axis=1) + self.getBeadRadialPosition(Bids,nucleusRadius).mean(axis=1))/2
        #calculate midpos
        mega = 1.e-6
        midpos = mega*(self.idx[Aids]['end'] + self.idx[Aids]['start'])/2
        
        plots.plotxy(figurename,x = midpos,y = radial,format=format,
                     color = 'dodgerblue',
                     linewidth=2,
                     points=True,
                     xlim=(0,np.ceil(self.idx[Aids[-1]]['end']*mega)),
                     ylim=(0.1,0.9),
                     xlab='Genome Position(Mb)',
                     ylab='Radial Position',
                     title=chrom,
                     grid=True)

    #==========================saving
[docs]    def save(self,filename):
        """
        save all info into disk
        """
        
        if os.path.splitext(filename)[1] != '.hss':
            filename += '.hss'
        h5f = h5py.File(filename,'w')
        h5f.create_dataset('idx',data=self.idx,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('genome',data=pickle.dumps(self.genome))
        h5f.create_dataset('usegrp',data=pickle.dumps(self.usegrp))
        h5f.create_dataset('nbead',data=pickle.dumps(self.nbead))
        h5f.create_dataset('nstruct',data=pickle.dumps(self.nstruct))
        h5f.create_dataset('radius',data=self.radius,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('coordinates',data=self.coordinates,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('score',data=self.score,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('consecutiveViolations',data=self.consecutiveViolations,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('contactViolations',data=self.contactViolations,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('intraRestraints',data=self.intraRestraints,compression = 'gzip',compression_opts=9)
        h5f.create_dataset('interRestraints',data=self.interRestraints,compression = 'gzip',compression_opts=9)
        h5f.close()
        
        return 0
    
if __name__=='__main__':
    pass
    # s = structuresummary("structures/model0/" , usegrp="p005j" , nstruct=10000)
    # s.save('summary.hss')
    
    # #violations
    # print 'Violation Ratio:',s.totalViolations.mean()/s.totalRestraints.mean()
    
    # #heatmap
    # m = s.getContactMap()
    # m.plot('xx.pdf',format='pdf')
    # m.makeIntraMatrix('chr1').plot('xx.png')
    
    
    # #radial position
    # rp = s.getBeadRadialPosition(beads=range(len(s.idx)*2))