Source code for alab.matrix

#!/usr/bin/env python

# Copyright (C) 2015 University of Southern California and
#                          Nan Hua
# 
# Authors: Nan Hua
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__author__  = "Nan Hua"
__credits__ = ["Nan Hua","Ke Gong","Harianto Tjong"]

__license__ = "GPL"
__version__ = "0.0.2"
__email__   = "nhua@usc.edu"

import numpy as np
import os.path
import re
import h5py
import copy
import cPickle
import warnings
from plots import plotxy, plotmatrix, histogram
import utils as alabutils

[docs]class contactmatrix(object): """ A flexible matrix instant that supports various methods for processing HiC contacts Parameters ---------- filename : str matrix file stored in hdf5 format or an integer for the matrix size to initialize an empty matrix instance genome : str genome e.g.'hg19','mm9' resolution : int the resolution for the hic matrix e.g. 100000 usechr : list containing the chromosomes used for generating the matrix Attributes ---------- matrix : numpy 2d array storing all infor for the hic contact matrix idx : numpy structure array matrix index genome : str the genome resolution : int resolution for the contact matrix """ _idxdtype = np.dtype([('chrom','S5'),('start',int),('end',int),('flag','S10')]) def __init__(self,filename,genome=None,resolution=None,usechr=['#','X']): self._applyedMethods = {} if isinstance(filename,int): self.matrix=np.zeros((filename,filename),dtype = np.float32) elif isinstance(filename,str): if not os.path.isfile(filename): raise IOError,"File %s doesn't exist!\n" % (filename) if os.path.splitext(filename)[1] == '.hdf5' or os.path.splitext(filename)[1] == '.hmat': h5f = h5py.File(filename,'r') self.matrix = h5f['matrix'][:] self.idx = h5f['idx'][:] if 'applyedMethods' in h5f.keys(): self._applyedMethods = cPickle.loads(h5f['applyedMethods'].value) if 'genome' in h5f.keys() and 'resolution' in h5f.keys(): self.genome = cPickle.loads(h5f['genome'].value) self.resolution = cPickle.loads(h5f['resolution'].value) h5f.close() else: from alabio import loadstream f = loadstream(filename) s = f.next() line = re.split('\t+|\s+',s.rstrip()) n = len(line) - 3 expectn = n if isinstance(genome,str) and isinstance(resolution,int): genomedb = alabutils.genome(genome,usechr=usechr) bininfo = genomedb.bininfo(resolution) expectn = len(bininfo.chromList) if expectn != n: raise RuntimeError, "Dimension don't match, expected %s bins , get %s bins. Please check the input." %(expectn,n) idx = [] i = 0 tidx = line[0:3];tidx.append('') idx.append(tidx) self.matrix = np.zeros((n,n),dtype = np.float32) self.matrix[i] = line[3:] for s in f: i += 1 line = re.split('\t+|\s+',s.rstrip()) tidx = line[0:3];tidx.append('') idx.append(tidx) self.matrix[i] = line[3:] f.close() self.idx = np.core.records.fromarrays(np.array(idx).transpose(),dtype=self._idxdtype) else: raise RuntimeError, "Undefined input filename type!\n" #----------------end filename if isinstance(genome,str) and isinstance(resolution,int): if hasattr(self,"genome") and hasattr(self,"resolution"): raise RuntimeError, "Genome and resolution has already been specified." genomedb = alabutils.genome(genome,usechr=usechr) bininfo = genomedb.bininfo(resolution) flaglist = ['' for i in range(len(bininfo.chromList))] self.genome = genome self.resolution = resolution self._buildindex(bininfo.chromList,bininfo.startList,bininfo.endList,flaglist) #================================================== def _buildindex(self,chromlist,startlist,endlist,flaglist): idxlist = np.column_stack([chromlist,startlist,endlist,flaglist]) self.idx = np.core.records.fromarrays(np.array(idxlist).transpose(),dtype=self._idxdtype)
[docs] def buildindex(self,**kwargs): warnings.warn("buildindex is deprecated, specify genome and resolution instead of building index manually.", DeprecationWarning) self._buildindex(**kwargs)
#-------------------------------------------------- def __str__(self): return self.matrix.__str__() def __repr__(self): return self.matrix.__repr__() def __len__(self): return self.matrix.__len__()
[docs] def rowsum(self): return self.matrix.sum(axis=1)
[docs] def columnsum(self): return self.matrix.sum(axis=0)
[docs] def applyed(self,method): if method in self._applyedMethods: return True else: return False
def _getZeroEntry(self): self.mask = np.flatnonzero(self.rowsum() == 0) def _getMask(self,mask = None): if mask is None: self._getZeroEntry() return 0 else: if isinstance(mask,np.ndarray): self.mask = mask return 1 else: raise TypeError, "Invalid argument type, numpy.ndarray is required" def __checkGenomeResolution(self,genome,resolution): if hasattr(self,"genome") and hasattr(self,"resolution"): pass else: warnings.warn("No genome and resolution is specified within the file, try to assign attributes.") if (genome is None) or (resolution is None): raise ValueError, "No genome info is found! Genome and resolution parameter must be specified." else: self.genome = genome self.resolution = resolution #=========================================================filtering methods
[docs] def removeDiagonal(self,force = False): if (not self.applyed('removeDiagonal')) or force: np.fill_diagonal(self.matrix,0) self._applyedMethods['removeDiagonal'] = True else: warnings.warn("Method removeDiagonal was done before, use force = True to overwrite it.")
[docs] def removePoorRegions(self, cutoff=1, usepvalue = 0.1, force = False): """Removes "cutoff" percent of bins with least counts Parameters ---------- cutoff : int, 0<cutoff<100 Percent of lowest-counts bins to be removed usepvalue: float, 0<usepvalue<1 use this pvalue as correlation cutoff to remove bins bins whose pvalue greater than this cutoff will be removed """ if (not self.applyed('removePoorRegions')) or force: rowsum = self.rowsum() mask = np.flatnonzero(rowsum < np.percentile(rowsum[rowsum > 0],cutoff)) #use correlation pvalue from scipy.stats import pearsonr,entropy newmask = [] for i in mask: if rowsum[i] == 0: continue split = alabutils.binomialSplit(self.matrix[i]) corr,pvalue = pearsonr(split[0],split[1])#returns corr rowentropy = entropy(self.matrix[i]) if pvalue > usepvalue: newmask.append(i) print i,rowsum[i],(corr,pvalue),"Remove",rowentropy else: print i,rowsum[i],(corr,pvalue),"Keep",rowentropy self.matrix[newmask,:] = 0 self.matrix[:,newmask] = 0 self.idx['flag'][newmask] = 'Removed' print "%d low converage bins were removed." % (len(newmask)) self._applyedMethods['removePoorRegions'] = (cutoff,len(newmask)) else: warnings.warn("Method removePoorRegions was done before, use force = True to overwrite it.")
[docs] def identifyInterOutliersCutoff(self,N=100): """ Identify interchromosome outliers' cutoff Do an N round random choice as the original contact freq distribution and estimate the sample std for every contact freq If the sample std is larger than half of the frequency (contact #), lable this contact frequency as spourious cutoff is set to the number that first consecutive 2 non-spourious frequency from the right side (scan from high frequency to low) """ if self.applyed('normalization'): raise RuntimeError, "Matrix is already normalized, raw matrix is needed." intermask = self.idx['chrom'][:,None] < self.idx['chrom'][None,:] interflatten = self.matrix[intermask] interflatten = interflatten[interflatten > 0] originHist = np.histogram(interflatten,int(interflatten.max()))[0] repeatResults = np.zeros((N,int(interflatten.max())),dtype=int) for i in range(N): tmpChoice = np.random.choice(interflatten,len(interflatten)) repeatResults[i] = alabutils.listadd(repeatResults[i],np.histogram(tmpChoice,int(tmpChoice.max()))[0]) comparison = np.std(repeatResults,axis=0) >= originHist/2 i = len(comparison) -1 while (comparison[i] or comparison[i-1]):i -= 1 cutoff = i+1 return cutoff
[docs] def smoothInterContactByCutoff(self,cutoff,w=3,s=3,p=3,force=False): """ given the cutoff, run a power law smoothing for the interchromosome matrix for contacts > cutoff """ if (not self.applyed('smoothByCutoff')) or force: intermask = self.idx['chrom'][:,None] < self.idx['chrom'][None,:] x, y = np.nonzero(intermask) pos = np.flatnonzero(self.matrix[intermask] > cutoff) for i in pos: cnew = alabutils.powerLawSmooth(self.matrix[ x[i]-w:x[i]+w+1 , y[i]-w:y[i]+w+1 ],(w,w),w,s,p) print x[i],y[i],self.matrix[x[i],y[i]],'-->',cnew self.matrix[x[i],y[i]] = cnew self.matrix[y[i],x[i]] = cnew self._applyedMethods['smoothByCutoff'] = cutoff else: warnings.warn("Method smoothInterContactByCutoff was done before,cutoff = %d use force = True to overwrite it."\ %(self._applyedMethods['smoothByCutoff']))
#========================================================normalization methods
[docs] def krnorm(self,mask = None,force=False,**kwargs): """ using krnorm balacing the matrix (overwriting the matrix!) Parameters ---------- mask : list/array mask is a 1-D vector with the same length as the matrix where 1s specify the row/column to be ignored or a 1-D vector specifing the indexes of row/column to be ignored if no mask is given, row/column with rowsum==0 will be automatically detected and ignored large_mem : bool when large_mem is set to 1, matrix product is calculated using small chunks, but this will slowdown the process a little bit. """ if (not self.applyed('normalization')) or force: from norm import bnewt self._getMask(mask) x = bnewt(self.matrix,mask=self.mask,check=0,**kwargs)*100 self.matrix *= x self.matrix *= x.T self._applyedMethods['normalization'] = 'krnorm' else: warnings.warn("Method %s was done before, use force = True to overwrite it." % (self._applyedMethods['normalization']))
[docs] def vcnorm(self,iterations=1,mask = None,force=False): if (not self.applyed('normalization')) or force: self._getMask(mask) for i in range(iterations): print "\tIterations:",i+1 rowsum = self.rowsum() rowsum[self.mask] = 0 totalsum = rowsum.sum() np.seterr(divide='ignore') rowsum = 1/rowsum rowsum[self.mask] = 0 self.matrix *= totalsum self.matrix *= rowsum self.matrix *= rowsum.T self._applyedMethods['normalization'] = 'vcnorm' else: warnings.warn("Method %s was done before, use force = True to overwrite it." % (self._applyedMethods['normalization']))
#-----------------------------------------------------------------------------
[docs] def scale(self, cellaverage = 1): """ Scale matrix so that average of cells is the given value. By default, the rowsum will be the number of rows/columns """ rowsum = self.rowsum() totalsum = rowsum.sum() try: self.mask except AttributeError: self._getMask() self.matrix = self.matrix / totalsum * (cellaverage * (len(rowsum)-len(self.mask)) * (len(rowsum)-len(self.mask)))
[docs] def range(self,chrom): """ return the index range for a give chromsome """ rangeList = np.flatnonzero(self.idx['chrom'] == chrom) if len(rangeList)==0: raise ValueError, "%s is not found in the index" %(chrom) else: return (rangeList[0],rangeList[-1]+1)
[docs] def makeIntraMatrix(self,chrom): """ substract a chromsome matrix given a chromsome name Parameters ---------- chrom : str, chromosome name e.g 'chr1' """ if self.applyed('subMatrix'): warnings.warn("This is already a submatrix!") try: rstart,rend = self.range(chrom) except ValueError: raise ValueError, "%s is not found in the index. Possibly you are not using the genome wide matrix" %(chrom) submatrix = contactmatrix(rend - rstart) submatrix.matrix = self.matrix[rstart:rend,rstart:rend] submatrix.idx = np.core.records.fromrecords(self.idx[rstart:rend],dtype=self._idxdtype) if hasattr(self,"genome") and hasattr(self,"resolution"): submatrix.genome = self.genome submatrix.resolution = self.resolution else: warnings.warn("No genome and resolution is specified, attributes are recommended for matrix.") submatrix._applyedMethods = copy.deepcopy(self._applyedMethods) submatrix._applyedMethods['subMatrix'] = chrom return submatrix
[docs] def getICP(self,index): """ return inter-chromosomal proportion of a given bin index """ chrom = self.idx[index]['chrom'] cstart,cend = self.range(chrom) intrasum = sum(self.matrix[index][cstart:cend]) totalsum = sum(self.matrix[index]) return 1-intrasum*1.0/totalsum
#----------------------------------------------------------genome wide smoothing stuff:
[docs] def smoothGenomeWideHighValue(self,w=3,s=3,p=3,z=5,force=False): """ Use power law smoothing function to smooth high spikes in chromosomes blocks Parameters ---------- w : int the window size, the smoothing is computed using target +/- w s : int weight of the location deviation p : int power of the location deviation z : int range of standard deviation to set cutoff """ if self.applyed('subMatrix'): raise RuntimeError, "This is a submatrix, genome wide smoothing cannot be applyed." if (not self.applyed('smoothGenomeWide')) or force: smoothed = 0 chrlist = np.unique(self.idx['chrom']) if sum(np.diagonal(self.matrix)) < 1: v0 = np.diagonal(self.matrix,1) v1 = np.append(v0[0],v0) v2 = np.append(v0,v0[-1]) np.fill_diagonal(self.matrix,v1+v2) #replace 0 with large number so most likely the neighborhood of diagonal are not smoothed for row in range(len(chrlist)): rstart,rend = self.range(chrlist[row]) for column in range(row,len(chrlist)): cstart,cend = self.range(chrlist[column]) print "Smoothing block (%s,%s)" % (chrlist[row],chrlist[column]) tmpMatrix,smoothedCounts = alabutils.smoothSpikesInBlock(self.matrix[rstart:rend,cstart:cend],w,s,p,z) self.matrix[rstart:rend,cstart:cend] = tmpMatrix self.matrix[cstart:cend,rstart:rend] = tmpMatrix.T if row == column: smoothed += smoothedCounts else: smoothed += 2*smoothedCounts self._applyedMethods['smoothGenomeWide'] = (smoothed,"w=%d,s=%d,p=%d,z=%d" % (w,s,p,z)) print "Genomewide smoothing finished, %d contacts smoothed" % (smoothed) else: warnings.warn("Method smoothGenomeWideHighValue was done before, %s %d values smoothed. use force = True to overwrite it."\ %(self._applyedMethods['smoothGenomeWide'][1],self._applyedMethods['smoothGenomeWide'][0]))
#==============================================================Probabiliy matrix methods
[docs] def getDomainMatrix(self,domainChrom,domainStartPos,domainEndPos,rowmask,minSize=1,maxSize=None): """ Return a submatrix defined by domainChrom, domainStartPos, domainEndPos Parameters ---------- domainChrom : str domain chromosome e.g. 'chr1' domainStartPos : int start position e.g. 0 domainEndPos : int end position e.g. 700000 minSize : int, > 0 min domain size maxSize : int, optional max domain size, in bins if the domain is larger than a given number of bins, this function will return None """ chrStartBin,chrEndBin = self.range(domainChrom) domainStartBin = chrStartBin + int( domainStartPos/ float(self.resolution) ) domainEndBin = chrStartBin + int( np.ceil( domainEndPos/ float(self.resolution) ) ) newmatrix = None if (domainEndBin - domainStartBin) >= minSize: if maxSize is None: newmatrix = self.matrix[domainStartBin:domainEndBin, domainStartBin:domainEndBin] elif (domainEndBin - domainStartBin) <= maxSize: newmatrix = self.matrix[domainStartBin:domainEndBin, domainStartBin:domainEndBin] else: return None else: return None maskloc = np.intersect1d(range(domainStartBin,domainEndBin),rowmask) maskloc = maskloc - domainStartBin newmatrix = np.delete(np.delete(newmatrix,maskloc,axis=0),maskloc,axis=1) return newmatrix
[docs] def getfmax(self,method = 'UF',minSize=1,maxSize=2000,removeZero=False,boxplotTrim=False,offdiag=1,target='median'): """ calculate fmax based on different methods Parameters ---------- method : str NM #neighbouring max UF #uniform fmax target : str 'mean'/'median' """ if self.applyed('subMatrix'): raise RuntimeError, "This is a submatrix, genome wide fmax cannot be applyed." if self.applyed('probabilityMatrix'): raise RuntimeError, "This is already a probability matrix!" fmax = None if method == 'NM':#method neighbour max fmax = np.zeros(len(self)) for chrom in np.unique(self.idx['chrom']): cstart, cend = self.range(chrom) #cend is increased by one for i in range(cstart+1,cend-1): fmax[i] = min(self.matrix[i,i+1],self.matrix[i,i-1]) fmax[cstart] = self.matrix[cstart,cstart+1] #p telomere fmax[cend-1] = self.matrix[cend-1,cend-2] #q telomere elif method == 'UF':#method uniform fmax if not hasattr(self,"domainIdx"): raise RuntimeError, "Please use assignDomain(domain_bedgraph,pattern) to assign domain INFO" print "Using minSize = %d, eliminating domains smaller than %dkb." % (minSize,minSize*self.resolution/1000) print "Using maxSize = %d, eliminating domains larger than %dkb." % (maxSize,maxSize*self.resolution/1000) #Get all intra domain interactions (upper triangle) upperTriangle = [] skipDomains = 0 print "Including Off Diagonal %d" %(offdiag) rowmask = np.flatnonzero(self.rowsum() == 0) #removed bins for domainRec in self.domainIdx: domainMatrix = self.getDomainMatrix(domainRec['chrom'],domainRec['start'],domainRec['end'],rowmask,minSize,maxSize)#domain matrix, eliminating domains that are larger than 20 bins if domainMatrix is None: skipDomains += 1 continue upperTriangle.extend(domainMatrix[np.triu_indices(len(domainMatrix),offdiag)])#get the upper triangle #--------scaning finished print "%d domains are scanned, %d domains are eliminated." % (len(self.domainIdx),skipDomains) upperTriangle = np.array(upperTriangle) if removeZero: print "Removing zeros" upperTriangle = upperTriangle[upperTriangle > 0] lowerFence,Q1,Q2,Q3,upperFence = alabutils.boxplotStats(upperTriangle)#get quartiles and fence print lowerFence,Q1,Q2,Q3,upperFence if boxplotTrim: print "Trimming outliers" upperTriangle = upperTriangle[(upperTriangle > lowerFence) & (upperTriangle < upperFence)] #trim to better range lowerFence,Q1,Q2,Q3,upperFence = alabutils.boxplotStats(upperTriangle)#get quartiles and fence print lowerFence,Q1,Q2,Q3,upperFence if target == "median": fmax = Q2#get median elif target == "mean": fmax = upperTriangle.mean()#get mean else: raise RuntimeError, "target take only 'median' or 'mean' method" else: raise RuntimeError, "Please use legal method parameters:'NM' or 'UF'!" return fmax
#-----------------------use fmax to get prob matrix
[docs] def fmaximization(self,**kwargs): warnings.warn("fmaximization is deprecated, function name changed to fmaxScaling.", DeprecationWarning) self.fmaxScaling(**kwargs)
[docs] def fmaxScaling(self,fmax,force=False): """ use fmax to generate probability matrix for uniform fmax, simply divide the matrix by fmax and clip to 1 for neighbouring contact fmax P[i,j] = F[i,j]/min(fmax[i],fmax[j]) """ if self.applyed('probabilityMatrix') and (not force): raise RuntimeError, "This is already a probability matrix!,use force to overwrite" if isinstance(fmax,float) or isinstance(fmax,np.float32) or isinstance(fmax,int): print "Uniform fmax detected" self.matrix = self.matrix/fmax self.matrix = self.matrix.clip(max=1) self._applyedMethods['probabilityMatrix'] = 'Uniform Fmax=%f' % (fmax) else: raise AttributeError, "Not supported fmax type!"
[docs] def assignDomain(self,domain,pattern=''): """ Load Domain information Parameters ---------- domain : alab.files.bedgraph instance bedgraph for domain definition pattern : str a string use to filter the flags in the bedgraph """ from files import bedgraph if not isinstance(domain,bedgraph): raise TypeError,"Bedgraph instance required, see alab.files.bedgraph for more details" self.domainIdx = domain.filter(pattern)
def _generateMedianSummaryMatrix(self,summaryBinStart,summaryBinEnd): N = len(summaryBinStart) X = np.empty((N,N),np.float32) for i in range(N): print "Filling X[%d] from A[%d] to A[%d]" % (i,summaryBinStart[i],summaryBinEnd[i]-1) istart = int(summaryBinStart[i]) iend = int(summaryBinEnd[i]) for j in range(i,N): #print "Filling X[%d] from A[:,%d] to A[:,%d]" % (i,summaryBinStart[j],summaryBinEnd[j]-1) jstart = int(summaryBinStart[j]) jend = int(summaryBinEnd[j]) submatrix = self.matrix[istart:iend,jstart:jend] #making sure that empty bins are removed out = np.nanmedian(submatrix) if np.isnan(out): out = 0 X[i,j] = X[j,i] = out return X def _generateTopMeanSummaryMatrix(self,summaryBinStart,summaryBinEnd,top=10,removeOutlier=True): N = len(summaryBinStart) X = np.empty((N,N),np.float32) for i in range(N): print "Filling X[%d] from A[%d] to A[%d]" % (i,summaryBinStart[i],summaryBinEnd[i]-1) istart = int(summaryBinStart[i]) iend = int(summaryBinEnd[i]) for j in range(i,N): jstart = int(summaryBinStart[j]) jend = int(summaryBinEnd[j]) submatrix = self.matrix[istart:iend,jstart:jend].flatten() submatrix = submatrix[~np.isnan(submatrix)] #get all non-nan value if len(submatrix) < 1: out = 0 else: if removeOutlier: lowerFence,Q1,Q2,Q3,upperFence = alabutils.boxplotStats(submatrix) submatrix = submatrix[(submatrix>=lowerFence) & (submatrix<=upperFence)] else: pass bound = np.percentile(submatrix,100-top) out = np.mean(submatrix[submatrix>=bound]) X[i,j] = X[j,i] = out return X
[docs] def makeDomainLevelMatrix(self,method='topmean',top=10,removeOutlier=True): """ Use domain INFO to generate Domain level matrix Parameters ---------- method : str "topmean" or "median" top : int 0<top<100 the top percentage to calculate the mean, top=10 means top 10% of the subdomain matrix removeOutlier : bool option to remove outlier using 1.5IQR """ if self.applyed('domainLevel'): raise RuntimeError, "This is already a domain level matrix!" if not hasattr(self,"domainIdx"): raise RuntimeError, "Please use assignDomain(domain_bedgraph,pattern) to assign domain INFO" domainLevelMatrix = contactmatrix(len(self.domainIdx)) domainLevelMatrix._buildindex(self.domainIdx['chrom'],self.domainIdx['start'],self.domainIdx['end'],self.domainIdx['flag']) domainLevelMatrix.genome = self.genome domainLevelMatrix.resolution = self.resolution domainLevelMatrix._applyedMethods = copy.deepcopy(self._applyedMethods) summaryBinStart = np.zeros(len(self.domainIdx)) summaryBinEnd = np.zeros(len(self.domainIdx)) for i in range(len(self.domainIdx)): chrStartBin,chrEndBin = self.range(self.domainIdx[i]['chrom']) summaryBinStart[i] = chrStartBin + int(self.domainIdx[i]['start'] / float(self.resolution)) summaryBinEnd[i] = chrStartBin + int(np.ceil(self.domainIdx[i]['end'] / float(self.resolution))) self._getMask()#removed bins in self.mask self.matrix[self.mask,:] = np.nan self.matrix[:,self.mask] = np.nan if method == 'topmean': domainLevelMatrix.matrix = self._generateTopMeanSummaryMatrix(summaryBinStart,summaryBinEnd,top,removeOutlier) domainLevelMatrix._applyedMethods['domainLevel'] = "%s/top=%d%s" % (method,top,'%') elif method == 'median': domainLevelMatrix.matrix = self._generateMedianSummaryMatrix(summaryBinStart,summaryBinEnd) domainLevelMatrix._applyedMethods['domainLevel'] = "%s" % (method) return domainLevelMatrix
[docs] def iterativeFmaxScaling(self,domainAverageContacts=23.2,tol=0.01): """ Automatic fmax scaling to get domain level matrix and match the rowsum average domain level matrix to domainAverageContacts """ fmax = self.getfmax() domainMean = 0 originMatrix = copy.deepcopy(self.matrix) while abs(domainMean - domainAverageContacts)/domainAverageContacts > tol: print "fmax=%f"%(fmax) self.matrix = copy.deepcopy(originMatrix) self.fmaxScaling(fmax,force=True) domainLevelMatrix = self.makeDomainLevelMatrix() domainMean = domainLevelMatrix.rowsum().mean() fmax = fmax/domainAverageContacts*domainMean return domainLevelMatrix
#==============================================================plotting methods
[docs] def plot(self,figurename,log=False,**kwargs): """ plot the matrix heat map Parameters ---------- figurename : str log : bool if True, plot the log scale of the matrix if False, plot the original matrix clip_max : float clip_min : float 2 options that will clip the matrix to certain value cmap : matplotlib.cm instance color map of the matrix label : str label of the figure """ if log: plotmatrix(figurename,np.log(self.matrix),**kwargs) else: plotmatrix(figurename,self.matrix,**kwargs)
[docs] def plotZeroCount(self,figurename,**kwargs): zeroCount = [] for i in range(len(self.matrix)): zeros = len(np.flatnonzero(self.matrix[i] == 0)) if zeros != len(self.matrix): zeroCount.append(zeros) #--endfor histogram(figurename, zeroCount, int(len(self.matrix)/100), xlab = '# of Zeros', ylab = 'Frequency', **kwargs)
[docs] def plotSum(self,figurename,outlier=False,line=None,**kwargs): """ Print the rowsum frequency histogram Parameters ---------- figurename : string Name of the plot outlier: bool option to select plotting the outlier line, only functioning if 'line' parameter is set to None line : float/array/list draw vertical lines at a list of positions """ rowsum = self.rowsum() if line is None: if outlier: line = (np.percentile(rowsum,75) - np.percentile(rowsum,25))*1.5 + np.percentile(rowsum,75) histogram(figurename, rowsum[rowsum > 0], int(len(self.matrix)/100), xlab = 'Row sums', ylab = 'Frequency', line = line, **kwargs)
#==============================================================save method
[docs] def save(self, filename): """ Save the matrix along with information in hdf5 file """ if (filename[-5:] != '.hmat'): filename += '.hmat' h5f = h5py.File(filename, 'w') h5f.create_dataset('matrix', data=self.matrix, compression = 'gzip', compression_opts=9) h5f.create_dataset('idx', data=self.idx, compression = 'gzip', compression_opts=9) h5f.create_dataset('applyedMethods', data=cPickle.dumps(self._applyedMethods)) if hasattr(self,"genome") and hasattr(self,"resolution"): h5f.create_dataset('genome',data = cPickle.dumps(self.genome)) h5f.create_dataset('resolution',data = cPickle.dumps(self.resolution)) else: warnings.warn("No genome and resolution is specified, attributes are recommended for matrix.") h5f.close()
#----------------------------------End of Class contactmatrix------------------------------------------------- #============================================================================================================== def loadh5dict(filename): h5f = h5py.File(filename,'r') genome = cPickle.loads(h5f['genome'].value) resolution = cPickle.loads(h5f['resolution'].value) #genomeIdxToLabel = cPickle.loads(h5f['genomeIdxToLabel'].value) binNumber = cPickle.loads(h5f['binNumber'].value) newMatrix = contactmatrix(binNumber,genome,resolution) newMatrix.matrix[:] = h5f['heatmap'][:] return newMatrix def loadhic(filename,genome='hg19',resolution=100000,usechr=['#','X'],verbose=False): from . import straw tgenome = alabutils.genome(genome) bininfo = tgenome.bininfo(resolution) m = contactmatrix(len(bininfo.chromList),genome=genome,resolution=resolution,usechr=usechr) for chr1 in tgenome.info['chrom']: i = tgenome.getchrnum(chr1) for chr2 in tgenome.info['chrom']: j = tgenome.getchrnum(chr2) if i > j: continue if verbose: print chr1,chr2 result = straw.straw("NONE",filename,chr1[3:],chr2[3:],'BP',resolution) for t in range(len(result[0])): x = int(result[0][t]/resolution) + bininfo.binStart[i] y = int(result[1][t]/resolution) + bininfo.binStart[j] m.matrix[x,y] = result[2][t] m.matrix[y,x] = result[2][t] #- #-- #-- return m def loadcooler(filename,usechr=['#','X'],verbose=False): import scipy.sparse h5 = h5py.File(filename,'r') genome = str(h5.attrs['genome-assembly']) resolution = h5.attrs['bin-size'] nbins = h5.attrs['nbins'] if verbose: print genome, resolution tgenome = utils.genome(genome) bininfo = tgenome.bininfo(resolution) sp = scipy.sparse.csr_matrix((h5['pixels']['count'],h5['pixels']['bin2_id'],h5['indexes']['bin1_offset']),shape=(nbins,nbins)) m = contactmatrix(len(bininfo.chromList),genome=genome,resolution=resolution,usechr=usechr) m.matrix = sp.toarray()[:len(m.idx),:len(m.idx)].astype(np.float32) h5.close() t = m.matrix.diagonal().copy() m.matrix[np.diag_indices(len(m.idx))] = 0 m.matrix += m.matrix.T m.matrix[np.diag_indices(len(m.idx))] = t return m