Source code for alab.matrix

#!/usr/bin/env python

# Copyright (C) 2015 University of Southern California and
#                          Nan Hua
# 
# Authors: Nan Hua
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__author__  = "Nan Hua"
__credits__ = ["Nan Hua","Ke Gong","Harianto Tjong"]

__license__ = "GPL"
__version__ = "0.0.2"
__email__   = "nhua@usc.edu"

import numpy as np
import os.path
import re
import h5py
import copy
import cPickle
import warnings
from plots import plotxy, plotmatrix, histogram
import utils as alabutils

[docs]class contactmatrix(object):
    """
    A flexible matrix instant that supports various methods for processing HiC contacts
    
    Parameters
    ----------
    filename : str 
        matrix file stored in hdf5 format
        or an integer for the matrix size to initialize an empty matrix instance
    genome : str
        genome e.g.'hg19','mm9'
    resolution : int
        the resolution for the hic matrix e.g. 100000
    usechr : list
        containing the chromosomes used for generating the matrix
    
    Attributes
    ----------
    matrix : numpy 2d array 
        storing all infor for the hic contact matrix
    idx : numpy structure array 
        matrix index
    genome : str
        the genome
    resolution : int
        resolution for the contact matrix
    
    """
    _idxdtype = np.dtype([('chrom','S5'),('start',int),('end',int),('flag','S10')])
    def __init__(self,filename,genome=None,resolution=None,usechr=['#','X']):
        self._applyedMethods = {}
        if isinstance(filename,int):
            self.matrix=np.zeros((filename,filename),dtype = np.float32)
        elif isinstance(filename,str):
            if not os.path.isfile(filename):
                raise IOError,"File %s doesn't exist!\n" % (filename)
            if os.path.splitext(filename)[1] == '.hdf5' or os.path.splitext(filename)[1] == '.hmat':
                h5f = h5py.File(filename,'r')
                self.matrix = h5f['matrix'][:]
                self.idx    = h5f['idx'][:]
                if 'applyedMethods' in h5f.keys():
                    self._applyedMethods = cPickle.loads(h5f['applyedMethods'].value)
                
                if 'genome' in h5f.keys() and 'resolution' in h5f.keys():         
                    self.genome     = cPickle.loads(h5f['genome'].value)
                    self.resolution = cPickle.loads(h5f['resolution'].value)
                h5f.close()
            else:
                from alabio import loadstream
                f    = loadstream(filename)
                s    = f.next()
                line = re.split('\t+|\s+',s.rstrip())
                n    = len(line) - 3
                expectn = n
                if isinstance(genome,str) and isinstance(resolution,int):
                    genomedb    = alabutils.genome(genome,usechr=usechr)
                    bininfo     = genomedb.bininfo(resolution)
                    expectn     = len(bininfo.chromList)
                if expectn != n:
                    raise RuntimeError, "Dimension don't match, expected %s bins , get %s bins. Please check the input." %(expectn,n)
                idx  = []
                i    = 0
                tidx = line[0:3];tidx.append('')
                idx.append(tidx)
                self.matrix = np.zeros((n,n),dtype = np.float32)
                self.matrix[i] = line[3:]
                for s in f:
                    i += 1
                    line = re.split('\t+|\s+',s.rstrip())
                    tidx = line[0:3];tidx.append('')
                    idx.append(tidx)
                    self.matrix[i] = line[3:]
                f.close()
                self.idx    = np.core.records.fromarrays(np.array(idx).transpose(),dtype=self._idxdtype)
        else:
			raise RuntimeError, "Undefined input filename type!\n"
        #----------------end filename
        
        if isinstance(genome,str) and isinstance(resolution,int):
            if hasattr(self,"genome") and hasattr(self,"resolution"):
                raise RuntimeError, "Genome and resolution has already been specified."
            genomedb    = alabutils.genome(genome,usechr=usechr)
            bininfo     = genomedb.bininfo(resolution)
            flaglist    = ['' for i in range(len(bininfo.chromList))]
            self.genome = genome
            self.resolution = resolution
            self._buildindex(bininfo.chromList,bininfo.startList,bininfo.endList,flaglist)
      
    #==================================================
    def _buildindex(self,chromlist,startlist,endlist,flaglist):
        idxlist = np.column_stack([chromlist,startlist,endlist,flaglist])
        self.idx = np.core.records.fromarrays(np.array(idxlist).transpose(),dtype=self._idxdtype)
[docs]    def buildindex(self,**kwargs):
        warnings.warn("buildindex is deprecated, specify genome and resolution instead of building index manually.", DeprecationWarning)
        self._buildindex(**kwargs)
    #--------------------------------------------------
    def __str__(self):
        return self.matrix.__str__()
    def __repr__(self):
        return self.matrix.__repr__()
    def __len__(self):
        return self.matrix.__len__()

[docs]    def rowsum(self):
        return self.matrix.sum(axis=1)
[docs]    def columnsum(self):
        return self.matrix.sum(axis=0)
[docs]    def applyed(self,method):
        if method in self._applyedMethods:
            return True
        else:
            return False
    
    def _getZeroEntry(self):
        self.mask   = np.flatnonzero(self.rowsum() == 0)
    
    def _getMask(self,mask = None):
        if mask is None:
            self._getZeroEntry()
            return 0
        else:
            if isinstance(mask,np.ndarray):
                self.mask = mask
                return 1
            else:
                raise TypeError, "Invalid argument type, numpy.ndarray is required"
  
    def __checkGenomeResolution(self,genome,resolution):
        if hasattr(self,"genome") and hasattr(self,"resolution"):
            pass
        else:
            warnings.warn("No genome and resolution is specified within the file, try to assign attributes.")
            if (genome is None) or (resolution is None):
                raise ValueError, "No genome info is found! Genome and resolution parameter must be specified."
            else:
                self.genome = genome
                self.resolution = resolution

    #=========================================================filtering methods
[docs]    def removeDiagonal(self,force = False):
        if (not self.applyed('removeDiagonal')) or force:
            np.fill_diagonal(self.matrix,0)
            self._applyedMethods['removeDiagonal'] = True
        else:
            warnings.warn("Method removeDiagonal was done before, use force = True to overwrite it.")
      
[docs]    def removePoorRegions(self, cutoff=1, usepvalue = 0.1, force = False):
        """Removes "cutoff" percent of bins with least counts

        Parameters
        ----------
        cutoff : int, 0<cutoff<100
            Percent of lowest-counts bins to be removed
        usepvalue: float, 0<usepvalue<1
            use this pvalue as correlation cutoff to remove bins
            bins whose pvalue greater than this cutoff will be removed
        """
        if (not self.applyed('removePoorRegions')) or force:
            rowsum   = self.rowsum()
            mask     = np.flatnonzero(rowsum < np.percentile(rowsum[rowsum > 0],cutoff))
            #use correlation pvalue
            from scipy.stats import pearsonr,entropy
            newmask = []
            for i in mask:
                if rowsum[i] == 0:
                    continue
                split = alabutils.binomialSplit(self.matrix[i])
                corr,pvalue  = pearsonr(split[0],split[1])#returns corr
                rowentropy   = entropy(self.matrix[i])
                if pvalue > usepvalue:
                    newmask.append(i)
                    print i,rowsum[i],(corr,pvalue),"Remove",rowentropy
                else:
                    print i,rowsum[i],(corr,pvalue),"Keep",rowentropy
            self.matrix[newmask,:]    = 0
            self.matrix[:,newmask]    = 0
            self.idx['flag'][newmask] = 'Removed'
            print "%d low converage bins were removed." % (len(newmask))
            self._applyedMethods['removePoorRegions'] = (cutoff,len(newmask))
        else:
            warnings.warn("Method removePoorRegions was done before, use force = True to overwrite it.")
      
[docs]    def identifyInterOutliersCutoff(self,N=100):
        """
        Identify interchromosome outliers' cutoff
        Do an N round random choice as the original contact freq distribution and estimate the sample std for every contact freq
        If the sample std is larger than half of the frequency (contact #), lable this contact frequency as spourious
        cutoff is set to the number that first consecutive 2 non-spourious frequency from the right side (scan from high frequency to low)
        """
        if self.applyed('normalization'):
            raise RuntimeError, "Matrix is already normalized, raw matrix is needed."
        intermask = self.idx['chrom'][:,None] < self.idx['chrom'][None,:]
        interflatten = self.matrix[intermask]
        interflatten = interflatten[interflatten > 0]
        originHist = np.histogram(interflatten,int(interflatten.max()))[0]
        repeatResults = np.zeros((N,int(interflatten.max())),dtype=int)
        for i in range(N):
            tmpChoice = np.random.choice(interflatten,len(interflatten))
            repeatResults[i] = alabutils.listadd(repeatResults[i],np.histogram(tmpChoice,int(tmpChoice.max()))[0])
      
        comparison = np.std(repeatResults,axis=0) >= originHist/2
        i = len(comparison) -1
        while (comparison[i] or comparison[i-1]):i -= 1
        cutoff = i+1
        return cutoff
    
[docs]    def smoothInterContactByCutoff(self,cutoff,w=3,s=3,p=3,force=False):
        """
        given the cutoff, run a power law smoothing for the interchromosome matrix for contacts > cutoff
        """
        if (not self.applyed('smoothByCutoff')) or force:
            intermask = self.idx['chrom'][:,None] < self.idx['chrom'][None,:]
            x, y = np.nonzero(intermask)
            pos  = np.flatnonzero(self.matrix[intermask] > cutoff)
            for i in pos:
                cnew = alabutils.powerLawSmooth(self.matrix[ x[i]-w:x[i]+w+1 , y[i]-w:y[i]+w+1 ],(w,w),w,s,p)
                print x[i],y[i],self.matrix[x[i],y[i]],'-->',cnew
                self.matrix[x[i],y[i]] = cnew
                self.matrix[y[i],x[i]] = cnew
            self._applyedMethods['smoothByCutoff'] = cutoff
        else:
            warnings.warn("Method smoothInterContactByCutoff was done before,cutoff = %d use force = True to overwrite it."\
                            %(self._applyedMethods['smoothByCutoff']))
        
        
    #========================================================normalization methods
[docs]    def krnorm(self,mask = None,force=False,**kwargs):
        """
        using krnorm balacing the matrix (overwriting the matrix!)
        
        Parameters
        ----------
        
        mask : list/array 
            mask is a 1-D vector with the same length as the matrix where 1s specify the row/column to be ignored
            or a 1-D vector specifing the indexes of row/column to be ignored
            if no mask is given, row/column with rowsum==0 will be automatically detected and ignored
        large_mem : bool
            when large_mem is set to 1, matrix product is calculated using small chunks, 
            but this will slowdown the process a little bit.     
        """
        if (not self.applyed('normalization')) or force:
            from norm import bnewt
            self._getMask(mask)
            x = bnewt(self.matrix,mask=self.mask,check=0,**kwargs)*100
            self.matrix *= x 
            self.matrix *= x.T
            self._applyedMethods['normalization'] = 'krnorm'
        else:
            warnings.warn("Method %s was done before, use force = True to overwrite it." % (self._applyedMethods['normalization']))
  
[docs]    def vcnorm(self,iterations=1,mask = None,force=False):
        if (not self.applyed('normalization')) or force:
            self._getMask(mask)
            for i in range(iterations):
                print "\tIterations:",i+1
                rowsum   = self.rowsum()
                rowsum[self.mask] = 0
                totalsum = rowsum.sum()
                np.seterr(divide='ignore')
                rowsum   = 1/rowsum
                rowsum[self.mask] = 0
                self.matrix *= totalsum
                self.matrix *= rowsum 
                self.matrix *= rowsum.T
            self._applyedMethods['normalization'] = 'vcnorm'
        else:
            warnings.warn("Method %s was done before, use force = True to overwrite it." % (self._applyedMethods['normalization']))
    
    #-----------------------------------------------------------------------------
    
[docs]    def scale(self, cellaverage = 1):
        """
        Scale matrix so that average of cells is the given value. 
        By default, the rowsum will be the number of rows/columns
        """
        rowsum = self.rowsum()
        totalsum = rowsum.sum()
        try:
            self.mask
        except AttributeError:
            self._getMask()
        self.matrix = self.matrix / totalsum * (cellaverage * (len(rowsum)-len(self.mask)) * (len(rowsum)-len(self.mask)))
  
[docs]    def range(self,chrom):
        """
        return the index range for a give chromsome
        """
        rangeList = np.flatnonzero(self.idx['chrom'] == chrom)
        if len(rangeList)==0:
            raise ValueError, "%s is not found in the index" %(chrom)
        else:
            return (rangeList[0],rangeList[-1]+1)
  
[docs]    def makeIntraMatrix(self,chrom):
        """
        substract a chromsome matrix given a chromsome name
        
        Parameters
        ----------
        chrom : str, chromosome name e.g 'chr1'
        """
        if self.applyed('subMatrix'):
            warnings.warn("This is already a submatrix!")
        try:
            rstart,rend = self.range(chrom)
        except ValueError:
            raise ValueError, "%s is not found in the index. Possibly you are not using the genome wide matrix" %(chrom)
        submatrix   = contactmatrix(rend - rstart)
        submatrix.matrix = self.matrix[rstart:rend,rstart:rend]
        submatrix.idx    = np.core.records.fromrecords(self.idx[rstart:rend],dtype=self._idxdtype)
    
        if hasattr(self,"genome") and hasattr(self,"resolution"):
            submatrix.genome     = self.genome
            submatrix.resolution = self.resolution
        else:
            warnings.warn("No genome and resolution is specified, attributes are recommended for matrix.")
      
        submatrix._applyedMethods = copy.deepcopy(self._applyedMethods)
        submatrix._applyedMethods['subMatrix'] = chrom
        return submatrix
    
[docs]    def getICP(self,index):
        """
        return inter-chromosomal proportion of a given bin index
        """
        chrom = self.idx[index]['chrom']
        cstart,cend = self.range(chrom)
        intrasum = sum(self.matrix[index][cstart:cend])
        totalsum = sum(self.matrix[index])
        return 1-intrasum*1.0/totalsum
    
    #----------------------------------------------------------genome wide smoothing stuff:
[docs]    def smoothGenomeWideHighValue(self,w=3,s=3,p=3,z=5,force=False):
        """
        Use power law smoothing function to smooth high spikes in chromosomes blocks
        
        Parameters
        ----------
        w : int 
            the window size, the smoothing is computed using target +/- w
        s : int 
            weight of the location deviation
        p : int
            power of the location deviation
        z : int 
            range of standard deviation to set cutoff
        """
        if self.applyed('subMatrix'):
            raise RuntimeError, "This is a submatrix, genome wide smoothing cannot be applyed."
        if (not self.applyed('smoothGenomeWide')) or force:
            smoothed = 0
            chrlist = np.unique(self.idx['chrom'])
            if sum(np.diagonal(self.matrix)) < 1:
                v0 = np.diagonal(self.matrix,1)
                v1 = np.append(v0[0],v0)
                v2 = np.append(v0,v0[-1])
                np.fill_diagonal(self.matrix,v1+v2)
                #replace 0 with large number so most likely the neighborhood of diagonal are not smoothed
            for row in range(len(chrlist)):
                rstart,rend = self.range(chrlist[row])
                for column in range(row,len(chrlist)):
                    cstart,cend           = self.range(chrlist[column])
                    print "Smoothing block (%s,%s)" % (chrlist[row],chrlist[column])
                    tmpMatrix,smoothedCounts = alabutils.smoothSpikesInBlock(self.matrix[rstart:rend,cstart:cend],w,s,p,z)
                    self.matrix[rstart:rend,cstart:cend] = tmpMatrix
                    self.matrix[cstart:cend,rstart:rend] = tmpMatrix.T
                    if row == column:
                        smoothed += smoothedCounts
                    else:
                        smoothed += 2*smoothedCounts
            self._applyedMethods['smoothGenomeWide'] = (smoothed,"w=%d,s=%d,p=%d,z=%d" % (w,s,p,z))
            print "Genomewide smoothing finished, %d contacts smoothed" % (smoothed)
        else:
            warnings.warn("Method smoothGenomeWideHighValue was done before, %s %d values smoothed. use force = True to overwrite it."\
                            %(self._applyedMethods['smoothGenomeWide'][1],self._applyedMethods['smoothGenomeWide'][0]))
    #==============================================================Probabiliy matrix methods
[docs]    def getDomainMatrix(self,domainChrom,domainStartPos,domainEndPos,rowmask,minSize=1,maxSize=None):
        """
        Return a submatrix defined by domainChrom, domainStartPos, domainEndPos
        
        Parameters
        ----------
        domainChrom : str
            domain chromosome e.g. 'chr1'
        domainStartPos : int 
            start position e.g. 0
        domainEndPos : int 
            end position e.g. 700000
        minSize : int, > 0
            min domain size
        maxSize : int, optional
            max domain size, in bins
            if the domain is larger than a given number of bins, this function will return None
        """
        chrStartBin,chrEndBin = self.range(domainChrom)
        domainStartBin  = chrStartBin + int( domainStartPos/ float(self.resolution) )
        domainEndBin    = chrStartBin + int( np.ceil( domainEndPos/ float(self.resolution) ) )
        newmatrix = None
        if (domainEndBin - domainStartBin) >= minSize:
            if maxSize is None:
                newmatrix = self.matrix[domainStartBin:domainEndBin, domainStartBin:domainEndBin]
            elif (domainEndBin - domainStartBin) <= maxSize:
                newmatrix = self.matrix[domainStartBin:domainEndBin, domainStartBin:domainEndBin]
            else:
                return None
        else:
            return None
        maskloc = np.intersect1d(range(domainStartBin,domainEndBin),rowmask)
        maskloc = maskloc - domainStartBin
        newmatrix = np.delete(np.delete(newmatrix,maskloc,axis=0),maskloc,axis=1)
        return newmatrix
  
[docs]    def getfmax(self,method = 'UF',minSize=1,maxSize=2000,removeZero=False,boxplotTrim=False,offdiag=1,target='median'):
        """
        calculate fmax based on different methods
        
        Parameters
        ----------
        
        method : str
            NM #neighbouring max
            UF #uniform fmax
        target : str
            'mean'/'median'
        """
        if self.applyed('subMatrix'):
            raise RuntimeError, "This is a submatrix, genome wide fmax cannot be applyed."
        if self.applyed('probabilityMatrix'):
            raise RuntimeError, "This is already a probability matrix!"

        fmax = None
    
        if method == 'NM':#method neighbour max
            fmax = np.zeros(len(self))
            for chrom in np.unique(self.idx['chrom']):
                cstart, cend = self.range(chrom) #cend is increased by one
                for i in range(cstart+1,cend-1):
                    fmax[i] = min(self.matrix[i,i+1],self.matrix[i,i-1])
                fmax[cstart] = self.matrix[cstart,cstart+1] #p telomere
                fmax[cend-1] = self.matrix[cend-1,cend-2] #q telomere
        elif method == 'UF':#method uniform fmax
            if not hasattr(self,"domainIdx"):
                raise RuntimeError, "Please use assignDomain(domain_bedgraph,pattern) to assign domain INFO"
            print "Using minSize = %d, eliminating domains smaller than %dkb." % (minSize,minSize*self.resolution/1000)
            print "Using maxSize = %d, eliminating domains larger than %dkb." % (maxSize,maxSize*self.resolution/1000)
      
            #Get all intra domain interactions (upper triangle)
            upperTriangle = []
            skipDomains = 0
            print "Including Off Diagonal %d" %(offdiag)
            
            rowmask = np.flatnonzero(self.rowsum() == 0) #removed bins
            for domainRec in self.domainIdx:
                domainMatrix = self.getDomainMatrix(domainRec['chrom'],domainRec['start'],domainRec['end'],rowmask,minSize,maxSize)#domain matrix, eliminating domains that are larger than 20 bins
                if domainMatrix is None:
                    skipDomains += 1
                    continue
                upperTriangle.extend(domainMatrix[np.triu_indices(len(domainMatrix),offdiag)])#get the upper triangle
            #--------scaning finished
            print "%d domains are scanned, %d domains are eliminated." % (len(self.domainIdx),skipDomains)
            upperTriangle = np.array(upperTriangle)
            if removeZero:
                print "Removing zeros"
                upperTriangle = upperTriangle[upperTriangle > 0]
            lowerFence,Q1,Q2,Q3,upperFence = alabutils.boxplotStats(upperTriangle)#get quartiles and fence
            print lowerFence,Q1,Q2,Q3,upperFence
            if boxplotTrim:
                print "Trimming outliers"
                upperTriangle = upperTriangle[(upperTriangle > lowerFence) & (upperTriangle < upperFence)] #trim to better range
                lowerFence,Q1,Q2,Q3,upperFence = alabutils.boxplotStats(upperTriangle)#get quartiles and fence
                print lowerFence,Q1,Q2,Q3,upperFence
            if target == "median":
                fmax = Q2#get median
            elif target == "mean":
                fmax = upperTriangle.mean()#get mean
            else:
                raise RuntimeError, "target take only 'median' or 'mean' method"
        else:
            raise RuntimeError, "Please use legal method parameters:'NM' or 'UF'!"
        return fmax
    
    #-----------------------use fmax to get prob matrix
[docs]    def fmaximization(self,**kwargs):
        warnings.warn("fmaximization is deprecated, function name changed to fmaxScaling.", DeprecationWarning)
        self.fmaxScaling(**kwargs)
        
[docs]    def fmaxScaling(self,fmax,force=False):
        """
        use fmax to generate probability matrix
        for uniform fmax, simply divide the matrix by fmax and clip to 1
        for neighbouring contact fmax
        P[i,j] = F[i,j]/min(fmax[i],fmax[j])
        """
        if self.applyed('probabilityMatrix') and (not force):
            raise RuntimeError, "This is already a probability matrix!,use force to overwrite"
        if isinstance(fmax,float) or isinstance(fmax,np.float32) or isinstance(fmax,int):
            print "Uniform fmax detected"
            self.matrix = self.matrix/fmax
            self.matrix = self.matrix.clip(max=1)
            self._applyedMethods['probabilityMatrix'] = 'Uniform Fmax=%f' % (fmax)
        else:
            raise AttributeError, "Not supported fmax type!"
    
[docs]    def assignDomain(self,domain,pattern=''):
        """
        Load Domain information
        
        Parameters
        ----------
        
        domain : alab.files.bedgraph instance
            bedgraph for domain definition
        pattern : str 
            a string use to filter the flags in the bedgraph
        """
        from files import bedgraph
        if not isinstance(domain,bedgraph):
            raise TypeError,"Bedgraph instance required, see alab.files.bedgraph for more details"
        self.domainIdx = domain.filter(pattern)
    
    def _generateMedianSummaryMatrix(self,summaryBinStart,summaryBinEnd):
        N = len(summaryBinStart)
        X = np.empty((N,N),np.float32)
        for i in range(N):
            print "Filling X[%d] from A[%d] to A[%d]" % (i,summaryBinStart[i],summaryBinEnd[i]-1)
            istart = int(summaryBinStart[i])
            iend   = int(summaryBinEnd[i])
            for j in range(i,N):
                #print "Filling X[%d] from A[:,%d] to A[:,%d]" % (i,summaryBinStart[j],summaryBinEnd[j]-1)
                jstart = int(summaryBinStart[j])
                jend   = int(summaryBinEnd[j])
                submatrix = self.matrix[istart:iend,jstart:jend]
                #making sure that empty bins are removed
                out = np.nanmedian(submatrix)
                if np.isnan(out):
                    out = 0
                X[i,j] = X[j,i] = out
        return X
        
    def _generateTopMeanSummaryMatrix(self,summaryBinStart,summaryBinEnd,top=10,removeOutlier=True):
        N = len(summaryBinStart)
        X = np.empty((N,N),np.float32)
        for i in range(N):
            print "Filling X[%d] from A[%d] to A[%d]" % (i,summaryBinStart[i],summaryBinEnd[i]-1)
            istart = int(summaryBinStart[i])
            iend   = int(summaryBinEnd[i])
            for j in range(i,N):
                jstart = int(summaryBinStart[j])
                jend   = int(summaryBinEnd[j])
                submatrix = self.matrix[istart:iend,jstart:jend].flatten()
                submatrix = submatrix[~np.isnan(submatrix)] #get all non-nan value
                if len(submatrix) < 1:
                    out = 0
                else:
                    if removeOutlier:
                        lowerFence,Q1,Q2,Q3,upperFence = alabutils.boxplotStats(submatrix)
                        submatrix = submatrix[(submatrix>=lowerFence) & (submatrix<=upperFence)]
                    else:
                        pass
                    bound = np.percentile(submatrix,100-top)
                    out = np.mean(submatrix[submatrix>=bound])
                X[i,j] = X[j,i] = out
        return X
    
[docs]    def makeDomainLevelMatrix(self,method='topmean',top=10,removeOutlier=True):
        """
        Use domain INFO to generate Domain level matrix
        
        Parameters
        ----------
        
        method : str
            "topmean" or "median"
        top : int 0<top<100
            the top percentage to calculate the mean, top=10 means top 10% of the subdomain matrix
        removeOutlier : bool
            option to remove outlier using 1.5IQR
        """
        if self.applyed('domainLevel'):
            raise RuntimeError, "This is already a domain level matrix!"
        if not hasattr(self,"domainIdx"):
            raise RuntimeError, "Please use assignDomain(domain_bedgraph,pattern) to assign domain INFO"
            
        domainLevelMatrix = contactmatrix(len(self.domainIdx))
        domainLevelMatrix._buildindex(self.domainIdx['chrom'],self.domainIdx['start'],self.domainIdx['end'],self.domainIdx['flag'])
        domainLevelMatrix.genome = self.genome
        domainLevelMatrix.resolution = self.resolution
        domainLevelMatrix._applyedMethods = copy.deepcopy(self._applyedMethods)
        
        summaryBinStart = np.zeros(len(self.domainIdx))
        summaryBinEnd   = np.zeros(len(self.domainIdx))
        for i in range(len(self.domainIdx)):
            chrStartBin,chrEndBin = self.range(self.domainIdx[i]['chrom'])
            summaryBinStart[i]    = chrStartBin + int(self.domainIdx[i]['start'] / float(self.resolution))
            summaryBinEnd[i]      = chrStartBin + int(np.ceil(self.domainIdx[i]['end'] / float(self.resolution)))
        
        self._getMask()#removed bins in self.mask
        self.matrix[self.mask,:] = np.nan
        self.matrix[:,self.mask] = np.nan
        if method == 'topmean':
            domainLevelMatrix.matrix = self._generateTopMeanSummaryMatrix(summaryBinStart,summaryBinEnd,top,removeOutlier)
            domainLevelMatrix._applyedMethods['domainLevel'] = "%s/top=%d%s" % (method,top,'%')
        elif method == 'median':
            domainLevelMatrix.matrix = self._generateMedianSummaryMatrix(summaryBinStart,summaryBinEnd)
            domainLevelMatrix._applyedMethods['domainLevel'] = "%s" % (method)
        
        return domainLevelMatrix
    
[docs]    def iterativeFmaxScaling(self,domainAverageContacts=23.2,tol=0.01):
        """
        Automatic fmax scaling to get domain level matrix and match the rowsum average domain level matrix to 
        domainAverageContacts
        """
        fmax = self.getfmax()
        domainMean = 0
        originMatrix = copy.deepcopy(self.matrix)
        while abs(domainMean - domainAverageContacts)/domainAverageContacts > tol:
            print "fmax=%f"%(fmax)
            self.matrix = copy.deepcopy(originMatrix)
            self.fmaxScaling(fmax,force=True)
            domainLevelMatrix = self.makeDomainLevelMatrix()
            domainMean = domainLevelMatrix.rowsum().mean()
            fmax = fmax/domainAverageContacts*domainMean
        return domainLevelMatrix
        
    #==============================================================plotting methods
[docs]    def plot(self,figurename,log=False,**kwargs):
        """
        plot the matrix heat map
            
        Parameters
        ----------
        figurename : str
        log : bool
            if True, plot the log scale of the matrix
            if False, plot the original matrix
        clip_max : float
        clip_min : float
            2 options that will clip the matrix to certain value
        cmap : matplotlib.cm instance
            color map of the matrix
        label : str
            label of the figure
        """
        if log:
            plotmatrix(figurename,np.log(self.matrix),**kwargs)
        else:
            plotmatrix(figurename,self.matrix,**kwargs)
        
[docs]    def plotZeroCount(self,figurename,**kwargs):
        zeroCount = []
        for i in range(len(self.matrix)):
            zeros = len(np.flatnonzero(self.matrix[i] == 0))
            if zeros != len(self.matrix):
                zeroCount.append(zeros)
        #--endfor
        histogram(figurename,
                zeroCount,
                int(len(self.matrix)/100),
                xlab = '# of Zeros', ylab = 'Frequency',
                **kwargs)
        
    
[docs]    def plotSum(self,figurename,outlier=False,line=None,**kwargs):
        """
        Print the rowsum frequency histogram
        
        Parameters
        ----------
        figurename : string
            Name of the plot
        outlier: bool
            option to select plotting the outlier line, only functioning if 'line' parameter is set to None
        line : float/array/list
            draw vertical lines at a list of positions 
        """
        rowsum = self.rowsum()
        if line is None:
            if outlier:
                line = (np.percentile(rowsum,75) - np.percentile(rowsum,25))*1.5 + np.percentile(rowsum,75)
        
        histogram(figurename,
                rowsum[rowsum > 0],
                int(len(self.matrix)/100),
                xlab = 'Row sums', ylab = 'Frequency',
                line = line,
                **kwargs)
        
    #==============================================================save method
[docs]    def save(self, filename):
        """
            Save the matrix along with information in hdf5 file
        """
        if (filename[-5:] != '.hmat'):
            filename += '.hmat'
        h5f = h5py.File(filename, 'w')
        h5f.create_dataset('matrix', data=self.matrix, compression = 'gzip', compression_opts=9)
        h5f.create_dataset('idx', data=self.idx, compression = 'gzip', compression_opts=9)
        h5f.create_dataset('applyedMethods', data=cPickle.dumps(self._applyedMethods))
        if hasattr(self,"genome") and hasattr(self,"resolution"):
            h5f.create_dataset('genome',data = cPickle.dumps(self.genome))
            h5f.create_dataset('resolution',data = cPickle.dumps(self.resolution))
        else:
            warnings.warn("No genome and resolution is specified, attributes are recommended for matrix.")
        
        h5f.close()
#----------------------------------End of Class contactmatrix-------------------------------------------------
#==============================================================================================================

def loadh5dict(filename):
    h5f    = h5py.File(filename,'r')
    genome           = cPickle.loads(h5f['genome'].value)
    resolution       = cPickle.loads(h5f['resolution'].value)
    #genomeIdxToLabel = cPickle.loads(h5f['genomeIdxToLabel'].value)
    binNumber        = cPickle.loads(h5f['binNumber'].value)
    newMatrix = contactmatrix(binNumber,genome,resolution)
    newMatrix.matrix[:] = h5f['heatmap'][:]
    return newMatrix

def loadhic(filename,genome='hg19',resolution=100000,usechr=['#','X'],verbose=False):
    from . import straw
    
    tgenome = alabutils.genome(genome)
    bininfo = tgenome.bininfo(resolution)

    m = contactmatrix(len(bininfo.chromList),genome=genome,resolution=resolution,usechr=usechr)
    for chr1 in tgenome.info['chrom']:
        i = tgenome.getchrnum(chr1)
        for chr2 in tgenome.info['chrom']:
            j = tgenome.getchrnum(chr2)
            if i > j:
                continue
            if verbose:
                print chr1,chr2
            
            result = straw.straw("NONE",filename,chr1[3:],chr2[3:],'BP',resolution)
            for t in range(len(result[0])):
                x = int(result[0][t]/resolution) + bininfo.binStart[i]
                y = int(result[1][t]/resolution) + bininfo.binStart[j]
                m.matrix[x,y] = result[2][t]
                m.matrix[y,x] = result[2][t]
            #-
        #--
    #--
    
    return m

def loadcooler(filename,usechr=['#','X'],verbose=False):
    import scipy.sparse
    
    h5         = h5py.File(filename,'r')
    genome     = str(h5.attrs['genome-assembly'])
    resolution = h5.attrs['bin-size']
    nbins      = h5.attrs['nbins']
    
    if verbose:
        print genome, resolution
    
    tgenome = utils.genome(genome)
    bininfo = tgenome.bininfo(resolution)
    sp = scipy.sparse.csr_matrix((h5['pixels']['count'],h5['pixels']['bin2_id'],h5['indexes']['bin1_offset']),shape=(nbins,nbins))
   
    m = contactmatrix(len(bininfo.chromList),genome=genome,resolution=resolution,usechr=usechr)
    m.matrix = sp.toarray()[:len(m.idx),:len(m.idx)].astype(np.float32)
    h5.close()
    
    t = m.matrix.diagonal().copy()
    m.matrix[np.diag_indices(len(m.idx))] = 0
    m.matrix += m.matrix.T
    m.matrix[np.diag_indices(len(m.idx))] = t
    
    return m