#!/usr/bin/env python
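"""Experimental clustering of Hi-C contact-matrix bins into chromosome-scale
groups (part of HiCembler). The matrix is loaded from a .npz dump,
log-transformed into distances, hierarchically clustered and cut into
subtrees; main() reports clusters, test() scores them against a reference
genome via LAST alignments.

Usage (inferred from main()/test() below):
    test_clustering.py matrix.npz [method]
    test_clustering.py bam fasta outdir ref [minchr]
"""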
import numpy as np
import scipy.cluster.hierarchy as sch
from sklearn.cluster import ward_tree, AffinityPropagation, MeanShift, DBSCAN, Birch, KMeans
import ete3, gzip, os, resource, sys
#from array2scaffolds import load_matrix, logger, transform
from collections import Counter
from datetime import datetime
import fastcluster
from bam2clusters import bam2clusters
from FastaIndex import FastaIndex
def logger(message, log=sys.stdout):
    """Log messages"""
    memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
    log.write("[%s] %s [memory: %6i Mb]\n"%(datetime.ctime(datetime.now()), message, memory))

# update sys.path & environmental PATH
root = os.path.dirname(os.path.abspath(sys.argv[0]))
src = ["bin", "bin/snap", "bin/sinkhorn_knopp"]
paths = [os.path.join(root, p) for p in src]
sys.path = paths + sys.path
os.environ["PATH"] = "%s:%s"%(':'.join(paths), os.environ["PATH"])
from sinkhorn_knopp import sinkhorn_knopp
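# Background note (illustration, approximate values): Sinkhorn-Knopp balancing
# alternately rescales rows and columns until the matrix is approximately
# doubly stochastic, i.e. every row and column sums to 1. For example,
#   A = [[1, 2],        balances to roughly  [[0.45, 0.55],
#        [3, 4]]                              [0.55, 0.45]]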
def normalize(d, bin_chr, bin_position, max_iter=1000, epsilon=0.0001, windowSize=1000.):
    """Return symmetric and fully balanced matrix using Sinkhorn-Knopp"""
    print "full sk balancing * dmax"
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal()/2)
    # full balancing: add pseudocount, rescale to [0, 1], balance
    sk = sinkhorn_knopp.SinkhornKnopp(max_iter=max_iter, epsilon=epsilon)
    d += 1
    d /= d.max()
    d = sk.fit(d) * d.max()
    # alternative: single-round balancing
    #sk = sinkhorn_knopp.SinkhornKnopp(max_iter=1); d += 1; d /= d.max(); d = sk.fit(d)
    # alternative: row normalisation
    #axis = 1; d *= 1. * d.sum(axis=axis).max() / d.sum(axis=axis)
    return d, bin_chr, bin_position

def normalize_diagonal(d, bin_chr, bin_position):
    """Return symmetric matrix normalised by the diagonal mean"""
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal()/2)
    # drop bins with an empty diagonal
    print "diag norm 3"
    indices = d.diagonal() != 0
    d = d[indices, :]
    d = d[:, indices]
    bin_chr = bin_chr[indices]
    bin_position = bin_position[indices, :]
    # diagonal mean normalisation
    n2 = np.mean(d.diagonal()) / d.diagonal()
    d = (d*n2).T*n2
    return d, bin_chr, bin_position

def normalize_average(d, bin_chr, bin_position):
    """Return symmetric matrix with row/column sums scaled towards the maximum sum"""
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal()/2)
    # average normalisation
    print "average"
    n = d.sum(axis=0).max() / d.sum(axis=0)
    d = (d*n).T*n
    return d, bin_chr, bin_position

def normalize_window_size(d, bin_chr, bin_position, windowSize=1e4):
    """Return symmetric matrix normalised by window size"""
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal()/2)
    # normalise by window size
    sizes = np.diff(bin_position, axis=1)
    if not windowSize:
        # fall back to the most common window size
        c = Counter(sizes.reshape(len(sizes)))
        windowSize, occurrences = c.most_common(1)[0]
        print windowSize, occurrences
    d = windowSize**2 * (d / sizes).T / sizes
    return d, bin_chr, bin_position

def normalize_rows(a):
    """Normalise rows so the sums among rows are identical."""
    rows, cols = a.shape
    maxv = a.sum(axis=0).max()
    for i in xrange(rows):
        # only if any signal
        if a[i].max():
            a[i] *= 1.*maxv/a[i].sum()
    return a

def get_contig2size(bin_chr, bin_position):
    """Return contig2size dictionary"""
    # sum bin spans per contig
    contig2size = {get_name(c): 0 for c in np.unique(bin_chr)}
    for c, (s, e) in zip(bin_chr, bin_position):
        contig2size[get_name(c)] += e-s
    return contig2size

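# load_matrix() below expects the file pair written by fastq2array.py: a .npz
# holding the contact-count array plus a gzipped <name>.windows.tab.gz with one
# "chrom<TAB>start<TAB>end" line per matrix bin.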
def load_matrix(fname, chrs=[], remove_shorter=True, scaffolds=[], verbose=0, remove_nans=1, remove_zeros=1):
    """Load Hi-C interaction matrix from numpy dump
    generated by fastq2array.py.

    Returns:
    d: data matrix over the selected set of chromosomes.
    bin_chr: list of chr index assignment of each bin.
    bin_position: start and end position of each bin
    """
    if scaffolds:
        remove_shorter = True
    # load array
    npy = np.load(fname)
    d = npy[npy.files[0]]
    # load windows
    windowfn = fname[:-4]+'.windows.tab.gz'
    bin_chr = []
    bin_position = []
    for i, l in enumerate(gzip.open(windowfn)):
        chrom, start, end = l[:-1].split('\t')
        bin_chr.append(chrom)
        bin_position.append(map(int, (start, end)))
    # chromosome arrays
    bin_position = np.array(bin_position)
    bin_chr = np.array(bin_chr)
    contig2size = get_contig2size(bin_chr, bin_position)
    # eliminate NaNs
    if remove_nans:
        indices = ~(np.sum(np.isnan(d), 0) == d.shape[0])
        if indices.sum() < d.shape[0]:
            print "remove_nans:", indices.sum(), d.shape
            d = d[indices, :]
            d = d[:, indices]
            bin_chr = bin_chr[indices]
            bin_position = bin_position[indices, :]
    if remove_zeros:
        indices = ~(np.any((np.sum(d, axis=0)==0, np.sum(d, axis=1)==0), axis=0))
        if indices.sum() < d.shape[0]:
            print "removed rows/columns summing to zero:", indices.sum(), d.shape
            d = d[indices, :]
            d = d[:, indices]
            bin_chr = bin_chr[indices]
            bin_position = bin_position[indices, :]
    # get the most common window size
    c = Counter(np.diff(bin_position, axis=1)[:, 0])
    windowSize, occurrences = c.most_common(1)[0]
    if remove_shorter:
        # keep only windows of the most common size
        if verbose:
            sys.stderr.write(" most common window: %s bp [%5.2f%s]\n"%(windowSize, occurrences*100./len(bin_chr), '%'))
        indices = ~(np.diff(bin_position, axis=1)[:, 0]!=windowSize)
        d = d[indices, :]
        d = d[:, indices]
        bin_chr = bin_chr[indices]
        bin_position = bin_position[indices, :]
    else:
        # normalise by window length instead
        sizenorm = np.array([1.0*windowSize/(e-s) for s, e in bin_position])
        d *= sizenorm
    # keep only relevant chromosomes
    if chrs:
        indices = np.any(bin_chr[None].T == chrs, 1)
        d = d[indices, :]
        d = d[:, indices]
        bin_chr = bin_chr[indices]
        bin_position = bin_position[indices, :]
    # combine existing array using information from previous round of scaffolding
    if scaffolds:
        # NOTE: get_contig2indices/get_indices are expected from array2scaffolds
        # (its import is commented out at the top of this file)
        contig2indices = get_contig2indices(bin_chr)
        indices, bin_chr, bin_position = [], [], []
        for i, scaffold in enumerate(scaffolds, 1):
            name = "scaffold%s"%i
            # indices for this scaffold only, as the indices list accumulates
            _indices = get_indices(scaffold, contig2indices)
            indices += _indices
            bin_chr += [name]*len(_indices)
            bin_position += [(s, s+windowSize) for s in range(0, windowSize*len(_indices), windowSize)]
        # combine
        d = d[:, indices][indices, :]
        bin_chr = np.array(bin_chr)
        bin_position = np.array(bin_position)
        contig2size = get_contig2size(bin_chr, bin_position)
    #d, bin_chr, bin_position = normalize(d, bin_chr, bin_position)
    #d, bin_chr, bin_position = normalize_average(d, bin_chr, bin_position)
    #d = normalize_rows(d)
    #d, bin_chr, bin_position = normalize_window_size(d, bin_chr, bin_position)
    return d, bin_chr, bin_position, contig2size

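# The linkage matrix Z consumed below follows the scipy/fastcluster convention:
# row i is (idx1, idx2, dist, sample_count) and merges clusters idx1 and idx2
# at height dist into a new cluster indexed n+i (n = number of observations).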
def distance_matrix2tree(Z, names):
    """Return tree representation for distance matrix"""
    n = Z.shape[0]+1
    i2n = [0] * (2*n - 1)
    t = ete3.Tree()
    for i, (idx1, idx2, dist, sample_count) in enumerate(Z):
        idx1, idx2 = int(idx1), int(idx2)
        # create Tree object for tips / leaves
        if idx1 < n:
            i2n[idx1] = ete3.Tree(name=names[idx1])
        if idx2 < n:
            i2n[idx2] = ete3.Tree(name=names[idx2])
        # create new node
        t = ete3.Tree()
        # normalise distance
        dist1 = dist - i2n[idx1].get_farthest_leaf()[1]
        dist2 = dist - i2n[idx2].get_farthest_leaf()[1]
        # add children
        t.add_child(i2n[idx1], dist=dist1)
        t.add_child(i2n[idx2], dist=dist2)
        # store
        i2n[n + i] = t
    return t

def getNewick(node, newick, parentdist, leaf_names):
    if node.is_leaf():
        return "%s:%.2f%s" % (leaf_names[node.id], parentdist - node.dist, newick)
    else:
        if len(newick) > 0:
            newick = "):%.2f%s" % (parentdist - node.dist, newick)
        else:
            newick = ");"
        newick = getNewick(node.get_left(), newick, node.dist, leaf_names)
        newick = getNewick(node.get_right(), ",%s" % (newick), node.dist, leaf_names)
        newick = "(%s" % (newick)
        return newick

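# Example round-trip from a linkage matrix to an ete3 tree, as done in
# get_subtrees() below:
#   tree = sch.to_tree(Z, False)
#   t = ete3.Tree(getNewick(tree, "", tree.dist, names))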
def truncate(t, mind=3, maxd=0):
    for i, n in enumerate(t.traverse(), 1):
        dist = t.get_distance(n, topology_only=1)
        chrs = get_chromosome(n.get_leaf_names())
        if (dist>mind and len(chrs)==1) or (maxd and dist>maxd):
            n.leaves = n.get_leaf_names()
            n.chrs = chrs
            n.name = "%s %s chrs %s leaves"%(chrs.most_common(1)[0][0], len(chrs), len(n))
            for _n in n.get_children():
                n.remove_child(_n)
    return t

def get_names(bin_chr, bin_position):
    return ["%s %s"%(get_name(c), s) for c, (s, e) in zip(bin_chr, bin_position)]

def get_name(contig):
    return contig.split()[0].split('|')[-1]

def get_chr_name(n):
    return n.split()[0].split(".")[0]

def get_chromosome(names):
    return Counter(get_chr_name(n) for n in names)

def get_longest(t, maxdist=6, k=2.0):
    """Return the node having the longest branch.

    TODO: this could definitely be made faster.
    """
    #n = sorted(t.traverse(), key=lambda n: 2*n.dist-t.get_distance(n), reverse=1)[0]
    n = t
    bestdist = k*n.dist - n.get_distance(t)
    for _n in t.traverse():
        if _n.get_distance(t, topology_only=1) > maxdist:
            break
        if k*_n.dist - _n.get_distance(t) > bestdist:
            n = _n
            bestdist = k*_n.dist - _n.get_distance(t)
    return n, bestdist

def ward2tree(Z, distances, names):
    """Return tree representation for ward linkage (children + distances)"""
    n = Z.shape[0]+1
    i2n = [0] * (2*n - 1)
    t = ete3.Tree()
    for i, ((idx1, idx2), dist) in enumerate(zip(Z, distances)):
        idx1, idx2 = int(idx1), int(idx2)
        # create Tree object for tips / leaves
        if idx1 < n:
            i2n[idx1] = ete3.Tree(name=names[idx1])
        if idx2 < n:
            i2n[idx2] = ete3.Tree(name=names[idx2])
        # create new node
        t = ete3.Tree()
        # normalise distance
        dist1 = dist - i2n[idx1].get_farthest_leaf()[1]
        dist2 = dist - i2n[idx2].get_farthest_leaf()[1]
        # add children
        t.add_child(i2n[idx1], dist=dist1)
        t.add_child(i2n[idx2], dist=dist2)
        # store
        i2n[n + i] = t
    t.dist = 0
    return t

def get_subtrees_sklearn(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
    names = get_names(bin_chr, bin_position)
    #clusterer = Birch(n_clusters=15)
    clusterer = KMeans(n_clusters=10)
    assignments = clusterer.fit_predict(d)
    c = Counter(assignments)
    print c.most_common(5)
    subtrees = [[] for i in range(max(assignments)+1)]
    print len(subtrees), max(assignments)
    for chrom, i in zip(names, assignments):
        subtrees[i].append(chrom)
    return subtrees

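# Note: d[np.triu_indices(d.shape[0], 1)] used below flattens the upper
# triangle of the symmetric distance matrix into the condensed 1-D form that
# both scipy.cluster.hierarchy.linkage and fastcluster.linkage accept.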
def get_subtrees(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
    """Return contigs clustered into scaffolds.

    fastcluster is slightly faster than scipy.cluster.hierarchy and solves
    the issue: http://stackoverflow.com/a/40738977/632242
    """
    maxtdist = 0
    i = 0
    subtrees = []
    names = get_names(bin_chr, bin_position)
    #Z = sch.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
    Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
    #t = distance_matrix2tree(Z, names)
    tree = sch.to_tree(Z, False)
    t = ete3.Tree(getNewick(tree, "", tree.dist, names))
    print len(set(t.get_leaf_names())), len(names)
    for i in range(1, nchrom):
        tname, tdist = t.get_farthest_leaf()
        if maxtdist < tdist:
            maxtdist = t.get_farthest_leaf()[1]
        # get longest branch
        n, bestdist = get_longest(t)
        # break if small subtree
        if tdist / maxtdist < 1.1 * bestdist / tdist or tdist < maxtdist*distfrac:
            break
        pruned = n.get_leaf_names()
        subtrees.append(pruned)
        c = Counter(get_chr_name(n) for n in pruned)
        print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)
        t2 = truncate(ete3.Tree(t.write()), maxd=5)
        t2.render('tree_%s.pdf'%i)
        # prune the tree
        ancestors = n.get_ancestors()
        p = ancestors[0]
        p.remove_child(n)
        n2 = p.get_children()[0]
        if len(ancestors) < 2: #p.is_root()
            p.remove_child(n2)
            t = n2
            t.dist = 0
        else:
            p2 = ancestors[1]
            p2.remove_child(p)
            p2.add_child(n2, dist=n2.dist+p.dist)
    # report the remaining tree as the last subtree
    if i:
        subtrees.append(t.get_leaf_names())
        pruned = t.get_leaf_names()
        c = Counter(get_chr_name(n) for n in pruned)
        print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)
        t2 = truncate(ete3.Tree(t.write()), maxd=5)
        t2.render('tree_%s.pdf'%i)
    return subtrees

def get_subtrees0(d, bin_chr, bin_position, method="ward", nchrom=1000, distfrac=0.4):
    """Recomputing linkage is slower than pruning the tree for large matrices"""
    maxtdist = 0
    i = 0
    subtrees = []
    for i in range(1, nchrom):
        Z = fastcluster.linkage(d[np.triu_indices(d.shape[0], 1)], method=method)
        names = get_names(bin_chr, bin_position)
        t = distance_matrix2tree(Z, names)
        #tree = sch.to_tree(Z, False); t = ete3.Tree(getNewick(tree, "", tree.dist, names))
        # get longest branch
        n, bestdist = get_longest(t)
        tname, tdist = t.get_farthest_leaf()
        if maxtdist < tdist:
            maxtdist = t.get_farthest_leaf()[1]
        # break if small subtree
        if tdist / maxtdist < 1.1 * bestdist / tdist or tdist < maxtdist*distfrac:
            break
        pruned = n.get_leaf_names()
        subtrees.append(pruned)
        c = Counter(get_chr_name(n) for n in pruned)
        print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)
        t = truncate(t, maxd=5)
        t.render('tree_%s.pdf'%i)
        # prune array
        pruned_set = set(pruned)
        indices = np.array([name not in pruned_set for name in names])
        d = d[indices, :]
        d = d[:, indices]
        bin_chr = bin_chr[indices]
        bin_position = bin_position[indices, :]
    if i:
        subtrees.append(t.get_leaf_names())
        pruned = t.get_leaf_names()
        c = Counter(get_chr_name(n) for n in pruned)
        print i, len(names), tdist, maxtdist, bestdist, len(pruned), c.most_common(5)
    return subtrees

def main(fn='/home/lpryszcz/cluster/hic/arath/_archives/snap/SRR2626163.100k.npz', method="ward"):
    if len(sys.argv) > 1:
        fn = sys.argv[1]
    if len(sys.argv) > 2:
        method = sys.argv[2]
    d, bin_chr, bin_position, contig2size = load_matrix(fn, remove_shorter=0)
    logger(" Loaded matrix %s..."%(str(d.shape),))
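    # log-transform turns contact counts into pseudo-distances: the strongest
    # contact maps to distance 0 and missing contacts to the largest distance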
    transform = lambda x: np.log(np.max(x+1)) - np.log(x+1)
    d = transform(d)
    subtrees = get_subtrees(d, bin_chr, bin_position, method)
    logger(" Assigning contigs (in %s windows) to %s clusters..."%(sum(map(len, subtrees)), len(subtrees)))
    total = correct = 0
    contig2cluster = {get_name(c): Counter() for c in np.unique(bin_chr)}
    for i, subtree in enumerate(subtrees, 1):
        c = Counter(map(get_chr_name, subtree))
        total += len(subtree)
        correct += c.most_common(1)[0][1]
        # populate contig2cluster
        c2 = Counter(map(get_name, subtree))
        for k, v in c2.iteritems():
            if not k:
                continue
            contig2cluster[get_name(k)][i] += v
    logger(" %s / %s [%.2f%s]"%(correct, total, 100.*correct/total, '%'))
    logger(" Weak assignments...")
    clusters = [[] for i in range(len(subtrees)+1)]
    withoutCluster, weakCluster = [], []
    for c, counter in contig2cluster.iteritems():
        if not counter:
            withoutCluster.append(c)
            continue
        # get major cluster
        clusteri, count = counter.most_common(1)[0]
        mfrac = 1. * count / sum(counter.itervalues())
        clusters[clusteri].append(c)
        if mfrac < .66:
            weakCluster.append(c)
    logger(" %s bp in %s contigs without assignment."%(sum(contig2size[c] for c in withoutCluster), len(withoutCluster)))
    logger(" %s bp in %s contigs having weak assignment."%(sum(contig2size[c] for c in weakCluster), len(weakCluster)))
    outfile = fn[:-4]+".clusters.tab"
    clusters = filter(lambda x: x, clusters)
    totsize = totwindows = 0
    logger("Reporting %s clusters to %s ..."%(len(clusters), outfile))
    with open(outfile, "w") as out:
        for i, cluster in enumerate(clusters, 1):
            clSize = sum(contig2size[c] for c in cluster)
            print " cluster_%s %s windows %s bp; %s"%(i, len(cluster), clSize, Counter(get_chromosome(cluster)).most_common(3))
            totsize += clSize
            totwindows += len(cluster)
            out.write("\t".join(cluster)+"\n")
    logger(" %3s bp in %s clusters generated from %s contigs."%(totsize, len(clusters), totwindows))

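# test() scores the clustering against a reference genome: contigs are mapped to
# reference chromosomes with LAST (lastdb/lastal + last-split), each cluster is
# assigned to its majority chromosome, and per-cluster accuracy is printed.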
def test(bam=["/mnt/data/lpryszcz/cluster/hic/arath/idba/SRR2626163.contig.fa.bam"], fasta="/mnt/data/lpryszcz/cluster/hic/arath/idba/SRR2626163.contig.fa", outdir="/mnt/data/lpryszcz/cluster/hic/arath/idba/bam2scaffolds.v01d", ref="/mnt/data/lpryszcz/cluster/hic/arath/ref/Ath.fa", minSize=2000):
    minchr = 3
    if len(sys.argv) > 4:
        bam, fasta, outdir, ref = sys.argv[1:5]
        bam = [bam]
    if len(sys.argv) > 5:
        minchr = int(sys.argv[5])
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    clusters = bam2clusters(bam, fasta, outdir, minSize=minSize, minchr=minchr)
    # generate & load contig2chrom
    if not os.path.isfile("%s.bed"%fasta):
        # generate index
        if not os.path.isfile("%s.suf"%ref):
            os.system("lastdb %s %s"%(ref, ref))
        # map contigs onto reference chromosomes
        os.system("lastal -l 100 -C 2 -P 4 %s %s | last-split - | maf-convert tab - | tab2chromosome.py > %s.bed"%(ref, fasta, fasta))
    c2chr = {l.split('\t')[3]: get_name(l.split('\t')[0]) for l in open("%s.bed"%fasta)}
    # get contig2size
    faidx = FastaIndex(fasta)
    contig2size = {c: faidx.id2stats[c][0] for c in faidx}
    # assign clusters to chr
    votes = []
    ncontigs = totsize = totcsize = 0
    for i, cluster in enumerate(clusters, 1):
        if not cluster:
            continue
        c = Counter(get_name(c2chr[_c]) for _c in cluster if _c in c2chr)
        if not c:
            continue
        chrom = c.most_common(1)[0][0]
        _votes = [1 if c2chr[_c]==chrom else 0 for _c in cluster if _c in c2chr]
        csize = sum([contig2size[_c] for _c in cluster if _c in c2chr and c2chr[_c]==chrom])
        size = sum([contig2size[_c] for _c in cluster if _c in c2chr])
        print i, len(cluster), chrom, round(np.mean(_votes), 3), csize, round(1.*csize/size, 3), c.most_common(3)
        votes += _votes
        totcsize += csize
        totsize += size
        ncontigs += len(cluster)
    print "%s bp in %s contigs in %s clusters %.2f%s accuracy; %s bp correct [%.2f%s]"%(totsize, ncontigs, i, 100*np.mean(votes), '%', totcsize, 100.*totcsize/totsize, '%')
    return clusters

if __name__ == "__main__":
    t0 = datetime.now()
    #main()
    test()
    dt = datetime.now() - t0
    sys.stderr.write("#Time elapsed: %s\n"%dt)