From b336febfa224ffe9f97340281141fafdb75e5e18 Mon Sep 17 00:00:00 2001 From: Yaobo Xu Date: Wed, 9 Oct 2019 15:04:56 +0000 Subject: [PATCH 1/5] patched tophat-fusion-post --- Dockerfile | 2 +- build/opt-build.sh | 19 +- build/patches/tophat-fusion-post.patch | 2929 ++++++++++++++++++++++++ 3 files changed, 2941 insertions(+), 9 deletions(-) create mode 100644 build/patches/tophat-fusion-post.patch diff --git a/Dockerfile b/Dockerfile index d713549..5e99827 100644 --- a/Dockerfile +++ b/Dockerfile @@ -78,7 +78,7 @@ ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 # build tools from other repos -ADD build/opt-build.sh build/ +ADD build build/ RUN bash build/opt-build.sh $OPT # build the tools in this repo, separate to reduce build time on errors diff --git a/build/opt-build.sh b/build/opt-build.sh index 26285f8..fae35a6 100755 --- a/build/opt-build.sh +++ b/build/opt-build.sh @@ -144,14 +144,17 @@ if [ ! -e $SETUP_DIR/starfusion.success ]; then touch $SETUP_DIR/starfusion.success fi -# Install tophat -if [ ! -e $SETUP_DIR/tophat.success ]; then - curl -sSL --retry 10 https://ccb.jhu.edu/software/tophat/downloads/tophat-${VER_TOPHAT}.Linux_x86_64.tar.gz > distro.tar.gz - rm -rf distro/* - tar --strip-components 1 -C distro -zxf distro.tar.gz - rm -f distro/AUTHORS distro/LICENSE distro/README - cp -r distro/* $INST_PATH/bin/. - touch $SETUP_DIR/tophat.success +# Install tophat +if [ ! -e $SETUP_DIR/tophat.success ]; then + curl -sSL --retry 10 https://ccb.jhu.edu/software/tophat/downloads/tophat-${VER_TOPHAT}.Linux_x86_64.tar.gz > distro.tar.gz + rm -rf distro/* + tar --strip-components 1 -C distro -zxf distro.tar.gz + rm -f distro/AUTHORS distro/LICENSE distro/README + cp -r distro/* $INST_PATH/bin/. + # patch tophat-fusion-post + cp $SCRIPT_PATH/patches/tophat-fusion-post.patch $INST_PATH/bin/tophat-fusion-post + chmod +x $INST_PATH/bin/tophat-fusion-post + touch $SETUP_DIR/tophat.success fi # Install defuse diff --git a/build/patches/tophat-fusion-post.patch b/build/patches/tophat-fusion-post.patch new file mode 100644 index 0000000..d2a74b2 --- /dev/null +++ b/build/patches/tophat-fusion-post.patch @@ -0,0 +1,2929 @@ +#!/usr/bin/env python + + +""" +tophat-fusion-post + +Created by Daehwan Kim on 2011-05-05. +Copyright (c) 2011 Daehwan Kim. All rights reserved. +""" + +import sys, getopt, warnings +import os, subprocess, errno +import copy +import string, re +import random +from datetime import datetime, date, time +import math +from collections import defaultdict +from intervaltree import Interval, IntervalTree + +use_message = ''' +TopHat-Fusion + +Usage: + tophat-fusion-post [options] + +Options: + -v/--version + -o/--output-dir [ default: ./tophatfusion_out ] + --num-fusion-reads [ default: 3 ] + --num-fusion-pairs [ default: 2 ] + --num-fusion-both [ default: 5 ] + --max-num-fusions [ default: 500 ] + --fusion-read-mismatches [ default: 2 ] + --fusion-multireads [ default: 2 ] + --non-human + + -p/--num-threads [ default: 1 ] + + --no-filter-by-annotation + + --skip-fusion-kmer + --skip-filter-fusion + --skip-blast + --skip-read-dist + --skip-html + + --tex-table + + --fusion-pair-dist [ default: 250 ] +''' + + +class Usage(Exception): + def __init__(self, msg): + self.msg = msg + + +output_dir = "./tophatfusion_out/" +logging_dir = output_dir + "logs/" +tmp_dir = output_dir + "tmp/" + +class TopHatFusionParams: + def __init__(self, + keep_tmp = ""): + self.keep_tmp = keep_tmp + + self.num_fusion_reads = 3 + self.num_fusion_pairs = 2 + self.num_fusion_both = 0 + self.max_num_fusions = 500 + + self.fusion_read_mismatches = 2 + self.fusion_multireads = 2 + + self.is_human = True + + self.num_threads = 1 + + self.filter_by_annotation = True + + self.skip_fusion_kmer = False + self.skip_filter_fusion = False + self.skip_blast = False + self.skip_read_dist = False + self.skip_html = False + + self.tex_table = False + + self.fusion_pair_dist = 250 + + def check(self): + if False: + die("Error: arg to --num-threads must be greater than 0") + + def parse_options(self, argv): + try: + opts, args = getopt.getopt(argv[1:], "hp:o:", + ["version", + "help", + "output-dir=", + "num-fusion-reads=", + "num-fusion-pairs=", + "num-fusion-both=", + "max-num-fusions=", + "fusion-read-mismatches=", + "fusion-multireads=", + "non-human", + "num-threads=", + "no-filter-by-annotation", + "skip-fusion-kmer", + "skip-filter-fusion", + "skip-blast", + "skip-read-dist", + "skip-html", + "tex-table", + "gtf-file=", + "fusion-pair-dist="]) + except getopt.error, msg: + raise Usage(msg) + + for option, value in opts: + if option in ("-v", "--version"): + print "TopHat v%s" % (get_version()) + sys.exit(0) + if option in ("-h", "--help"): + raise Usage(use_message) + if option == "--num-fusion-reads": + self.num_fusion_reads = int(value) + if option == "--num-fusion-pairs": + self.num_fusion_pairs = int(value) + if option == "--num-fusion-both": + self.num_fusion_both = int(value) + if option == "--max-num-fusions": + self.max_num_fusions = int(value) + if option == "--fusion-read-mismatches": + self.fusion_read_mismatches = int(value) + if option == "--fusion-multireads": + self.fusion_multireads = int(value) + if option == "--non-human": + self.is_human = False + if option in ("-p", "--num-threads"): + self.num_threads = int(value) + if option in ("-o", "--output-dir"): + global output_dir, logging_dir, tmp_dir + output_dir = value + "/" + logging_dir = output_dir + "logs/" + tmp_dir = output_dir + "tmp/" + if option == "--no-filter-by-annotation": + self.filter_by_annotation = False + if option == "--skip-fusion-kmer": + self.skip_fusion_kmer = True + if option == "--skip-filter-fusion": + self.skip_filter_fusion = True + if option == "--skip-blast": + self.skip_blast = True + if option == "--skip-read-dist": + self.skip_read_dist = True + if option == "--skip-html": + self.skip_html = True + if option == "--tex-table": + self.tex_table = True + if option == "--gtf-file": + self.GTF_file = value + if option == "--fusion-pair-dist": + self.fusion_pair_dist = int(value) + + if len(args) < 1: + raise Usage(use_message) + + return args + + +# Returns the current time in a nice format +def right_now(): + curr_time = datetime.now() + return curr_time.strftime("%c") + + +def reverse_complement(seq): + result = "" + for nt in seq: + base = nt + if nt == 'A': + base = 'T' + elif nt == 'a': + base = 't' + elif nt == 'C': + base = 'G' + elif nt == 'c': + base = 'g' + elif nt == 'G': + base = 'C' + elif nt == 'g': + base = 'c' + elif nt == 'T': + base = 'A' + elif nt == 't': + base = 'a' + + result = base + result + + return result + + +def get_chromosome_order(bowtie_index, chrs): + bowtie_header_cmd = ['bowtie', + '--sam', + bowtie_index, + '/dev/null'] + + bowtie_header_filename = tmp_dir + "sam_header" + subprocess.call(bowtie_header_cmd, + stdout=open(bowtie_header_filename, "w"), + stderr=open("/dev/null")) + + bowtie_header_file = open(bowtie_header_filename) + for line in bowtie_header_file: + index = string.find(line, "SN:") + if index != -1: + SN = line[index+3:].split() + chrs.append(SN[0]) + + bowtie_header_file.close() + + +# Ensures that the output, logging, and temp directories are present. If not, +# they are created +def prepare_output_dir(): + if os.path.exists(output_dir): + pass + else: + os.mkdir(output_dir) + + if os.path.exists(logging_dir): + pass + else: + os.mkdir(logging_dir) + + if os.path.exists(tmp_dir): + pass + else: + try: + os.makedirs(tmp_dir) + except OSError, o: + die("\nError creating directory %s (%s)" % (tmp_dir, o)) + + +def check_samples(): + sample_list_filename = output_dir + 'sample_list.txt' + + prev_list, curr_list = [], [] + if os.path.exists(sample_list_filename): + sample_list = open(sample_list_filename, 'r') + for line in sample_list: + prev_list.append(line[:-1]) + sample_list.close() + + for dir in sorted(os.listdir('.')): + if string.find(dir, "tophat_") != 0: + continue + + fusion = dir + "/fusions.out" + if not os.path.exists(fusion): + continue + + curr_list.append(dir[7:]) + + if prev_list != curr_list: + sample_list = open(sample_list_filename, 'w') + for sample in curr_list: + print >> sample_list, sample + sample_list.close() + return True + else: + return False + + +def map_fusion_kmer(bwt_idx_prefix, params, sample_update = False): + def get_fusion_seq(): + seq_dic = {} + for dir in sorted(os.listdir('.')): + if string.find(dir, "tophat_") != 0: + continue + + fusion = dir + "/fusions.out" + if not os.path.exists(fusion): + continue + + fusion_file = open(fusion, 'r') + fusion_file.readline() + for line in fusion_file: + left_seq, right_seq = line[:-1].split('\t@\t')[2:4] + left_seq = left_seq.split(' ')[0] + right_seq = right_seq.split(' ')[1] + + if len(left_seq) < 23 or len(right_seq) < 23: + continue + + seq_dic[left_seq[-23:]] = 1 + seq_dic[right_seq[:23]] = 1 + + fusion_file.close() + + fusion_seq_fa = open(output_dir + "fusion_seq.fa", 'w') + for seq in seq_dic.keys(): + print >> fusion_seq_fa, ">%s" % seq + print >> fusion_seq_fa, seq + + fusion_seq_fa.close() + + def convert_bowtie(): + bwt_dic = {} + bwtout_file = open(output_dir + "fusion_seq.bwtout", 'r') + for line in bwtout_file: + seq, temp, chr, coord = line[:-1].split('\t')[0:4] + if seq in bwt_dic: + bwt_dic[seq].append(chr + ":" + coord) + else: + bwt_dic[seq] = [chr + ":" + coord] + + bwtout_file.close() + + kmer_map = open(fusion_kmer_file_name, 'w') + for seq, chrs in bwt_dic.items(): + print >> kmer_map, "%s\t%s" % (seq, ','.join(chrs)) + + kmer_map.close() + + print >> sys.stderr, "[%s] Extracting 23-mer around fusions and mapping them using Bowtie" % right_now() + if sample_update: + print >> sys.stderr, "\tsamples updated" + + fusion_kmer_file_name = output_dir + "fusion_seq.map" + + if not os.path.exists(fusion_kmer_file_name) or \ + os.stat(fusion_kmer_file_name).st_size <= 0 or \ + sample_update: + get_fusion_seq() + cmd = ['bowtie', '-p', '8', '-a', '-n', '3', '-m', '100', bwt_idx_prefix, '-f', '%sfusion_seq.fa' % output_dir] + subprocess.call(cmd, stdout=open(output_dir + 'fusion_seq.bwtout', 'w'), stderr=open('/dev/null', 'w')) + convert_bowtie() + + +def filter_fusion(bwt_idx_prefix, params): + chrs = [] + get_chromosome_order(bwt_idx_prefix, chrs) + + chr_order = {} + for chr in chrs: + chr_order[chr] = len(chr_order) + + ### Compute number of paired reads ### + # Helper classes + class GeneEntry(object): + fields = [('bin', int), + ('name', str), + ('chrom', str), + ('strand', str), + ('txStart', int), + ('txEnd', int), + ('cdsStart', int), + ('cdsEnd', int), + ('exonCount', int), + ('exonStarts', lambda x: [int(y) for y in list(x.split(',')[:-1])]), + ('exonEnds', lambda x: [int(y) for y in list(x.split(',')[:-1])]), + ('score', int), + ('name2', str), + ('cdsStartStat', str), + ('cdsEndStat', str), + ('exonFrames', lambda x: [int(y) for y in list(x.split(',')[:-1])])] + + '''Representation of one line in the refGene.txt file.''' + def __init__(self, entry_string): + tokens = entry_string.split('\t') # Assumes tab delimited. + for (field, type_), token in zip(self.fields, tokens): + setattr(self, field, type_(token)) + + def __str__(self): + return str(self.__dict__) + + def __repr__(self): + return str(self) + + class Fusion(object): + def __init__(self, info, pairs): + # args should be info[:4] from the fusions.out entry + # Parse the chromosome string + self.chrL, self.chrR = info[0].rstrip().split('-') + + # Fusion positions + self.posL = int(info[1]) + self.posR = int(info[2]) + + # Strand + self.strandL = info[3][0] + self.strandR = info[3][1] + + # Pairs + pairs = pairs.rstrip() + if pairs: + self.pairs = [pair.split(':') for pair in pairs.split(' ')] + self.pairs = [(int(p1), int(p2)) for (p1, p2) in self.pairs] + else: + self.pairs = [] + + def get_sign(self, which): + switch = which == 'R' + strand = self.strandL if which == 'L' else self.strandR + if strand == 'r': + return -1 if not switch else 1 + else: + return 1 if not switch else -1 + + class TransMaps(object): + def __init__(self, fusion, juncs): + self.fusion = fusion + self.chroms = (fusion.chrL, fusion.chrR) + self.maps = {} + self.starts = {} + self.juncs = juncs + + def add_map(self, chrom, start, stop, strand, fusion_pos): + self.maps[chrom, strand] = self.compute_transcript_map(chrom, start, stop, strand, fusion_pos) + self.starts[chrom, strand] = start + + def map(self, chrom, pos, strand): + pos2 = pos - self.starts[chrom,strand] + the_map = self.maps[chrom,strand] + if pos2 < 0 or pos2 >= len(the_map): + # out of bounds - return distance to boundary + if pos2 < 0: + return the_map[0] - pos2 + else: + return the_map[-1] + (pos2 - len(the_map) + 1) + else: + # position in the map + return the_map[pos2] + + def compute_transcript_map(self, chrom, start, stop, strand, fusion_pos): + #return list(range(0, stop-start+1)) + + chrom = self.chroms[chrom] + w = stop - start + 1 + + # Find junctions within the interval + # strict=True means that only intervals entirely contained are returned + # The junction dict contains both {end: start} and {start: end} + junctions = defaultdict(set) + for junc in self.juncs[chrom].search(start, stop, strict=True): + if junc[2][0] == strand: + junctions[junc[1]-start].add(junc[0]-start) + junctions[junc[0]-start].add(junc[1]-start) + + # Initialize distance vector + distance = [abs(ii-fusion_pos) for ii in range(start, stop+1)] + + # Working out from fusion position, follow junctions + # First, get positions in the right order + fusii = fusion_pos - start + N = len(distance) + positions = [fusii] + up = 1 + down = 1 + while len(positions) < len(distance): + if fusii - down >= 0: + positions.append(fusii-down) + down += 1 + if fusii + up < N: + positions.append(fusii+up) + up += 1 + + # Second, compute the transcript distance at each position + # Skip the first position, which is the fusion break (distance=0) + for ii in positions[1:]: + ii_ = ii + (1 if fusii-ii >= 0 else -1) + if ii in junctions: + # If ends of junction are further away, ignore them. + # If ends of junction are closer to fusion, shorten distance + distance[ii] = min( [distance[ii_]] + [distance[jj] for jj in junctions[ii] + if abs(jj-fusii) < abs(ii-fusii)] ) + 1 + else: + distance[ii] = distance[ii_] + 1 + + # Re-sign the distances + # Users downstream will need to work out fusion arm orientation + distance[:fusii] *= -1 + return distance + + # Helper functions + def load_junctions(refgene_file, ensgene_file, juncs_file): + def _load(gene_file, _junctions): + for line in open(gene_file): + entry = GeneEntry(line) + # Iterate over introns + for start, stop in zip(entry.exonEnds[:-1], entry.exonStarts[1:]): + _junctions[entry.chrom].addi(start, stop+1, (entry.strand, 'intron')) + + # junctions are Interval(start, stop, (strand, ...)) + _junctions = defaultdict(IntervalTree) + + # Load exons from refGene.txt and ensGene.txt + if refgene_file is not None: + _load(refgene_file, _junctions) + + if ensgene_file is not None: + _load(ensgene_file, _junctions) + + # Load junctions from junctions.bed + if juncs_file is not None: + for line in open(juncs_file): + if line[:5] == 'track': + continue + chrom, start, stop, _, _, strand, _, _, _, _, overhangs, _ = line.split('\t') + a,b = overhangs.split(',') + _junctions[chrom].addi( int(start)+int(a), int(stop)-int(b)+2, (strand, 'intron') ) + + return _junctions + + def get_transcript_maps(fusion, junctions): + # Find left side range + p1s = [p1 for p1, _ in fusion.pairs] + dL = max([0, max(p1s)]) + dL2 = min([0, min(p1s)]) + + s = fusion.get_sign('L') + startL = fusion.posL - s*dL + endL = fusion.posL - s*dL2 + if startL > endL: + startL, endL = endL, startL + + # Find right side range + p2s = [p2 for _, p2 in fusion.pairs] + dR = max([0, max(p2s)]) + dR2 = min([0, min(p2s)]) + + s = fusion.get_sign('R') + startR = fusion.posR - s*dR + endR = fusion.posR - s*dR2 + if startR > endR: + startR, endR = endR, startR + + # Make maps + tmaps = TransMaps(fusion, junctions) + tmaps.add_map(0, startL, endL, '-', fusion.posL) + tmaps.add_map(0, startL, endL, '+', fusion.posL) + tmaps.add_map(1, startR, endR, '-', fusion.posR) + tmaps.add_map(1, startR, endR, '+', fusion.posR) + + for m in tmaps.maps.values(): + if len(m)==0: + raise Exception() + + return tmaps + + def pick_short(a, b): + return a if abs(a) < abs(b) else b + + def convert_pair_distances(fusion, junctions): + # Define transcript maps + # These map genomic coordinates to transcript coordinates. + tmaps = get_transcript_maps(fusion, junctions) + + # Compute inner distances + # Because we don't know which strand, we'll take the min of the + # transcript distances on each strand. + pairs = [] + for p1, p2 in fusion.pairs: + s1 = fusion.get_sign('L') + p1 = p1 * s1 + d1 = s1* pick_short( tmaps.map(0, fusion.posL, '-') - tmaps.map(0, fusion.posL - p1, '-'), + tmaps.map(0, fusion.posL, '+') - tmaps.map(0, fusion.posL - p1, '+') ) + + s2 = fusion.get_sign('R') + p2 = p2 * s2 + d2 = s2* pick_short( tmaps.map(1, fusion.posR, '-') - tmaps.map(1, fusion.posR - p2, '-'), + tmaps.map(1, fusion.posR, '+') - tmaps.map(1, fusion.posR - p2, '+') ) + + pairs.append((d1,d2)) + + return pairs + + def get_valid_pairs(info, pairs, junctions): + # This method converts the distances from spanning pairs to the fusion + # from genomic to transcript coordinates - i.e. how far are the pairs + # in the processed transcript. + # Parse fusion + fusion = Fusion(info, pairs) + + if len(fusion.pairs) == 0: + return [] + + # Convert pairs to transcript coordinates + tpairs = convert_pair_distances(fusion, junctions) + + # Return pairs with inner distance less than the threshold + + pairs = [(a,b) for (a,b) in tpairs if abs(a)+abs(b) <= params.fusion_pair_dist] + return pairs + + ## Filter fusion implementation ## + def filter_fusion_impl(fusion, junctions, refGene_list, ensGene_list, seq_chr_dic, fusion_gene_list): + def gene_exists(gene_list, chr, coord, dir, is_left): + min = 0 + max = len(gene_list) - 1 + while max - min >= 0: + mid = (min + max) / 2 + gene = gene_list[mid] + ref_chr = gene[1] + + if chr != ref_chr: + if chr_order[chr] < chr_order[ref_chr]: + max = mid - 1 + else: + min = mid + 1 + continue + + left_coord = gene[2] + right_coord = gene[3] + + if coord >= left_coord and coord <= right_coord: + left_coords = gene[5] + right_coords = gene[6] + sense = gene[7] + + belong = False + where = "outside" + for i in range(len(left_coords)): + # gives some relax! + relax = 3 + + left = int(left_coords[i]) - 1 + right = int(right_coords[i]) - 1 + if coord <= right + relax: + if coord < left - relax: + where = "intron%d(%d-%d)" % (i, int(right_coords[i-1]), left - 1) + else: + if ((is_left and dir == "f") or (not is_left and dir == "r")) and abs(coord - right) <= relax: + belong = True + + if ((is_left and dir == "r") or (not is_left and dir == "f")) and abs(coord - left) <= relax: + belong = True + + where = "exon%d(%d-%d)" % (i + 1, left, right) + break + + return [gene[0], gene[4], where, belong, sense] + elif coord < left_coord: + max = mid - 1 + else: + min = mid + 1 + + return ["N/A", "N/A", "N/A", False, "N/A"] + + def how_diff(first, second): + seq_len = len(first) + + min_value = 10000 + prev, curr = [0 for i in range(seq_len)], [0 for i in range(seq_len)] + for j in range(seq_len): + for i in range(seq_len): + value = 10000 + if first[i] == second[j]: + match = 0 + else: + match = 1 + + # right + if i == 0: + value = j * 2 + match + elif j > 0: + value = prev[i] + 2 + + temp_value = 10000 + + # down + if j == 0: + temp_value = i * 2 + match + elif i > 0: + temp_value = curr[i-1] + 2 + + if temp_value < value: + value = temp_value + + # match + if i > 0 and j > 0: + temp_value = prev[i-1] + match + + if temp_value < value: + value = temp_value + + curr[i] = value + + if (i == seq_len - 1 or j == seq_len - 1) and value < min_value: + min_value = value + + prev, curr = curr, prev + + return min_value + + kmer_len = len(seq_chr_dic.keys()[0]) + sample_name = fusion.split("/")[0][len("tophat_"):] + + data = os.getcwd().split('/')[-1] + + fusion_file = open(fusion, 'r') + fusion_file.readline() + for line in fusion_file: + # Parse the fusion entry + info, sim, left_seq_org, right_seq_org, left_dist, right_dist, pair_list = line[:-1].split('\t@\t')[:7] + info = info.split('\t') + if sim.strip() == "": + continue + sim = sim.split(' ') + left_seq = left_seq_org.replace(' ', '') + right_seq = right_seq_org.replace(' ', '') + + num_reads = int(info[4]) + + # Correct the number of spanning pairs by filtering on the inner distance + # based on transcript coordinates instead of genomic coordinates + pairs = get_valid_pairs(info[:4], pair_list, junctions) + num_pair_ends = len(pairs) + + # Format pairs for usage downstream + pairs = ['{0}:{1}'.format(a,b) for (a,b) in pairs] + + num_pair_ends_fusion = int(info[6]) + num_pair_ends_both = int(num_pair_ends + num_pair_ends_fusion * 0.5) + num_contracting_reads = int(info[7]) + left_ext = int(info[8]) + right_ext = int(info[9]) + sym = float(info[10]) + + half_len = len(left_seq) / 2 + chr1, chr2 = info[0].split('-')[:2] + coord1, coord2 = int(info[1]), int(info[2]) + dir = info[3] + + if string.find(sample_name, "single") != -1: + single = True + else: + single = False + + # Begin fusion filters + if left_ext < 16 or right_ext < 16: + continue + + both = num_reads + num_pair_ends_both + all = both + + if num_pair_ends > num_reads * 50: + continue + + if num_reads < params.num_fusion_reads or \ + num_pair_ends < params.num_fusion_pairs or \ + both < params.num_fusion_both: + continue + + """ + if (chr1 != chr2 and num_contracting_reads > num_reads) or \ + (chr1 == chr2 and num_contracting_reads > all + num_pair_ends + 5): + continue + """ + + # are the sequences around the breakpoint different enough? + if int(sim[0]) < 8: + continue + + # are the reads distributed symmetrically? + if sym >= 22 + max(0, 6 - num_reads): + continue + + max_intron_len = 100000 + if chr1 == chr2 and dir == "ff": + coord_dif = coord2 - coord1 + if coord_dif > 0 and coord_dif < max_intron_len: + continue + + if not left_seq[half_len-kmer_len:half_len] in seq_chr_dic or not right_seq[half_len:half_len+kmer_len] in seq_chr_dic: + continue + + left_chrs = seq_chr_dic[left_seq[half_len-kmer_len:half_len]] + right_chrs = seq_chr_dic[right_seq[half_len:half_len+kmer_len]] + + if chr1 == chr2: + max_intron_len = min(max_intron_len, abs(coord1 - coord2) * 9 / 10) + + same = False + + def split_chr_coord(chr_coord): + coord = chr_coord.split(':')[-1] + return chr_coord.rstrip(':' + coord), coord + + for chr_coord in left_chrs: + chr, coord = split_chr_coord(chr_coord) + coord = int(coord) + if chr == chr2 and abs(coord - coord2) < max_intron_len: + same = True + break + + if same: + continue + + for chr_coord in right_chrs: + chr, coord = split_chr_coord(chr_coord) + coord = int(coord) + if chr == chr1 and abs(coord - coord1) < max_intron_len: + same = True + break + + if same: + continue + + def find_gene(chr, coord, one_dir, is_left): + result = [] + for gene_list in [refGene_list, ensGene_list]: + result.append(gene_exists(gene_list, chr, coord, one_dir, is_left)) + + if result[0][0] == "N/A": + return result[1] + result[1][:2] + else: + return result[0] + result[1][:2] + + dir = info[3] + gene1, gene1_name, gene1_where, gene1_belong, gene1_sense, ens_gene1, ens_gene1_name = find_gene(chr1, coord1, dir[0], True) + gene2, gene2_name, gene2_where, gene2_belong, gene2_sense, ens_gene2, ens_gene2_name = find_gene(chr2, coord2, dir[1], False) + if params.filter_by_annotation: + if gene1_name == gene2_name or ens_gene1_name == ens_gene2_name or ens_gene1 == ens_gene2: + continue + + if gene1 == "N/A" or gene2 == "N/A" or (string.find(gene1, "ENS") == 0 and string.find(gene2, "ENS") == 0): + continue + + left_diff = how_diff(left_seq[half_len - 20:half_len], right_seq[half_len - 20:half_len]) + if left_diff <= 8: + continue + + right_diff = how_diff(left_seq[half_len:half_len+20], right_seq[half_len:half_len+20]) + if right_diff <= 8: + continue + + if left_diff + right_diff < 20: + continue + + left_dist = left_dist.strip().split(' ') + right_dist = right_dist.strip().split(' ') + + for i in range(len(left_dist)): + left_dist[i] = '%d' % min(9, int(left_dist[i])) + right_dist[i] ='%d' % min(9, int(right_dist[i])) + + swap = False + if (dir == 'ff' and gene1_sense == '-' and gene2_sense == '-') or \ + (dir == 'rr' and gene1_sense == '+' and gene2_sense == '+') or \ + (dir == 'fr' and gene1_sense == '-' and gene2_sense == '+') or \ + (dir == 'rf' and gene1_sense == '+' and gene2_sense == '-'): + swap = True + + if swap: + if dir == 'ff': + dir = 'rr' + elif dir == 'rr': + dir = 'ff' + + info[0] = "%s-%s" % (chr2, chr1) + info[1:3] = info[1:3][::-1] + info[3] = dir + info[8:10] = info[8:10][::-1] + left_seq_org, right_seq_org = reverse_complement(right_seq_org), reverse_complement(left_seq_org) + left_dist, right_dist = right_dist, left_dist + gene1_name, gene1_where, gene1_sense, gene2_name, gene2_where, gene2_sense = \ + gene2_name, gene2_where, gene2_sense, gene1_name, gene1_where, gene1_sense + for j in range(len(pairs)): + pair = pairs[j].split(':') + pairs[j] = ':'.join(pair[::-1]) + + # check if this is due to trans-splicing. + if gene1_sense in "+-" and gene2_sense in "+-": + if ((dir == 'ff' or dir == 'rr') and gene1_sense != gene2_sense) or \ + (dir == 'fr' and (gene1_sense != '+' or gene2_sense != '-')) or \ + (dir == 'rf' and (gene1_sense != '-' or gene2_sense != '+')): + # print >> sys.stderr, "fusion due to trans-splicing", info + # print >> sys.stderr, gene1, gene1_name, gene1_where, gene1_sense + # print >> sys.stderr, gene2, gene2_name, gene2_where, gene2_sense + None + + fusion_gene = [] + info[5] = str(num_pair_ends) + fusion_gene.append(sample_name + ' ' + ' '.join(info[:10])) + fusion_gene.append(left_seq_org) + fusion_gene.append(right_seq_org) + fusion_gene.append("%s %s" % (''.join(left_dist[::-1]), ''.join(right_dist))) + + fusion_gene.append("%s %s %s %s" % (gene1_name, gene1_where, gene2_name, gene2_where)) + fusion_gene.append(" ".join(pairs)) + + fusion_gene_list.append(fusion_gene) + + fusion_file.close() + + print >> sys.stderr, "[%s] Filtering fusions" % right_now() + + seq_chr_dic = {} + seq_chr_file = open(output_dir + "fusion_seq.map", 'r') + for line in seq_chr_file: + seq, chrs = line[:-1].split('\t') + chrs = chrs.split(',') + seq_chr_dic[seq] = chrs + + seq_chr_file.close() + + re_mir = re.compile(r'^(MIR)') + def read_genes(gene_file_name, offset = 1, id = -4): + gene_list, temp_gene_list = [], [] + if not os.path.exists(gene_file_name): + return gene_list + + gene_file = open(gene_file_name, 'r') + for line in gene_file: + line = line[:-1].split('\t')[offset:] + num_exons = int(line[7]) + left_coords = line[8].split(',')[:num_exons] + right_coords = line[9].split(',')[:num_exons] + + if line[1] in chr_order and not re_mir.findall(line[id]): + temp_gene_list.append([line[0], line[1], int(line[3]), int(line[4]), line[id], left_coords, right_coords, line[2]]) + + gene_file.close() + + def my_cmp(a, b): + if a[1] != b[1]: + if chr_order[a[1]] < chr_order[b[1]]: + return -1 + else: + return 1 + else: + if a[2] != b[2]: + if a[2] < b[2]: + return -1 + else: + return 1 + else: + if a[3] > b[3]: + return -1 + else: + return 1 + + temp_gene_list = sorted(temp_gene_list, cmp=my_cmp) + gene_list = [] + if len(temp_gene_list) >= 1: + gene_list.append(temp_gene_list[0]) + + # remove overlapping genes, that is, the longest genes are preferred + for i in range(1, len(temp_gene_list)): + gene1 = temp_gene_list[i-1] + gene2 = temp_gene_list[i] + + if gene1[1] == gene2[1] and gene1[3] >= gene2[3]: + continue + + gene_list.append(gene2) + + return gene_list + + refGene_list, ensGene_list = [], [] + if os.path.exists("refGene.txt"): + refGene_list = read_genes("refGene.txt") + if os.path.exists("ensGene.txt"): + ensGene_list = read_genes("ensGene.txt") + + fusion_gene_list = [] + for file in sorted(os.listdir(".")): + if string.find(file, "tophat_") != 0: + continue + + fusion_file = file + "/fusions.out" + if not os.path.exists(fusion_file): + continue + + juncs_file = file + '/junctions.bed' + if not os.path.exists(juncs_file): + juncs_file = None + print >> sys.stderr, 'Warning: could not find juctions.bed (%s).' % (juncs_file) + + ref_file = "refGene.txt" + if not os.path.exists(ref_file): + ref_file = None + + ens_file = "ensGene.txt" + if not os.path.exists(ens_file): + ens_file = None + + if juncs_file is None and ref_file is None and ens_file is None: + print >> sys.stderr, 'Warning: neither junctions.bed nor ref/ensGene.txt found.' + + junctions = load_junctions(ref_file, ens_file, juncs_file) + + print >> sys.stderr, "\tProcessing:", fusion_file + filter_fusion_impl(fusion_file, junctions, refGene_list, ensGene_list, seq_chr_dic, fusion_gene_list) + + fusion_out_file = output_dir + "potential_fusion.txt" + output_file = open(fusion_out_file, 'w') + for fusion_gene in fusion_gene_list: + for line in fusion_gene: + print >> output_file, line + output_file.close() + print >> sys.stderr, '\t%d fusions are output in %s' % (len(fusion_gene_list), fusion_out_file) + + + +def parallel_work(pids, work): + child = -1 + for i in range(len(pids)): + if pids[i] == 0: + child = i + break + + while child == -1: + status = os.waitpid(0, 0) + for i in range(len(pids)): + if status[0] == pids[i]: + child = i + pids[i] = 0 + break + + child_id = os.fork() + if child_id == 0: + work() + os._exit(os.EX_OK) + else: + # print >> sys.stderr, '\t\t>> thread %d: %d' % (child, child_id) + pids[child] = child_id + + +def wait_pids(pids): + for pid in pids: + if pid > 0: + os.waitpid(pid, 0) + + +def do_blast(params): + print >> sys.stderr, "[%s] Blasting 50-mers around fusions" % right_now() + + file_name = output_dir + "potential_fusion.txt" + blast_genomic_out = output_dir + "blast_genomic" + blast_nt_out = output_dir + "blast_nt" + + if not os.path.exists(blast_genomic_out): + os.system("mkdir %s" % blast_genomic_out); + + if not os.path.exists(blast_nt_out): + os.system("mkdir %s" % blast_nt_out) + + pids = [0 for i in range(params.num_threads)] + + count = 0 + line_no = 0 + + check_list = [] + output_list = [] + output = "" + file = open(file_name, 'r') + for line in file: + if line_no % 6 == 0: + count += 1 + + if line_no % 6 == 1: + left_seq = line[:-1].split(" ")[0] + + if line_no % 6 == 2: + right_seq = line[:-1].split(" ")[1] + + if line_no % 6 == 4: + def blast(database, seq, outdir): + file_name = "%s/%s" % (outdir, seq) + if os.path.exists(file_name): + return + + blast_cmd = "echo %s | blastn -db %s -evalue 1e-10 -word_size 28" + # blast_cmd = "echo %s | blastall -p blastn -d %s -e 1e-10 -W 28" + output = os.popen(blast_cmd % (seq, database)).read() + + if str.find(output, "No hits found") != -1: + blast_cmd = "echo %s | blastn -db %s -evalue 1e-5" + # blast_cmd = "echo %s | blastall -p blastn -d %s -e 1e-5" + output = os.popen(blast_cmd % (seq, database)).read() + + pos1 = str.find(output, ">ref|") + pos2 = str.find(output, "Database: ", pos1) + + if pos1 != -1 and pos1 < pos2: + output = output[pos1:pos2].rstrip() + else: + output = "" + + file = open(file_name, "w") + file.write(output) + file.close() + + seq = left_seq + right_seq + + def work(): + if params.is_human: + blast_genomic = "blast/human_genomic" + else: + blast_genomic = "blast/other_genomic" + + blast_nt = "blast/nt" + + blast(blast_genomic, left_seq, blast_genomic_out) + blast(blast_genomic, right_seq, blast_genomic_out) + blast(blast_genomic, seq, blast_genomic_out) + blast(blast_nt, left_seq, blast_nt_out) + blast(blast_nt, right_seq, blast_nt_out) + blast(blast_nt, seq, blast_nt_out) + + if not os.path.exists(output_dir + "blast_nt/" + seq ): + print >> sys.stderr, "\t%d. %s" % (count, line[:-1]) + if params.num_threads <= 1: + work() + else: + parallel_work(pids, work) + + line_no += 1 + + if params.num_threads > 1: + wait_pids(pids) + + +def read_dist(params): + def alignments_region(sample_name, bam, alignments_list): + def output_region(sample_name, fusion, reads): + chr_ref, pos1_ref, pos2_ref, dir_ref = fusion + chr1_ref, chr2_ref = chr_ref.split('-') + + if len(reads) <= 0: + return + + alignments_file_name = "%s/%s_%s_%d_%d_%s" % (alignments_dir, sample_name, chr_ref, pos1_ref, pos2_ref, dir_ref) + alignments_file = open(alignments_file_name, "w") + + def my_cmp(a, b): + if a[3] and not b[3]: + return -1 + elif not a[3] and b[3]: + return 1 + + if a[3]: + if dir_ref[0] == "f": + return a[4] - b[4] + else: + return b[4] - a[4] + else: + if dir_ref[1] == "f": + return a[4] - b[4] + else: + return b[4] - a[4] + + reads = sorted(reads, cmp=my_cmp) + begin_pos = reads[0][4] + dist_to_breakpoint = abs(pos1_ref - begin_pos) + for read in reads: + seq_pos = 0 + read_id, chr1, chr2, before, left_pos, right_pos, cigars, seq, qual, mismatch, left = read[:-1] + space_len = 0 + if before: + space_len = abs(left_pos - begin_pos) + else: + space_len = abs(pos1_ref - begin_pos) + 2 + if dir_ref[1] == "f": + space_len += (left_pos - pos2_ref) + else: + space_len += (pos2_ref - left_pos) + + qual_str = seq_str = ' ' * space_len + + saw_fusion = False + for cigar in cigars: + length = int(cigar[:-1]) + cigar_op = cigar[-1] + + if cigar_op in "Mm": + seq_str += seq[seq_pos:seq_pos + length] + qual_str += qual[seq_pos:seq_pos + length] + seq_pos += length + + if cigar_op in "Nn": + seq_str += ('|' * length) + qual_str += ('|' * length) + + if cigar_op in "Dd": + seq_str += ("D" * length) + qual_str += ("D" * length) + + if cigar_op in "Ii": + seq_pos += length + + if cigar_op in "F": + seq_str += " " + qual_str += " " + + cigar_str = ''.join(cigars) + prefix_str = '%s %s %d %d %s' % (chr1, chr2, left_pos, right_pos, cigar_str) + prefix_str += ' ' * (60 - len(prefix_str)) + + L = "L" + if not left: + L = "R" + + begin, end = max(0, dist_to_breakpoint - within), dist_to_breakpoint + within + output = '%s %s\n' % (prefix_str, seq_str[begin:end]) + alignments_file.write(output) + + alignments_file.close() + + + cigar_re = re.compile('\d+\w') + within = 300 + + old_chr = "" + reads_list = [[] for i in range(len(alignments_list))] + reads_compress_list = [[1, 0] for i in range(len(alignments_list))] + + cmd = ['samtools', 'view', '-h', bam] + popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=open('/dev/null')) + + contigs = {} + for line in popen.stdout: + if line[0] == '@': + if line[1:3] == 'SQ': + line = line[:-1].split('\t') + contig = line[1].split(':')[1] + if contig not in contigs: + contigs[contig] = len(contigs) + + continue + + line = line[:-1].split('\t') + + read_id, chr1, left_pos, cigar_str, seq, qual = line[0], line[2], int(line[3]) - 1, line[5], line[9], line[10] + + if chr1 != old_chr: + old_chr = chr1 + # print "%s - %s" % (sample_name, chr1) + + secondary_fusion_alignment = False + num_hits = 0 + for field in line[10:]: + if field[:5] == "XF:Z:": + if field[5] == '2': + secondary_fusion_alignment = True + + chr1, left_pos, cigar_str, seq, qual = field[7:].split() + left_pos = int(left_pos) - 1 + + if field[:5] == "NM:i:": + mismatch = int(field.split(':')[-1]) + + if field[:5] == "NH:i:": + num_hits = int(field.split(':')[-1]) + + if num_hits > params.fusion_multireads: + continue + + if secondary_fusion_alignment: + continue + + flag = int(line[1]) + left = (flag & 64 != 0) + antisense = (flag & 16 != 0) + + if '-' in chr1: + chr1, chr2 = chr1.split('-') + else: + chr2 = chr1 + + right_pos = left_pos + cigars = cigar_re.findall(cigar_str) + dir = "ff" + saw_fusion = False + for cigar in cigars: + length = int(cigar[:-1]) + cigar_op = cigar[-1] + + if cigar_op in "MDN": + right_pos += length + + if saw_fusion: + dir = dir[0] + "f" + else: + dir = "f" + dir[1] + + if cigar_op in "mdn": + right_pos -= length + + if saw_fusion: + dir = dir[0] + "r" + else: + dir = "r" + dir[1] + + if cigar_op in "F": + if dir[0] == "f": + pos1 = right_pos - 1 + else: + pos1 = right_pos + 1 + + pos2 = right_pos = length - 1 + saw_fusion = True + + if cigar_op in "iIdD": + mismatch -= length + + if not saw_fusion: + report_idx = -1 + for i in range(len(alignments_list)): + chr_ref, pos1_ref, pos2_ref = alignments_list[i][:3] + chr1_ref, chr2_ref = chr_ref.split('-') + + """ + # a fusion point can be given like "chr9-chr1" + if (contigs[chr1_ref] < contigs[chr1] or (contigs[chr1_ref] == contigs[chr1] and left_pos - pos1_ref > 1000000)) and \ + (contigs[chr2_ref] < contigs[chr1] or (contigs[chr2_ref] == contigs[chr1] and left_pos - pos2_ref > 1000000)): + report_idx = i + else: + break + """ + + if report_idx >= 0: + for j in range(report_idx+1): + output_region(sample_name, alignments_list[j], reads_list[j]) + + alignments_list = alignments_list[report_idx+1:] + reads_list = reads_list[report_idx+1:] + + if len(alignments_list) <= 0: + break + + fusion_read = saw_fusion + if mismatch > params.fusion_read_mismatches: + continue + + for i in range(len(alignments_list)): + chr_ref, pos1_ref, pos2_ref, dir_ref = alignments_list[i] + chr1_ref, chr2_ref = chr_ref.split('-') + + def reverse_read(fusion_read, dir, antisense, left_pos, right_pos, fusion_left, seq, qual, cigars): + if dir == 'ff' or not fusion_read: + left_pos, right_pos = right_pos - 1, left_pos - 1 + elif dir == 'fr': + left_pos, right_pos = right_pos + 1, left_pos - 1 + elif dir == 'rf': + left_pos, right_pos = right_pos - 1, left_pos + 1 + elif dir == 'rr': + left_pos, right_pos = right_pos + 1, left_pos + 1 + + reversed_cigars = [] + cigars.reverse() + for i in range(len(cigars)): + cigar = cigars[i] + opcode = cigar[-1] + + if opcode == 'F': + reversed_cigars.append("%dF" % fusion_left) + else: + if str.islower(opcode): + opcode = str.upper(opcode) + else: + opcode = str.lower(opcode) + + cigar = cigar[:-1] + opcode + reversed_cigars.append(cigar) + + if dir == 'ff': + dir = 'rr' + elif dir == 'rr': + dir = 'ff' + + return dir, antisense == False, left_pos, right_pos, reverse_complement(seq), qual[::-1], reversed_cigars + + if fusion_read and chr1 == chr2_ref and chr2 == chr1_ref and pos1 == pos2_ref and pos2 == pos1_ref: + dir, antisense, left_pos, right_pos, seq, qual, cigars = \ + reverse_read(fusion_read, dir, antisense, left_pos, right_pos, pos1, seq, qual, cigars) + + chr1, chr2 = chr2, chr1 + pos1, pos2 = pos2, pos1 + + if (fusion_read and chr1 == chr1_ref and chr2 == chr2_ref) or \ + (not fusion_read and (chr1 == chr1_ref or chr1 == chr2_ref)): + if not fusion_read: + do_not_continue = False + if chr1 == chr1_ref: + if abs(left_pos - pos1_ref) <= 10000 or abs(right_pos - pos1_ref) <= 10000: + do_not_continue = True + + if chr1 == chr2_ref: + if abs(left_pos - pos2_ref) <= 10000 or abs(right_pos - pos2_ref) <= 10000: + do_not_continue = True + + if not do_not_continue: + continue + + seq_temp = seq + qual_temp = qual + cigars_temp = cigars + left_pos_temp, right_pos_temp = left_pos, right_pos + + before = True + if fusion_read: + if dir != dir_ref: + continue + + if pos1 != pos1_ref or pos2 != pos2_ref: + continue + else: + do_not_continue = False + if chr1 == chr1_ref: + if dir_ref[0] == "f" and not (right_pos - pos1_ref >= 10 or pos1_ref - right_pos >= within): + do_not_continue = True + if dir_ref[0] == "r" and not (pos1_ref - left_pos >= 10 or left_pos - pos1_ref >= within): + do_not_continue = True + + if chr1 == chr2_ref: + if dir_ref[1] == "f" and not (pos2_ref - left_pos >= 10 or left_pos - pos2_ref >= within): + do_not_continue = True + before = False + if dir_ref[1] == "r" and not (right_pos - pos2_ref >= 10 or pos2_ref - right_pos >= within): + do_not_continue = True + before = False + + if not do_not_continue: + continue + + if (before and dir_ref[0] == "r") or (not before and dir_ref[1] == "r"): + dir, antisense, left_pos_temp, right_pos_temp, seq_temp, qual_temp, cigars_temp = \ + reverse_read(fusion_read, dir, antisense, left_pos_temp, right_pos_temp, 0, seq_temp, qual_temp, cigars_temp) + + reads_list[i].append([read_id, chr1, chr2, before, left_pos_temp, right_pos_temp, cigars_temp, seq_temp, qual_temp, mismatch, left, line]) + reads_compress_list[i][1] += 1 + + + for i in range(len(alignments_list)): + output_region(sample_name, alignments_list[i], reads_list[i]) + + print >> sys.stderr, "[%s] Generating read distributions around fusions" % right_now() + + file_name = output_dir + "potential_fusion.txt" + file = open(file_name, 'r') + + alignments_dir = output_dir + "read_alignments" + alignments_list = {} + line_no = 0 + for line in file: + if line_no % 6 == 0: + temp_list = line[:-1].split(' ') + sample_name = temp_list[0] + + if not sample_name in alignments_list: + alignments_list[sample_name] = [] + + if not os.path.exists("%s/%s_%s" % (alignments_dir, sample_name, "_".join(temp_list[1:5]))): + alignments_list[sample_name].append(' '.join(temp_list[1:5])) + + line_no += 1 + + file.close() + + if not os.path.exists(alignments_dir): + os.system("mkdir %s" % alignments_dir) + + pids = [0 for i in range(params.num_threads)] + + for sample_name, list in alignments_list.items(): + bam_file_name = 'tophat_%s/accepted_hits.bam' % sample_name + if not os.path.exists(bam_file_name): + continue + + increment = 50 + for i in range((len(list) + increment - 1) / increment): + temp_list = list[i*increment:(i+1)*increment] + print >> sys.stderr, '\t%s (%d-%d)' % (sample_name, i*increment + 1, min((i+1)*increment, len(list))) + + alignments_list = [] + for fusion in temp_list: + print >> sys.stderr, '\t\t%s' % fusion + fusion = fusion.split() + alignments_list.append([fusion[0], int(fusion[1]), int(fusion[2]), fusion[3]]) + + def work(): + if len(alignments_list) > 0: + alignments_region(sample_name, bam_file_name, alignments_list) + + if params.num_threads <= 1: + work() + else: + parallel_work(pids, work) + + if params.num_threads > 1: + wait_pids(pids) + + + +def generate_html(params): + def blast_output(database, seq): + blast_output_filename = "%s/%s" % (database, seq) + if os.path.exists(blast_output_filename): + file = open(blast_output_filename, "r") + return file.read() + "\n" + + return "" + + def read_fusion_genes(fusion_gene_list): + sample_names = [] + sample_list_filename = output_dir + 'sample_list.txt' + if not os.path.exists(sample_list_filename): + return + + sample_list_file = open(sample_list_filename, "r") + for line in sample_list_file: + sample_names.append(line[:-1]) + sample_list_file.close() + + for sample_name in sample_names: + sample_isoform_filename = "tophat_" + sample_name + "/transfuse.txt" + if not os.path.exists(sample_isoform_filename): + continue + + fusion_gene_begin_index = len(fusion_gene_list) + sample_isoform_file = open(sample_isoform_filename, "r") + for line in sample_isoform_file: + line = line[:-1].lstrip() + fields = line.split('\t') + + def read_coords(value_dic, value_str): + chr1, chr2 = "", "" + if ' ' in value_str: + chr1, chr2 = value_str.split(' ') + else: + chr1 = value_str + + chr1, chr1_coord = chr1.split(':') + chr1_left, chr1_right = chr1_coord.split('-') + chr1_left, chr1_right = int(chr1_left), int(chr1_right) + value_dic["chr1"] = chr1 + value_dic["chr1_left"] = chr1_left + value_dic["chr1_right"] = chr1_right + + if chr2: + chr2, chr2_coord = chr2.split(':') + chr2_left, chr2_right = chr2_coord.split('-') + chr2_left, chr2_right = int(chr2_left), int(chr2_right) + value_dic["chr2"] = chr2 + value_dic["chr2_left"] = chr2_left + value_dic["chr2_right"] = chr2_right + + def read_values(value_dic, value_str): + values = fields[-1].split(';') + + for value in values: + value = value.strip() + if not value: + continue + + name, value = value.split(' ') + + value_dic[name] = value[1:-1] + + type = fields[0] + if type == "gene": + value_dic = {} + read_coords(value_dic, fields[1]) + read_values(value_dic, fields[-1]) + + fusion_gene_list.append([sample_name, value_dic, []]) + + elif type == "transcript": + value_dic = {} + read_coords(value_dic, fields[1]) + read_values(value_dic, fields[-1]) + + # daehwan - for debugging purposes + if float(value_dic["FPKM"]) >= 1.0: + skip_transcript = False + curr_gene = fusion_gene_list[-1] + transcript_list = curr_gene[-1] + transcript_list.append([value_dic, []]) + else: + skip_transcript = True + + elif type in ["exon", "fusion"]: + if skip_transcript: + continue + + curr_gene = fusion_gene_list[-1] + transcript_list = curr_gene[-1] + curr_transcript = transcript_list[-1] + exon_list = curr_transcript[-1] + + if type == "exon": + chr, chr_coord = fields[1].split(':') + chr_left, chr_right = chr_coord.split('-') + chr_left, chr_right = int(chr_left), int(chr_right) + exon_list.append(["exon", chr, chr_left, chr_right]) + else: + fusion_left, fusion_right = fields[1].split(' ') + chr1, chr1_pos = fusion_left.split(':') + chr2, chr2_pos = fusion_right.split(':') + + chr1_pos, chr2_pos = int(chr1_pos), int(chr2_pos) + exon_list.append(["fusion", chr1, chr1_pos, chr2, chr2_pos]) + + transcript_coverage_dic = {} + sample_isoform_cov_filename = "tophat_" + sample_name + "/transfuse_cov.txt" + if os.path.exists(sample_isoform_cov_filename): + sample_isoform_cov_file = open(sample_isoform_cov_filename, "r") + for line in sample_isoform_cov_file: + if line[0] == "C": + transcript_id = line.split("\t")[0] + transcript_coverage_dic[transcript_id] = [] + else: + last_value = line.strip().split()[-1] + base_cov_float = float(last_value) + if math.isnan(base_cov_float): + base_cov_float = 0.1 + base_cov = int(base_cov_float + 0.99) + transcript_coverage_dic[transcript_id].append(base_cov) + + sample_isoform_cov_file.close() + + + for fusion_gene in fusion_gene_list[fusion_gene_begin_index:]: + sample_name, gene_value_dic, transcripts = fusion_gene + max_base_cov = 0 + + saw_fusion_transcript = False + for transcript in transcripts: + transcript_value_dic, exons = transcript + transcript_id = transcript_value_dic["transcript_id"] + + if transcript_id in transcript_coverage_dic: + transcript_coverage = transcript_coverage_dic[transcript_id] + else: + transcript_coverage = None + + saw_fusion = False + exon_offset = 0 + for exon in exons: + if exon[0] == "fusion": + saw_fusion = True + continue + + exon_length = exon[3] - exon[2] + 1 + + if transcript_coverage: + coverage = transcript_coverage[exon_offset:exon_offset + exon_length] + else: + coverage = [0 for z in range(exon_length)] + + exon.append(coverage) + + for base_cov in exon[-1]: + if base_cov > max_base_cov: + max_base_cov = base_cov + + exon_offset += exon_length + + transcript_value_dic["is_fusion_transcript"] = saw_fusion + if saw_fusion: + saw_fusion_transcript = True + + if not saw_fusion and saw_fusion_transcript: + transcript_value_dic["is_after_fusion"] = True + else: + transcript_value_dic["is_after_fusion"] = False + + for transcript in transcripts: + transcript_value_dic, exons = transcript + transcript_value_dic["max_base_cov"] = max_base_cov + + sample_isoform_file.close() + + def find_fusion_gene(fusion_gene_list, sample_name, chr1, chr1_coord, chr2, chr2_coord): + # slow linear search through fusion_gene_list + fusion_gene_info = "" + for fusion_gene in fusion_gene_list: + _sample_name = fusion_gene[0] + value_dic = fusion_gene[1] + _chr1, _chr1_left, _chr1_right, _chr2, _chr2_left, _chr2_right = \ + value_dic["chr1"], value_dic["chr1_left"], value_dic["chr1_right"], \ + value_dic["chr2"], value_dic["chr2_left"], value_dic["chr2_right"] + + if sample_name != _sample_name: + continue + + if chr1 != _chr1 or chr2 != _chr2: + continue + + if chr1_coord < _chr1_left - 2 or chr1_coord > _chr1_right + 2: + continue + + if chr2_coord < _chr2_left -2 or chr2_coord > _chr2_right + 2: + continue + + transcripts = fusion_gene[-1] + for transcript in transcripts: + exons = transcript[-1] + for exon in exons: + if exon[0] != "fusion": + continue + + if chr1_coord == exon[2] and chr2_coord == exon[4]: + return fusion_gene + + return None + + + def read_fusion_list(fusion_list): + re_find_chromosomes = re.compile(r'Homo sapiens chromosome (\d+|[XY])') + re_find_identities = re.compile(r'Identities = (\d+)\/\d+ \((\d+)%\)') + re_find_exon = re.compile(r'exon\d+\((\d+-\d+)\)') + + line_no = 0 + do_not_add = False + output = {} + file_name = output_dir + "potential_fusion.txt" + file = open(file_name, 'r') + for line in file: + if line_no % 6 == 0: + if output and not do_not_add: + fusion_list.append(output) + + do_not_add = False + temp_list = line[:-1].split(' ') + sample_name = temp_list[0] + + chr = temp_list[1] + chr1, chr2 = chr.split('-') + left = int(temp_list[2]) + right = int(temp_list[3]) + dir = temp_list[4] + + output = {} + output["sample_name"] = sample_name + output["chr"] = chr + output["chr1"] = chr1 + output["chr2"] = chr2 + output["left_coord"] = left + output["right_coord"] = right + output["dir"] = dir + output["stats"] = temp_list[5:] + + elif line_no % 6 == 1: + output["left_seq"] = line[:-1].split(" ")[0] + + elif line_no % 6 == 2: + output["right_seq"] = line[:-1].split(" ")[1] + + left_seq = output["left_seq"] + right_seq = output["right_seq"] + seq = left_seq + right_seq + + temp_output = blast_output(output_dir + "blast_genomic", left_seq) + left_chromosomes = set(re_find_chromosomes.findall(temp_output)) + + temp_output = blast_output(output_dir + "blast_genomic", right_seq) + right_chromosomes = set(re_find_chromosomes.findall(temp_output)) + + if string.find(output["sample_name"], "single") == -1: + chr1, chr2 = output["chr"].split("-") + if chr1 != chr2 and left_chromosomes & right_chromosomes: + do_not_add = True + + temp_output = blast_output(output_dir + "blast_genomic", seq) + blast_output(output_dir + "blast_nt", seq) + for identity in re_find_identities.findall(temp_output): + query = int(identity[0]) + percent = int(identity[1]) + if query + percent > 160: + do_not_add = True + break + + elif line_no % 6 == 3: + output["depth"] = line[:-1] + + elif line_no % 6 == 4: + output["gene"] = line[:-1] + temp = output["gene"].split() + output["gene1"], output["gene2"] = temp[0], temp[2] + + else: + pair_str = line.strip() + if len(pair_str) > 0: + pairs = pair_str.split(" ") + else: + pairs = [] + + output["pair_coords"] = pairs + + read_output = "" + read_output_file = output_dir + "read_alignments/%s_%s_%d_%d_%s" % (output["sample_name"], output["chr"], + output["left_coord"], output["right_coord"], output["dir"]) + + if not do_not_add and os.path.exists(read_output_file): + read_output = open(read_output_file, "r").read() + + def unique(list, left, right): + color_len = 300 + result, lcolor, rcolor = [], [0 for i in range(color_len)], [0 for i in range(color_len)] + keys = [] + F_start, F_end = 1000000, 0 + for item in list: + key = item.split(" ") + if len(key) < 6: + continue + + chr1, chr2, pos1, pos2, cigars = key[:5] + pos1, pos2 = int(pos1), int(pos2) + + def color(list, left, right): + for i in range(left, right): + if i >= len(list): + break + + list[i] += 1 + + if 'F' in cigars: + dist1 = abs(pos1 - left) + dist2 = abs(pos2 - right) + + color(lcolor, 0, dist1) + color(rcolor, 0, dist2) + + elif abs(pos2 - left) < color_len: + color(lcolor, abs(pos2 - left), abs(pos1 - left)) + + elif abs(pos1 - right) < color_len: + color(rcolor, abs(pos1 - right), abs(pos2 - right)) + + if "F" in key[4] or len(keys) == 0 or key[:2] != keys[-1][:2] or abs(int(key[2]) - int(keys[-1][2])) >= 3: + keys.append(key[:3]) + result.append(item) + + if "F" in key[4]: + index = len(result) - 1 + if index < F_start: + F_start = index + + if F_end < index: + F_end = index + + if F_end == 0: + F_start, F_end = 0, len(result) + + temp_result = result[max(0, F_start - 50):(F_end+50)] + result = [] + for i in range(len(temp_result)): + item = temp_result[i].split(" ") + if i == 0: + trim = 0 + for ch in item[40:-1]: + trim += 1 + + item = item[:40] + item[40 + trim:] + result.append(" ".join(item)) + + return result, lcolor, rcolor + + if len(read_output) < 1024 * 1024 * 1024: + read_output = read_output.split("\n") + read_output, lcolor, rcolor = unique(read_output, output["left_coord"], output["right_coord"]) + + def stat(lcolor, rcolor, dist): + lcount, lsum, rcount, rsum = 1, 0, 1, 0 + lgap, lpass, rgap, rpass = 0, False, 0, False + for i in range(dist): + if lcolor[i] > 0: + lcount += 1 + lsum += lcolor[i] + + if lgap > 0: + lpass = True + else: + if not lpass: + lgap += 1 + + if rcolor[i] > 0: + rcount += 1 + rsum += rcolor[i] + + if rgap > 0: + rpass = True + else: + if not rpass: + rgap += 1 + + if not lpass: + lgap = 0 + + if not rpass: + rgap = 0 + + return lcount, lsum / lcount, lgap, rcount, rsum / rcount, rgap + + temp = output["gene"].split() + gene1_loc, gene2_loc = temp[1], temp[3] + + def find_exon_len(gene_loc, coord, dir, is_left): + exon_len = 1000000 + for loc in re_find_exon.findall(gene_loc): + start, end = loc.split('-') + start, end = int(start), int(end) + if (is_left and dir == 'f') or (not is_left and dir == 'r'): + exon_len = coord - start + 1 + else: + exon_len = end - coord + 1 + return exon_len + + return exon_len + + lcount_min, rcount_min, diff_max = 150, 150, 120 + + lcount_exon_min = find_exon_len(gene1_loc, output["left_coord"], output["dir"][0], True) + rcount_exon_min = find_exon_len(gene2_loc, output["right_coord"], output["dir"][1], False) + + lcount_min = min(lcount_min, lcount_exon_min - 20) + rcount_min = min(rcount_min, rcount_exon_min - 20) + diff_max = min(diff_max, abs(lcount_min - rcount_min) + 20) + + if lcount_exon_min < 1000 and rcount_exon_min < 1000: + diff_max = max(diff_max, abs(lcount_exon_min - rcount_exon_min) + 20) + + lcount, lavg, lgap, rcount, ravg, rgap = stat(lcolor, rcolor, len(lcolor)) + if lcount <= lcount_min or rcount <= rcount_min or lgap / lcount > 0.1 or rgap / rcount > 0.1: + if abs(min(lcount, lcount_exon_min) - min(rcount, rcount_exon_min)) > diff_max or lcount < 60 or rcount < 60: + do_not_add = True + + def derivation(color, length, avg): + der = 0.0 + for i in range(length): + diff = 1.0 - float(color[i]) / float(max(1, avg)) + der += diff * diff + + return math.sqrt(der / max(1, length)) + + output["read_output"] = read_output + output["lcount"] = lcount + output["lavg"] = lavg + output["lgap"] = lgap + output["lder"] = lder = derivation(lcolor, lcount_min, lavg) + output["rcount"] = rcount + output["ravg"] = ravg + output["rgap"] = rgap + output["rder"] = rder = derivation(rcolor, rcount_min, ravg) + + stats = output["stats"] + num_read = int(stats[0]) + pair = int(stats[1]) + pair_fusion = int(stats[2]) + anti_read = int(stats[3]) + anti_pair = int(stats[4]) + pair_coords = output["pair_coords"] + + dist = 1000000 + if len(pair_coords) > 0: + pair = 0 + for pair_coord in pair_coords: + pair_coord = pair_coord.split(":") + temp_dist = abs(int(pair_coord[0])) + abs(int(pair_coord[1])) + if temp_dist < dist: + dist = temp_dist + + if temp_dist < 2000: + pair += 1 + + + anti_read += 0.5 + if pair == 0: + rate = num_read / anti_read + else: + rate = pair / anti_read + + fusion_gene = find_fusion_gene(fusion_gene_list, \ + output["sample_name"], \ + output["chr1"], output["left_coord"], \ + output["chr2"], output["right_coord"]) + + output["assembled"] = False + fusion_dic = {} + left_normal_transcript, right_normal_transcript = 0, 0 + if fusion_gene: + sample_name, gene_value_dic, transcripts = fusion_gene + for transcript in transcripts: + transcript_value_dic, exons = transcript + if not transcript_value_dic["is_fusion_transcript"]: + if transcript_value_dic["is_after_fusion"]: + right_normal_transcript = 1 + else: + left_normal_transcript = 1 + + continue + + for exon in exons: + if exon[0] == "fusion": + fusion_item = "%d-%d" % (exon[2], exon[4]) + + if output["left_coord"] == exon[2] and output["right_coord"] == exon[4]: + assembled = True + output["assembled"] = True + + if fusion_item not in fusion_dic: + fusion_dic[fusion_item] = 0 + + fusion_dic[fusion_item] += 1 + + + num_fusions = 0 + num_alternative_splicing_with_same_fusion = 0 + + for key, value in fusion_dic.items(): + num_fusions += 1 + num_alternative_splicing_with_same_fusion += (value - 1) + + # lcount and rcount both are in [0, 300] + max_avg = 300 + score = lcount + rcount + (min(max_avg, lavg) + min(max_avg, ravg)) - abs(lcount - rcount) \ + - min(max_avg, abs(lavg - ravg)) - (lgap + rgap) - (lder + rder) * max_avg - min(dist, 1000) \ + + rate + + if output["assembled"]: + score += 300 + min(num_fusions - 1, 5) * 200 + min(num_alternative_splicing_with_same_fusion, 5) * 100 \ + + left_normal_transcript * 50 + right_normal_transcript * 50 + + output["score"] = score + output["test"] = "lcount: %d, lavg: %d, lgap: %d, lder: %f, rcount: %d, ravg: %d, rgap: %d, rder: %f, dist: %d, score: %f num_fusions: %d num_alternative_splicing_with_same_fusion: %d left_normal_transcript: %d right_normal_transcript: %d" % \ + (lcount, lavg, lgap, lder, rcount, ravg, rgap, rder, dist, score, num_fusions, num_alternative_splicing_with_same_fusion, left_normal_transcript, right_normal_transcript) + + else: + read_output = [r"too many - not shown"] + + if not "read_output" in output: + output["read_output"] = "" + output["lcount"] = 0 + output["lavg"] = 0 + output["lgap"] = 0 + output["lder"] = 0 + output["rcount"] = 0 + output["ravg"] = 0 + output["rgap"] = 0 + output["rder"] = 0 + output["score"] = 0 + output["assembled"] = False + + line_no += 1 + + if output and not do_not_add: + fusion_list.append(output) + + file.close() + + + def cluster_fusion(fusion_list, fusion_gene_list, cluster_list): + cluster_dist = 500000 + + cluster_temp_list = [] + parent = [i for i in range(len(fusion_list))] + + for i in range(len(fusion_list)): + fusion = fusion_list[i] + + cluster = {} + cluster["index"] = [i] + cluster["chr"] = fusion["chr"] + cluster["left1"] = cluster["left2"] = fusion["left_coord"] + cluster["right1"] = cluster["right2"] = fusion["right_coord"] + cluster["dir"] = fusion["dir"] + + cluster_temp_list.append(cluster) + + def parent_index(parent, i): + parent_i = parent[i] + if i == parent_i: + return i + else: + parent[i] = parent[parent_i] + return parent_index(parent, parent[i]) + + for i in range(len(fusion_list) - 1): + parent_i = parent_index(parent, i) + for j in range(i+1, len(fusion_list)): + parent_j = parent_index(parent, j) + + if parent_i == parent_j: + continue + + cluster_i = cluster_temp_list[parent_i] + cluster_j = cluster_temp_list[parent_j] + + if cluster_i["chr"] != cluster_j["chr"] or cluster_i["dir"] != cluster_j["dir"]: + continue + + i_left1 = cluster_i["left1"] + i_left2 = cluster_i["left2"] + + j_left1 = cluster_j["left1"] + j_left2 = cluster_j["left2"] + + if abs(i_left1 - j_left1) > cluster_dist or abs(i_left2 - j_left2) > cluster_dist or \ + abs(i_left2 - j_left1) > cluster_dist or abs(i_left1 - j_left2) > cluster_dist: + continue + + i_right1 = cluster_i["right1"] + i_right2 = cluster_i["right2"] + + j_right1 = cluster_j["right1"] + j_right2 = cluster_j["right2"] + + if abs(i_right1 - j_right1) > cluster_dist or abs(i_right2 - j_right2) > cluster_dist or\ + abs(i_right2 - j_right1) > cluster_dist or abs(i_right1 - j_right2) > cluster_dist: + continue + + cluster_i["left1"] = min(i_left1, j_left1) + cluster_i["left2"] = max(i_left2, j_left2) + + cluster_i["right1"] = min(i_right1, j_right1) + cluster_i["right2"] = max(i_right2, j_right2) + + parent[j] = i + cluster_i["index"].extend(cluster_j["index"]) + + + cluster_temp_list2 = [] + for i in range(len(parent)): + if i == parent[i]: + cluster_temp_list2.append(cluster_temp_list[i]) + + def num_samples(indices): + samples = [] + for i in indices: + sample = fusion_list[i]["sample_name"] + if not sample in samples: + samples.append(sample) + + return len(samples) + + def cmp(a, b): + def pair_count(indices): + final_score = -1000000.0 + for index in indices: + score = fusion_list[index]["score"] + if score > final_score: + final_score = score + + return int(final_score) + + a_indices = a["index"] + b_indices = b["index"] + + a_gene, b_gene = 0, 0 + def known_genes(indices): + num = 0 + for index in indices: + temp_num = 0 + fusion = fusion_list[index] + + if fusion["gene1"] != "N/A": + temp_num += 1 + + if fusion["gene2"] != "N/A": + temp_num += 1 + + if temp_num > num: + num = temp_num + + return num + + a_gene = known_genes(a_indices) + b_gene = known_genes(b_indices) + + if a_gene != b_gene: + return b_gene - a_gene + + a_num_sample, b_num_sample = num_samples(a_indices), num_samples(b_indices) + a_score = pair_count(a_indices) + b_score = pair_count(b_indices) + + return b_score - a_score + + cluster_temp_list = sorted(cluster_temp_list2, cmp=cmp) + + for i in range(min(params.max_num_fusions, len(cluster_temp_list))): + do_not_add = False + indices = cluster_temp_list[i]["index"] + if not do_not_add: + def cmp(a, b): + return int(fusion_list[b]["score"] - fusion_list[a]["score"]) + + cluster_temp_list[i]["index"] = sorted(cluster_temp_list[i]["index"], cmp=cmp) + cluster_list.append(cluster_temp_list[i]) + + def generate_html_impl(fusion_list, cluster_list, fusion_gene_list): + html_file_name = output_dir + 'result.html' + txt_file_name = output_dir + 'result.txt' + html_file = open(html_file_name, 'w') + txt_file = open(txt_file_name, 'w') + + html_prev = [] + javascript = [] + html_body = [] + html_post = [] + + def cmp(i, j): + i_left = fusion_list[i]["left_coord"] + j_left = fusion_list[j]["left_coord"] + if i_left == j_left: + return i - j + else: + return i_left - j_left + + indices_list = [] + for c in cluster_list: + indices_list += c["index"] + + indices_list = sorted(indices_list) + print >> sys.stderr, "\tnum of fusions: %d" % len(cluster_list) + + if params.tex_table: + tex_table_file_name = output_dir + 'table.tex' + tex_table_file = open(tex_table_file_name, 'w') + print >> tex_table_file, r'\documentclass{article}' + print >> tex_table_file, r'\usepackage{graphicx}' + print >> tex_table_file, r'\begin{document}' + print >> tex_table_file, r'\pagestyle{empty}' + print >> tex_table_file, r"\center{\scalebox{0.7}{" + print >> tex_table_file, r"\begin{tabular}{| c | c | c | c | c | c | c |}" + print >> tex_table_file, r"\hline" + print >> tex_table_file, r"SAMPLE ID & Fusion genes (left-right) & Chromosomes (left-right) & " + \ + r"5$'$ position & 3$'$ position & Spanning reads & Spanning pairs \\" + + html_prev.append(r'') + html_prev.append(r'') + html_prev.append(r'result') + html_prev.append(r'') + html_prev.append(r'') + html_prev.append(r'') + html_prev.append(r'') + html_prev.append(r'') + html_prev.append(r'') + html_prev.append(r'') + html_body.append(r'') + + html_post.append(r'


Candidate fusion list

') + html_post.append(r'The following tables show fusion candidates where fusions are grouped based on their genomic locations (table description).
') + + num_fusions = 0 + for c in range(len(cluster_list)): + cluster = cluster_list[c] + indices = cluster["index"] + + num_fusions += len(indices) + + top_index = indices[0] + indices = sorted(indices, cmp=cmp) + + chr1, chr2 = cluster["chr"].split('-') + + if params.tex_table: + print >> tex_table_file, r'\hline' + + html_post.append(r'


') + html_post.append(r'%d. %s %s' % (c+1, cluster["chr"], cluster["dir"])) + + html_post.append(r'') + for i in indices: + fusion = fusion_list[i] + sample_name = fusion["sample_name"] + # sample_name = string.split(sample_name, '_')[0] + + output = "" + stats = fusion["stats"] + stats = [int(stat) for stat in stats] + + if stats[1] > 0: + pair_support_html = r"\htmlref{%d}{pair_%d}" % (stats[1], i) + else: + pair_support_html = r"0" + + if params.tex_table: + if sample_name == "lane1": + sample_name2 = "thyroid" + else: + sample_name2 = "testes" + + print >> tex_table_file, r'%s & %s & %s & %d & %d & %d & %d \\' % \ + (sample_name2, + fusion["gene1"] + "-" + fusion["gene2"], + chr1[3:] + "-" + chr2[3:], + fusion["left_coord"], + fusion["right_coord"], + stats[0], + stats[1] + stats[2]) + + print >> txt_file, '%s\t%s\t%s\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%.2f' % \ + (sample_name, + fusion["gene1"], + chr1, + fusion["left_coord"], + fusion["gene2"], + chr2, + fusion["right_coord"], + stats[0], + stats[1], + stats[2], + fusion["score"]) + + html_post.append(r'' % (i, sample_name)) + html_post.append(r'' % fusion["gene1"]) + html_post.append(r'' % chr1) + html_post.append(r'' % fusion["left_coord"]) + html_post.append(r'' % fusion["gene2"]) + html_post.append(r'' % chr2) + html_post.append(r'' % fusion["right_coord"]) + html_post.append(r'' % (i, stats[0])) + html_post.append(r'' % (i, stats[1])) + html_post.append(r'' % stats[2]) + + # daehwan - for debugging purposes + # html_post.append(r'' % fusion["score"]) + # html_post.append(r'' % fusion["test"]) + + fusion_gene = find_fusion_gene(fusion_gene_list, sample_name, chr1, fusion["left_coord"], chr2, fusion["right_coord"]) + fusion["fusion_gene"] = fusion_gene + if fusion_gene and fusion["assembled"]: + html_post.append(r'' % i) + + html_post.append(r'') + + + html_post.append(r'
%s%s%s%d%s%s%d%d%d%d%.2f%sisoforms
') + + """ + # sample + html_post.append(r'


sample list

') + html_post.append(r'') + html_post.append(r'') + html_post.append(r'') + html_post.append(r'') + html_post.append(r'') + + if os.path.exists("sample_info.txt"): + sample_file = open("sample_info.txt", "r") + for line in sample_file: + line = line[:-1].split("\t") + html_post.append(r'' % line[0]) + html_post.append(r'' % line[1]) + html_post.append(r'' % line[2]) + html_post.append(r'' % line[3]) + + sample_file.close() + + html_post.append(r'
sample namefragment lengthread length# of fragments
%s%s%s%s
') + """ + + # description + html_post.append(r'


table description

') + html_post.append(r'1. Sample name in which a fusion is identified
') + html_post.append(r'2. Gene on the "left" side of the fusion
') + html_post.append(r'3. Chromosome ID on the left
') + html_post.append(r'4. Coordinates on the left
') + html_post.append(r'5. Gene on the "right" side
') + html_post.append(r'6. Chromosome ID on the right
') + html_post.append(r'7. Coordinates on the right
') + html_post.append(r'8. Number of spanning reads - this is the number of reads that span a fusion point all on their own. In other words, the read itself has a fusion break point within it.
') + html_post.append(r'9. Number of spanning mate pairs - this is the number of pairs of reads where one read maps entirely on the left and the other read maps entirely on the right of the fusion break point. Neither read is split, so these pairs are not counted at all in (8).
') + html_post.append(r'10. Number of spanning mate pairs where one end spans a fusion (reads spanning fusion with only a few bases are included).
') + html_post.append(r'If you follow the the 9th column, it shows coordinates "number1:number2" where one end is located at a distance of "number1" bases from the left genomic coordinate of a fusion and "number2" is similarly defined.') + html_post.append(r'


') + + fusion_gene_drawn = [False for i in range(len(cluster_list))] + + fusion_gene_indices = [] + for i in indices_list: + cluster_index = -1 + for c in range(len(cluster_list)): + cluster_indices = cluster_list[c]["index"] + if i in cluster_indices: + cluster_index = c + break + + fusion = fusion_list[i] + sample_name = fusion["sample_name"] + chr = fusion["chr"] + left = fusion["left_coord"] + right = fusion["right_coord"] + dir = fusion["dir"] + + chr1, chr2 = chr.split("-") + + left_seq = fusion["left_seq"] + right_seq = fusion["right_seq"] + seq = left_seq + right_seq + + pair_coords = fusion["pair_coords"] + stats = fusion["stats"] + stats = [int(stat) for stat in stats] + + html_post.append(r'' % i) + html_post.append(r'' % (sample_name)) + html_post.append(r'' % fusion["gene1"]) + html_post.append(r'' % chr1) + html_post.append(r'' % fusion["left_coord"]) + html_post.append(r'' % fusion["gene2"]) + html_post.append(r'' % chr2) + html_post.append(r'' % fusion["right_coord"]) + html_post.append(r'' % (i, stats[0])) + html_post.append(r'' % (i, stats[1])) + html_post.append(r'' % stats[2]) + html_post.append(r'') + + dir_str = "" + if dir[0] == "f": + dir_str += "-------->" + else: + dir_str += "<--------" + + dir_str += " " + + if dir[1] == "f": + dir_str += "-------->" + else: + dir_str += "<--------" + + html_post.append(r'' % dir_str) + html_post.append(r'

%s

%s

%s

%d

%s

%s

%d

%d

%d

%d

%s

') + html_post.append(r'

') + + """ + html_post.append(r'
')
+            html_post.append(r'%s %s' % (left_seq, right_seq))
+            html_post.append(r'%s' % fusion["depth"])
+            html_post.append(r'%s' % fusion["gene"])
+            html_post.append(r'
') + """ + + fusion_gene = fusion["fusion_gene"] + if fusion_gene: + fusion_gene_indices.append([i, cluster_index]) + + fg_value_dic, transcripts = fusion_gene[1], fusion_gene[2] + html_post.append(r'

%s-%s

' % (i, fusion["gene1"], fusion["gene2"])) + + if not fusion_gene_drawn[cluster_index]: + javascript.append(r'') + + fusion_gene_drawn[cluster_index] = True + + left_blast_genomic = blast_output(output_dir + "blast_genomic", left_seq) + right_blast_genomic = blast_output(output_dir + "blast_genomic", right_seq) + blast_genomic = blast_output(output_dir + "blast_genomic", seq) + + left_blast_nt = blast_output(output_dir + "blast_nt", left_seq) + right_blast_nt = blast_output(output_dir + "blast_nt", right_seq) + blast_nt = blast_output(output_dir + "blast_nt", seq) + + html_post.append(r'') + html_post.append(r'
Left flanking sequenceRight flanking sequence
%s%s
' % (left_seq, right_seq)) + + html_post.append(r'

blast search - genome

') + html_post.append(r'

left flanking sequence - %s

' % left_seq) + html_post.append(r'
%s
' % left_blast_genomic) + html_post.append(r'

right flanking sequence - %s

' % right_seq) + html_post.append(r'
%s
' % right_blast_genomic) + + html_post.append(r'

blast search - nt

') + html_post.append(r'

left flanking sequence - %s

' % left_seq) + html_post.append(r'
%s
' % left_blast_nt) + html_post.append(r'

right flanking sequence - %s

' % right_seq) + html_post.append(r'
%s
' % right_blast_nt) + + html_post.append(r'


reads

' % i) + read_output = fusion["read_output"] + html_post.append(r'
%s
' % '\n'.join(read_output)) + + html_post.append(r'


%d pairs

' % (i, len(pair_coords))) + html_post.append(r'
%s
' % '\n'.join(pair_coords)) + html_post.append(r'


') + + html_post.append(r'') + html_post.append(r'') + + fusion_gene_draw_str = "" + for index in fusion_gene_indices: + isoform_index, cluster_index = index + fusion_gene_draw_str += "fusion_gene_draw%d(%d);" % (cluster_index, isoform_index) + + html_body.append(r'' % fusion_gene_draw_str) + + for line in html_prev + javascript + html_body + html_post: + print >> html_file, line + + html_file.close() + txt_file.close() + + if params.tex_table: + print >> tex_table_file, r"\hline" + print >> tex_table_file, r"\end{tabular}}}" + print >> tex_table_file, r'\end{document}' + tex_table_file.close() + os.system("pdflatex --output-directory=%s %s" % (output_dir, tex_table_file_name)) + + print >> sys.stderr, "[%s] Reporting final fusion candidates in html format" % right_now() + + # Cufflinks-Fusion + fusion_gene_list = [] + read_fusion_genes(fusion_gene_list) + + fusion_list = [] + read_fusion_list(fusion_list) + + cluster_list = [] + cluster_fusion(fusion_list, fusion_gene_list, cluster_list) + + generate_html_impl(fusion_list, cluster_list, fusion_gene_list) + + + +# Format a DateTime as a pretty string. +# FIXME: Currently doesn't support days! +def formatTD(td): + hours = td.seconds // 3600 + minutes = (td.seconds % 3600) // 60 + seconds = td.seconds % 60 + return '%02d:%02d:%02d' % (hours, minutes, seconds) + + +# Generate a new temporary filename in the user's tmp directory +def tmp_name(): + tmp_root = tmp_dir + if os.path.exists(tmp_root): + pass + else: + os.mkdir(tmp_root) + return tmp_root + os.tmpnam().split('/')[-1] + + +def die(msg=None): + if msg is not None: + print >> sys.stderr, msg + sys.exit(1) + + +# rough equivalent of the 'which' command to find external programs +# (current script path is tested first, then PATH envvar) +def which(program): + def is_executable(fpath): + return os.path.exists(fpath) and os.access(fpath, os.X_OK) + fpath, fname = os.path.split(program) + if fpath: + if is_executable(program): + return program + else: + progpath = os.path.join(bin_dir, program) + if is_executable(progpath): + return progpath + for path in os.environ["PATH"].split(os.pathsep): + progpath = os.path.join(path, program) + if is_executable(progpath): + return progpath + return None + + +def prog_path(program): + progpath = which(program) + if progpath == None: + die("Error locating program: "+program) + return progpath + + +def get_version(): + return "2.1.0-cgpPatched" + + +def main(argv=None): + warnings.filterwarnings("ignore", "tmpnam is a potential security risk") + + # Initialize default parameter values + params = TopHatFusionParams() + try: + if argv is None: + argv = sys.argv + args = params.parse_options(argv) + params.check() + + bwt_idx_prefix = args[0] + + print >> sys.stderr, "[%s] Beginning TopHat-Fusion post-processing run (v%s)" % (right_now(), get_version()) + print >> sys.stderr, "-----------------------------------------------" + + start_time = datetime.now() + prepare_output_dir() + sample_updated = check_samples() + + if not params.skip_fusion_kmer: + map_fusion_kmer(bwt_idx_prefix, params, sample_updated) + + if not params.skip_filter_fusion: + filter_fusion(bwt_idx_prefix, params) + + if not params.skip_blast: + do_blast(params) + + if not params.skip_read_dist: + read_dist(params) + + if not params.skip_html: + generate_html(params) + + global run_log + run_log = open(logging_dir + "run.log", "w", 0) + global run_cmd + run_cmd = " ".join(argv) + print >> run_log, run_cmd + + finish_time = datetime.now() + duration = finish_time - start_time + print >> sys.stderr,"-----------------------------------------------" + print >> sys.stderr, "[%s] Run complete [%s elapsed]" % (right_now(), formatTD(duration)) + + except Usage, err: + print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) + print >> sys.stderr, "\tfor detailed help see http://tophat-fusion.sourceforge.net/manual.html" + return 2 + + +if __name__ == "__main__": + sys.exit(main()) From f848d5132666d983d29940232174d253dc07bab1 Mon Sep 17 00:00:00 2001 From: Yaobo Xu Date: Fri, 11 Oct 2019 11:03:51 +0000 Subject: [PATCH 2/5] used rsplit instead of a function in tophat-fusion-post.patch, should be faster than before --- build/patches/tophat-fusion-post.patch | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/build/patches/tophat-fusion-post.patch b/build/patches/tophat-fusion-post.patch index d2a74b2..8dc3a2f 100644 --- a/build/patches/tophat-fusion-post.patch +++ b/build/patches/tophat-fusion-post.patch @@ -787,13 +787,8 @@ def filter_fusion(bwt_idx_prefix, params): max_intron_len = min(max_intron_len, abs(coord1 - coord2) * 9 / 10) same = False - - def split_chr_coord(chr_coord): - coord = chr_coord.split(':')[-1] - return chr_coord.rstrip(':' + coord), coord - for chr_coord in left_chrs: - chr, coord = split_chr_coord(chr_coord) + chr, coord = chr_coord.rsplit(':', 1) coord = int(coord) if chr == chr2 and abs(coord - coord2) < max_intron_len: same = True @@ -803,7 +798,7 @@ def filter_fusion(bwt_idx_prefix, params): continue for chr_coord in right_chrs: - chr, coord = split_chr_coord(chr_coord) + chr, coord = chr_coord.rsplit(':', 1) coord = int(coord) if chr == chr1 and abs(coord - coord1) < max_intron_len: same = True From cda6afd5b224debf61257a369694e587d710b4bf Mon Sep 17 00:00:00 2001 From: Yaobo Xu Date: Fri, 11 Oct 2019 11:11:46 +0000 Subject: [PATCH 3/5] bumped version, fixes #37 --- CHANGES.md | 4 ++++ Dockerfile | 2 +- cwls/tools/merge_and_mark_dups.cwl | 2 +- cwls/tools/run-cgprna_bigwig.cwl | 2 +- cwls/tools/run-cgprna_htseq-count.cwl | 2 +- cwls/tools/run-cgprna_star-map.cwl | 2 +- cwls/tools/run-cgprna_stats.cwl | 2 +- perl/lib/Sanger/CGP/CgpRna.pm | 2 +- 8 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index bde09f8..a01eb20 100755 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # CHANGES +## 2.4.1 + +* added a patched topaht-fusion-post script, in which the contig name bug #37 is fixed + ## 2.4.0 * Revised Dockerfile so that the builder stage is properly used and the image size is reduced. Resolved #28. diff --git a/Dockerfile b/Dockerfile index 5e99827..c940bbb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,7 +90,7 @@ FROM ubuntu:16.04 LABEL maintainer="cgphelp@sanger.ac.uk" \ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \ - version="2.4.0" \ + version="2.4.1" \ description="cgpRna docker" # Version of tools that are installed in both stages, make sure they are consistent. diff --git a/cwls/tools/merge_and_mark_dups.cwl b/cwls/tools/merge_and_mark_dups.cwl index 666fb62..7abdd90 100644 --- a/cwls/tools/merge_and_mark_dups.cwl +++ b/cwls/tools/merge_and_mark_dups.cwl @@ -20,7 +20,7 @@ doc: | requirements: - class: DockerRequirement - dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + dockerPull: "quay.io/wtsicgp/cgprna:2.4.1" hints: - class: ResourceRequirement diff --git a/cwls/tools/run-cgprna_bigwig.cwl b/cwls/tools/run-cgprna_bigwig.cwl index cb536bf..bd88f1f 100644 --- a/cwls/tools/run-cgprna_bigwig.cwl +++ b/cwls/tools/run-cgprna_bigwig.cwl @@ -20,7 +20,7 @@ doc: | requirements: - class: DockerRequirement - dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + dockerPull: "quay.io/wtsicgp/cgprna:2.4.1" hints: - class: ResourceRequirement diff --git a/cwls/tools/run-cgprna_htseq-count.cwl b/cwls/tools/run-cgprna_htseq-count.cwl index 43eb7de..3addd9b 100644 --- a/cwls/tools/run-cgprna_htseq-count.cwl +++ b/cwls/tools/run-cgprna_htseq-count.cwl @@ -20,7 +20,7 @@ doc: | requirements: - class: DockerRequirement - dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + dockerPull: "quay.io/wtsicgp/cgprna:2.4.1" hints: - class: ResourceRequirement diff --git a/cwls/tools/run-cgprna_star-map.cwl b/cwls/tools/run-cgprna_star-map.cwl index 04c3da9..26f56cb 100644 --- a/cwls/tools/run-cgprna_star-map.cwl +++ b/cwls/tools/run-cgprna_star-map.cwl @@ -20,7 +20,7 @@ doc: | requirements: - class: DockerRequirement - dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + dockerPull: "quay.io/wtsicgp/cgprna:2.4.1" - class: InlineJavascriptRequirement hints: diff --git a/cwls/tools/run-cgprna_stats.cwl b/cwls/tools/run-cgprna_stats.cwl index 0b88681..a9d2a06 100644 --- a/cwls/tools/run-cgprna_stats.cwl +++ b/cwls/tools/run-cgprna_stats.cwl @@ -20,7 +20,7 @@ doc: | requirements: - class: DockerRequirement - dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + dockerPull: "quay.io/wtsicgp/cgprna:2.4.1" hints: - class: ResourceRequirement diff --git a/perl/lib/Sanger/CGP/CgpRna.pm b/perl/lib/Sanger/CGP/CgpRna.pm index 0301cf3..2218fda 100755 --- a/perl/lib/Sanger/CGP/CgpRna.pm +++ b/perl/lib/Sanger/CGP/CgpRna.pm @@ -36,7 +36,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '2.4.0'; +our $VERSION = '2.4.1'; our @EXPORT = qw($VERSION); 1; From 6c4cb379e01fb8a3f490ead6702d952b8059b580 Mon Sep 17 00:00:00 2001 From: Yaobo Xu Date: Mon, 14 Oct 2019 13:40:10 +0000 Subject: [PATCH 4/5] fixed another few colon split --- build/patches/tophat-fusion-post.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/build/patches/tophat-fusion-post.patch b/build/patches/tophat-fusion-post.patch index 8dc3a2f..6b1f11c 100644 --- a/build/patches/tophat-fusion-post.patch +++ b/build/patches/tophat-fusion-post.patch @@ -1225,7 +1225,7 @@ def read_dist(params): if line[0] == '@': if line[1:3] == 'SQ': line = line[:-1].split('\t') - contig = line[1].split(':')[1] + contig = line[1].split(':', 1)[1] if contig not in contigs: contigs[contig] = len(contigs) @@ -1533,7 +1533,7 @@ def generate_html(params): else: chr1 = value_str - chr1, chr1_coord = chr1.split(':') + chr1, chr1_coord = chr1.rsplit(':', 1) chr1_left, chr1_right = chr1_coord.split('-') chr1_left, chr1_right = int(chr1_left), int(chr1_right) value_dic["chr1"] = chr1 @@ -1541,7 +1541,7 @@ def generate_html(params): value_dic["chr1_right"] = chr1_right if chr2: - chr2, chr2_coord = chr2.split(':') + chr2, chr2_coord = chr2.rsplit(':', 1) chr2_left, chr2_right = chr2_coord.split('-') chr2_left, chr2_right = int(chr2_left), int(chr2_right) value_dic["chr2"] = chr2 @@ -1592,14 +1592,14 @@ def generate_html(params): exon_list = curr_transcript[-1] if type == "exon": - chr, chr_coord = fields[1].split(':') + chr, chr_coord = fields[1].rsplit(':', 1) chr_left, chr_right = chr_coord.split('-') chr_left, chr_right = int(chr_left), int(chr_right) exon_list.append(["exon", chr, chr_left, chr_right]) else: fusion_left, fusion_right = fields[1].split(' ') - chr1, chr1_pos = fusion_left.split(':') - chr2, chr2_pos = fusion_right.split(':') + chr1, chr1_pos = fusion_left.rsplit(':', 1) + chr2, chr2_pos = fusion_right.rsplit(':', 1) chr1_pos, chr2_pos = int(chr1_pos), int(chr2_pos) exon_list.append(["fusion", chr1, chr1_pos, chr2, chr2_pos]) From 8adce9caa5256e23ae158dae1adce14a143e2353 Mon Sep 17 00:00:00 2001 From: Yaobo Xu Date: Thu, 17 Oct 2019 16:15:25 +0000 Subject: [PATCH 5/5] patching tophat-fusion-post properly --- Dockerfile | 1 + build/opt-build.sh | 2 +- build/patches/tophat-fusion-post.patch | 2998 +----------------------- 3 files changed, 76 insertions(+), 2925 deletions(-) diff --git a/Dockerfile b/Dockerfile index c940bbb..d1bfd16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,7 @@ pkg-config \ zlib1g-dev \ zip \ unzip \ +patch \ libpng-dev \ python3 python3-dev python3-pip python3-setuptools \ r-base r-base-dev \ diff --git a/build/opt-build.sh b/build/opt-build.sh index fae35a6..66e205d 100755 --- a/build/opt-build.sh +++ b/build/opt-build.sh @@ -152,7 +152,7 @@ if [ ! -e $SETUP_DIR/tophat.success ]; then rm -f distro/AUTHORS distro/LICENSE distro/README cp -r distro/* $INST_PATH/bin/. # patch tophat-fusion-post - cp $SCRIPT_PATH/patches/tophat-fusion-post.patch $INST_PATH/bin/tophat-fusion-post + patch $INST_PATH/bin/tophat-fusion-post $SCRIPT_PATH/patches/tophat-fusion-post.patch chmod +x $INST_PATH/bin/tophat-fusion-post touch $SETUP_DIR/tophat.success fi diff --git a/build/patches/tophat-fusion-post.patch b/build/patches/tophat-fusion-post.patch index 6b1f11c..b92b522 100644 --- a/build/patches/tophat-fusion-post.patch +++ b/build/patches/tophat-fusion-post.patch @@ -1,2924 +1,74 @@ -#!/usr/bin/env python - - -""" -tophat-fusion-post - -Created by Daehwan Kim on 2011-05-05. -Copyright (c) 2011 Daehwan Kim. All rights reserved. -""" - -import sys, getopt, warnings -import os, subprocess, errno -import copy -import string, re -import random -from datetime import datetime, date, time -import math -from collections import defaultdict -from intervaltree import Interval, IntervalTree - -use_message = ''' -TopHat-Fusion - -Usage: - tophat-fusion-post [options] - -Options: - -v/--version - -o/--output-dir [ default: ./tophatfusion_out ] - --num-fusion-reads [ default: 3 ] - --num-fusion-pairs [ default: 2 ] - --num-fusion-both [ default: 5 ] - --max-num-fusions [ default: 500 ] - --fusion-read-mismatches [ default: 2 ] - --fusion-multireads [ default: 2 ] - --non-human - - -p/--num-threads [ default: 1 ] - - --no-filter-by-annotation - - --skip-fusion-kmer - --skip-filter-fusion - --skip-blast - --skip-read-dist - --skip-html - - --tex-table - - --fusion-pair-dist [ default: 250 ] -''' - - -class Usage(Exception): - def __init__(self, msg): - self.msg = msg - - -output_dir = "./tophatfusion_out/" -logging_dir = output_dir + "logs/" -tmp_dir = output_dir + "tmp/" - -class TopHatFusionParams: - def __init__(self, - keep_tmp = ""): - self.keep_tmp = keep_tmp - - self.num_fusion_reads = 3 - self.num_fusion_pairs = 2 - self.num_fusion_both = 0 - self.max_num_fusions = 500 - - self.fusion_read_mismatches = 2 - self.fusion_multireads = 2 - - self.is_human = True - - self.num_threads = 1 - - self.filter_by_annotation = True - - self.skip_fusion_kmer = False - self.skip_filter_fusion = False - self.skip_blast = False - self.skip_read_dist = False - self.skip_html = False - - self.tex_table = False - - self.fusion_pair_dist = 250 - - def check(self): - if False: - die("Error: arg to --num-threads must be greater than 0") - - def parse_options(self, argv): - try: - opts, args = getopt.getopt(argv[1:], "hp:o:", - ["version", - "help", - "output-dir=", - "num-fusion-reads=", - "num-fusion-pairs=", - "num-fusion-both=", - "max-num-fusions=", - "fusion-read-mismatches=", - "fusion-multireads=", - "non-human", - "num-threads=", - "no-filter-by-annotation", - "skip-fusion-kmer", - "skip-filter-fusion", - "skip-blast", - "skip-read-dist", - "skip-html", - "tex-table", - "gtf-file=", - "fusion-pair-dist="]) - except getopt.error, msg: - raise Usage(msg) - - for option, value in opts: - if option in ("-v", "--version"): - print "TopHat v%s" % (get_version()) - sys.exit(0) - if option in ("-h", "--help"): - raise Usage(use_message) - if option == "--num-fusion-reads": - self.num_fusion_reads = int(value) - if option == "--num-fusion-pairs": - self.num_fusion_pairs = int(value) - if option == "--num-fusion-both": - self.num_fusion_both = int(value) - if option == "--max-num-fusions": - self.max_num_fusions = int(value) - if option == "--fusion-read-mismatches": - self.fusion_read_mismatches = int(value) - if option == "--fusion-multireads": - self.fusion_multireads = int(value) - if option == "--non-human": - self.is_human = False - if option in ("-p", "--num-threads"): - self.num_threads = int(value) - if option in ("-o", "--output-dir"): - global output_dir, logging_dir, tmp_dir - output_dir = value + "/" - logging_dir = output_dir + "logs/" - tmp_dir = output_dir + "tmp/" - if option == "--no-filter-by-annotation": - self.filter_by_annotation = False - if option == "--skip-fusion-kmer": - self.skip_fusion_kmer = True - if option == "--skip-filter-fusion": - self.skip_filter_fusion = True - if option == "--skip-blast": - self.skip_blast = True - if option == "--skip-read-dist": - self.skip_read_dist = True - if option == "--skip-html": - self.skip_html = True - if option == "--tex-table": - self.tex_table = True - if option == "--gtf-file": - self.GTF_file = value - if option == "--fusion-pair-dist": - self.fusion_pair_dist = int(value) - - if len(args) < 1: - raise Usage(use_message) - - return args - - -# Returns the current time in a nice format -def right_now(): - curr_time = datetime.now() - return curr_time.strftime("%c") - - -def reverse_complement(seq): - result = "" - for nt in seq: - base = nt - if nt == 'A': - base = 'T' - elif nt == 'a': - base = 't' - elif nt == 'C': - base = 'G' - elif nt == 'c': - base = 'g' - elif nt == 'G': - base = 'C' - elif nt == 'g': - base = 'c' - elif nt == 'T': - base = 'A' - elif nt == 't': - base = 'a' - - result = base + result - - return result - - -def get_chromosome_order(bowtie_index, chrs): - bowtie_header_cmd = ['bowtie', - '--sam', - bowtie_index, - '/dev/null'] - - bowtie_header_filename = tmp_dir + "sam_header" - subprocess.call(bowtie_header_cmd, - stdout=open(bowtie_header_filename, "w"), - stderr=open("/dev/null")) - - bowtie_header_file = open(bowtie_header_filename) - for line in bowtie_header_file: - index = string.find(line, "SN:") - if index != -1: - SN = line[index+3:].split() - chrs.append(SN[0]) - - bowtie_header_file.close() - - -# Ensures that the output, logging, and temp directories are present. If not, -# they are created -def prepare_output_dir(): - if os.path.exists(output_dir): - pass - else: - os.mkdir(output_dir) - - if os.path.exists(logging_dir): - pass - else: - os.mkdir(logging_dir) - - if os.path.exists(tmp_dir): - pass - else: - try: - os.makedirs(tmp_dir) - except OSError, o: - die("\nError creating directory %s (%s)" % (tmp_dir, o)) - - -def check_samples(): - sample_list_filename = output_dir + 'sample_list.txt' - - prev_list, curr_list = [], [] - if os.path.exists(sample_list_filename): - sample_list = open(sample_list_filename, 'r') - for line in sample_list: - prev_list.append(line[:-1]) - sample_list.close() - - for dir in sorted(os.listdir('.')): - if string.find(dir, "tophat_") != 0: - continue - - fusion = dir + "/fusions.out" - if not os.path.exists(fusion): - continue - - curr_list.append(dir[7:]) - - if prev_list != curr_list: - sample_list = open(sample_list_filename, 'w') - for sample in curr_list: - print >> sample_list, sample - sample_list.close() - return True - else: - return False - - -def map_fusion_kmer(bwt_idx_prefix, params, sample_update = False): - def get_fusion_seq(): - seq_dic = {} - for dir in sorted(os.listdir('.')): - if string.find(dir, "tophat_") != 0: - continue - - fusion = dir + "/fusions.out" - if not os.path.exists(fusion): - continue - - fusion_file = open(fusion, 'r') - fusion_file.readline() - for line in fusion_file: - left_seq, right_seq = line[:-1].split('\t@\t')[2:4] - left_seq = left_seq.split(' ')[0] - right_seq = right_seq.split(' ')[1] - - if len(left_seq) < 23 or len(right_seq) < 23: - continue - - seq_dic[left_seq[-23:]] = 1 - seq_dic[right_seq[:23]] = 1 - - fusion_file.close() - - fusion_seq_fa = open(output_dir + "fusion_seq.fa", 'w') - for seq in seq_dic.keys(): - print >> fusion_seq_fa, ">%s" % seq - print >> fusion_seq_fa, seq - - fusion_seq_fa.close() - - def convert_bowtie(): - bwt_dic = {} - bwtout_file = open(output_dir + "fusion_seq.bwtout", 'r') - for line in bwtout_file: - seq, temp, chr, coord = line[:-1].split('\t')[0:4] - if seq in bwt_dic: - bwt_dic[seq].append(chr + ":" + coord) - else: - bwt_dic[seq] = [chr + ":" + coord] - - bwtout_file.close() - - kmer_map = open(fusion_kmer_file_name, 'w') - for seq, chrs in bwt_dic.items(): - print >> kmer_map, "%s\t%s" % (seq, ','.join(chrs)) - - kmer_map.close() - - print >> sys.stderr, "[%s] Extracting 23-mer around fusions and mapping them using Bowtie" % right_now() - if sample_update: - print >> sys.stderr, "\tsamples updated" - - fusion_kmer_file_name = output_dir + "fusion_seq.map" - - if not os.path.exists(fusion_kmer_file_name) or \ - os.stat(fusion_kmer_file_name).st_size <= 0 or \ - sample_update: - get_fusion_seq() - cmd = ['bowtie', '-p', '8', '-a', '-n', '3', '-m', '100', bwt_idx_prefix, '-f', '%sfusion_seq.fa' % output_dir] - subprocess.call(cmd, stdout=open(output_dir + 'fusion_seq.bwtout', 'w'), stderr=open('/dev/null', 'w')) - convert_bowtie() - - -def filter_fusion(bwt_idx_prefix, params): - chrs = [] - get_chromosome_order(bwt_idx_prefix, chrs) - - chr_order = {} - for chr in chrs: - chr_order[chr] = len(chr_order) - - ### Compute number of paired reads ### - # Helper classes - class GeneEntry(object): - fields = [('bin', int), - ('name', str), - ('chrom', str), - ('strand', str), - ('txStart', int), - ('txEnd', int), - ('cdsStart', int), - ('cdsEnd', int), - ('exonCount', int), - ('exonStarts', lambda x: [int(y) for y in list(x.split(',')[:-1])]), - ('exonEnds', lambda x: [int(y) for y in list(x.split(',')[:-1])]), - ('score', int), - ('name2', str), - ('cdsStartStat', str), - ('cdsEndStat', str), - ('exonFrames', lambda x: [int(y) for y in list(x.split(',')[:-1])])] - - '''Representation of one line in the refGene.txt file.''' - def __init__(self, entry_string): - tokens = entry_string.split('\t') # Assumes tab delimited. - for (field, type_), token in zip(self.fields, tokens): - setattr(self, field, type_(token)) - - def __str__(self): - return str(self.__dict__) - - def __repr__(self): - return str(self) - - class Fusion(object): - def __init__(self, info, pairs): - # args should be info[:4] from the fusions.out entry - # Parse the chromosome string - self.chrL, self.chrR = info[0].rstrip().split('-') - - # Fusion positions - self.posL = int(info[1]) - self.posR = int(info[2]) - - # Strand - self.strandL = info[3][0] - self.strandR = info[3][1] - - # Pairs - pairs = pairs.rstrip() - if pairs: - self.pairs = [pair.split(':') for pair in pairs.split(' ')] - self.pairs = [(int(p1), int(p2)) for (p1, p2) in self.pairs] - else: - self.pairs = [] - - def get_sign(self, which): - switch = which == 'R' - strand = self.strandL if which == 'L' else self.strandR - if strand == 'r': - return -1 if not switch else 1 - else: - return 1 if not switch else -1 - - class TransMaps(object): - def __init__(self, fusion, juncs): - self.fusion = fusion - self.chroms = (fusion.chrL, fusion.chrR) - self.maps = {} - self.starts = {} - self.juncs = juncs - - def add_map(self, chrom, start, stop, strand, fusion_pos): - self.maps[chrom, strand] = self.compute_transcript_map(chrom, start, stop, strand, fusion_pos) - self.starts[chrom, strand] = start - - def map(self, chrom, pos, strand): - pos2 = pos - self.starts[chrom,strand] - the_map = self.maps[chrom,strand] - if pos2 < 0 or pos2 >= len(the_map): - # out of bounds - return distance to boundary - if pos2 < 0: - return the_map[0] - pos2 - else: - return the_map[-1] + (pos2 - len(the_map) + 1) - else: - # position in the map - return the_map[pos2] - - def compute_transcript_map(self, chrom, start, stop, strand, fusion_pos): - #return list(range(0, stop-start+1)) - - chrom = self.chroms[chrom] - w = stop - start + 1 - - # Find junctions within the interval - # strict=True means that only intervals entirely contained are returned - # The junction dict contains both {end: start} and {start: end} - junctions = defaultdict(set) - for junc in self.juncs[chrom].search(start, stop, strict=True): - if junc[2][0] == strand: - junctions[junc[1]-start].add(junc[0]-start) - junctions[junc[0]-start].add(junc[1]-start) - - # Initialize distance vector - distance = [abs(ii-fusion_pos) for ii in range(start, stop+1)] - - # Working out from fusion position, follow junctions - # First, get positions in the right order - fusii = fusion_pos - start - N = len(distance) - positions = [fusii] - up = 1 - down = 1 - while len(positions) < len(distance): - if fusii - down >= 0: - positions.append(fusii-down) - down += 1 - if fusii + up < N: - positions.append(fusii+up) - up += 1 - - # Second, compute the transcript distance at each position - # Skip the first position, which is the fusion break (distance=0) - for ii in positions[1:]: - ii_ = ii + (1 if fusii-ii >= 0 else -1) - if ii in junctions: - # If ends of junction are further away, ignore them. - # If ends of junction are closer to fusion, shorten distance - distance[ii] = min( [distance[ii_]] + [distance[jj] for jj in junctions[ii] - if abs(jj-fusii) < abs(ii-fusii)] ) + 1 - else: - distance[ii] = distance[ii_] + 1 - - # Re-sign the distances - # Users downstream will need to work out fusion arm orientation - distance[:fusii] *= -1 - return distance - - # Helper functions - def load_junctions(refgene_file, ensgene_file, juncs_file): - def _load(gene_file, _junctions): - for line in open(gene_file): - entry = GeneEntry(line) - # Iterate over introns - for start, stop in zip(entry.exonEnds[:-1], entry.exonStarts[1:]): - _junctions[entry.chrom].addi(start, stop+1, (entry.strand, 'intron')) - - # junctions are Interval(start, stop, (strand, ...)) - _junctions = defaultdict(IntervalTree) - - # Load exons from refGene.txt and ensGene.txt - if refgene_file is not None: - _load(refgene_file, _junctions) - - if ensgene_file is not None: - _load(ensgene_file, _junctions) - - # Load junctions from junctions.bed - if juncs_file is not None: - for line in open(juncs_file): - if line[:5] == 'track': - continue - chrom, start, stop, _, _, strand, _, _, _, _, overhangs, _ = line.split('\t') - a,b = overhangs.split(',') - _junctions[chrom].addi( int(start)+int(a), int(stop)-int(b)+2, (strand, 'intron') ) - - return _junctions - - def get_transcript_maps(fusion, junctions): - # Find left side range - p1s = [p1 for p1, _ in fusion.pairs] - dL = max([0, max(p1s)]) - dL2 = min([0, min(p1s)]) - - s = fusion.get_sign('L') - startL = fusion.posL - s*dL - endL = fusion.posL - s*dL2 - if startL > endL: - startL, endL = endL, startL - - # Find right side range - p2s = [p2 for _, p2 in fusion.pairs] - dR = max([0, max(p2s)]) - dR2 = min([0, min(p2s)]) - - s = fusion.get_sign('R') - startR = fusion.posR - s*dR - endR = fusion.posR - s*dR2 - if startR > endR: - startR, endR = endR, startR - - # Make maps - tmaps = TransMaps(fusion, junctions) - tmaps.add_map(0, startL, endL, '-', fusion.posL) - tmaps.add_map(0, startL, endL, '+', fusion.posL) - tmaps.add_map(1, startR, endR, '-', fusion.posR) - tmaps.add_map(1, startR, endR, '+', fusion.posR) - - for m in tmaps.maps.values(): - if len(m)==0: - raise Exception() - - return tmaps - - def pick_short(a, b): - return a if abs(a) < abs(b) else b - - def convert_pair_distances(fusion, junctions): - # Define transcript maps - # These map genomic coordinates to transcript coordinates. - tmaps = get_transcript_maps(fusion, junctions) - - # Compute inner distances - # Because we don't know which strand, we'll take the min of the - # transcript distances on each strand. - pairs = [] - for p1, p2 in fusion.pairs: - s1 = fusion.get_sign('L') - p1 = p1 * s1 - d1 = s1* pick_short( tmaps.map(0, fusion.posL, '-') - tmaps.map(0, fusion.posL - p1, '-'), - tmaps.map(0, fusion.posL, '+') - tmaps.map(0, fusion.posL - p1, '+') ) - - s2 = fusion.get_sign('R') - p2 = p2 * s2 - d2 = s2* pick_short( tmaps.map(1, fusion.posR, '-') - tmaps.map(1, fusion.posR - p2, '-'), - tmaps.map(1, fusion.posR, '+') - tmaps.map(1, fusion.posR - p2, '+') ) - - pairs.append((d1,d2)) - - return pairs - - def get_valid_pairs(info, pairs, junctions): - # This method converts the distances from spanning pairs to the fusion - # from genomic to transcript coordinates - i.e. how far are the pairs - # in the processed transcript. - # Parse fusion - fusion = Fusion(info, pairs) - - if len(fusion.pairs) == 0: - return [] - - # Convert pairs to transcript coordinates - tpairs = convert_pair_distances(fusion, junctions) - - # Return pairs with inner distance less than the threshold - - pairs = [(a,b) for (a,b) in tpairs if abs(a)+abs(b) <= params.fusion_pair_dist] - return pairs - - ## Filter fusion implementation ## - def filter_fusion_impl(fusion, junctions, refGene_list, ensGene_list, seq_chr_dic, fusion_gene_list): - def gene_exists(gene_list, chr, coord, dir, is_left): - min = 0 - max = len(gene_list) - 1 - while max - min >= 0: - mid = (min + max) / 2 - gene = gene_list[mid] - ref_chr = gene[1] - - if chr != ref_chr: - if chr_order[chr] < chr_order[ref_chr]: - max = mid - 1 - else: - min = mid + 1 - continue - - left_coord = gene[2] - right_coord = gene[3] - - if coord >= left_coord and coord <= right_coord: - left_coords = gene[5] - right_coords = gene[6] - sense = gene[7] - - belong = False - where = "outside" - for i in range(len(left_coords)): - # gives some relax! - relax = 3 - - left = int(left_coords[i]) - 1 - right = int(right_coords[i]) - 1 - if coord <= right + relax: - if coord < left - relax: - where = "intron%d(%d-%d)" % (i, int(right_coords[i-1]), left - 1) - else: - if ((is_left and dir == "f") or (not is_left and dir == "r")) and abs(coord - right) <= relax: - belong = True - - if ((is_left and dir == "r") or (not is_left and dir == "f")) and abs(coord - left) <= relax: - belong = True - - where = "exon%d(%d-%d)" % (i + 1, left, right) - break - - return [gene[0], gene[4], where, belong, sense] - elif coord < left_coord: - max = mid - 1 - else: - min = mid + 1 - - return ["N/A", "N/A", "N/A", False, "N/A"] - - def how_diff(first, second): - seq_len = len(first) - - min_value = 10000 - prev, curr = [0 for i in range(seq_len)], [0 for i in range(seq_len)] - for j in range(seq_len): - for i in range(seq_len): - value = 10000 - if first[i] == second[j]: - match = 0 - else: - match = 1 - - # right - if i == 0: - value = j * 2 + match - elif j > 0: - value = prev[i] + 2 - - temp_value = 10000 - - # down - if j == 0: - temp_value = i * 2 + match - elif i > 0: - temp_value = curr[i-1] + 2 - - if temp_value < value: - value = temp_value - - # match - if i > 0 and j > 0: - temp_value = prev[i-1] + match - - if temp_value < value: - value = temp_value - - curr[i] = value - - if (i == seq_len - 1 or j == seq_len - 1) and value < min_value: - min_value = value - - prev, curr = curr, prev - - return min_value - - kmer_len = len(seq_chr_dic.keys()[0]) - sample_name = fusion.split("/")[0][len("tophat_"):] - - data = os.getcwd().split('/')[-1] - - fusion_file = open(fusion, 'r') - fusion_file.readline() - for line in fusion_file: - # Parse the fusion entry - info, sim, left_seq_org, right_seq_org, left_dist, right_dist, pair_list = line[:-1].split('\t@\t')[:7] - info = info.split('\t') - if sim.strip() == "": - continue - sim = sim.split(' ') - left_seq = left_seq_org.replace(' ', '') - right_seq = right_seq_org.replace(' ', '') - - num_reads = int(info[4]) - - # Correct the number of spanning pairs by filtering on the inner distance - # based on transcript coordinates instead of genomic coordinates - pairs = get_valid_pairs(info[:4], pair_list, junctions) - num_pair_ends = len(pairs) - - # Format pairs for usage downstream - pairs = ['{0}:{1}'.format(a,b) for (a,b) in pairs] - - num_pair_ends_fusion = int(info[6]) - num_pair_ends_both = int(num_pair_ends + num_pair_ends_fusion * 0.5) - num_contracting_reads = int(info[7]) - left_ext = int(info[8]) - right_ext = int(info[9]) - sym = float(info[10]) - - half_len = len(left_seq) / 2 - chr1, chr2 = info[0].split('-')[:2] - coord1, coord2 = int(info[1]), int(info[2]) - dir = info[3] - - if string.find(sample_name, "single") != -1: - single = True - else: - single = False - - # Begin fusion filters - if left_ext < 16 or right_ext < 16: - continue - - both = num_reads + num_pair_ends_both - all = both - - if num_pair_ends > num_reads * 50: - continue - - if num_reads < params.num_fusion_reads or \ - num_pair_ends < params.num_fusion_pairs or \ - both < params.num_fusion_both: - continue - - """ - if (chr1 != chr2 and num_contracting_reads > num_reads) or \ - (chr1 == chr2 and num_contracting_reads > all + num_pair_ends + 5): - continue - """ - - # are the sequences around the breakpoint different enough? - if int(sim[0]) < 8: - continue - - # are the reads distributed symmetrically? - if sym >= 22 + max(0, 6 - num_reads): - continue - - max_intron_len = 100000 - if chr1 == chr2 and dir == "ff": - coord_dif = coord2 - coord1 - if coord_dif > 0 and coord_dif < max_intron_len: - continue - - if not left_seq[half_len-kmer_len:half_len] in seq_chr_dic or not right_seq[half_len:half_len+kmer_len] in seq_chr_dic: - continue - - left_chrs = seq_chr_dic[left_seq[half_len-kmer_len:half_len]] - right_chrs = seq_chr_dic[right_seq[half_len:half_len+kmer_len]] - - if chr1 == chr2: - max_intron_len = min(max_intron_len, abs(coord1 - coord2) * 9 / 10) - - same = False - for chr_coord in left_chrs: - chr, coord = chr_coord.rsplit(':', 1) - coord = int(coord) - if chr == chr2 and abs(coord - coord2) < max_intron_len: - same = True - break - - if same: - continue - - for chr_coord in right_chrs: - chr, coord = chr_coord.rsplit(':', 1) - coord = int(coord) - if chr == chr1 and abs(coord - coord1) < max_intron_len: - same = True - break - - if same: - continue - - def find_gene(chr, coord, one_dir, is_left): - result = [] - for gene_list in [refGene_list, ensGene_list]: - result.append(gene_exists(gene_list, chr, coord, one_dir, is_left)) - - if result[0][0] == "N/A": - return result[1] + result[1][:2] - else: - return result[0] + result[1][:2] - - dir = info[3] - gene1, gene1_name, gene1_where, gene1_belong, gene1_sense, ens_gene1, ens_gene1_name = find_gene(chr1, coord1, dir[0], True) - gene2, gene2_name, gene2_where, gene2_belong, gene2_sense, ens_gene2, ens_gene2_name = find_gene(chr2, coord2, dir[1], False) - if params.filter_by_annotation: - if gene1_name == gene2_name or ens_gene1_name == ens_gene2_name or ens_gene1 == ens_gene2: - continue - - if gene1 == "N/A" or gene2 == "N/A" or (string.find(gene1, "ENS") == 0 and string.find(gene2, "ENS") == 0): - continue - - left_diff = how_diff(left_seq[half_len - 20:half_len], right_seq[half_len - 20:half_len]) - if left_diff <= 8: - continue - - right_diff = how_diff(left_seq[half_len:half_len+20], right_seq[half_len:half_len+20]) - if right_diff <= 8: - continue - - if left_diff + right_diff < 20: - continue - - left_dist = left_dist.strip().split(' ') - right_dist = right_dist.strip().split(' ') - - for i in range(len(left_dist)): - left_dist[i] = '%d' % min(9, int(left_dist[i])) - right_dist[i] ='%d' % min(9, int(right_dist[i])) - - swap = False - if (dir == 'ff' and gene1_sense == '-' and gene2_sense == '-') or \ - (dir == 'rr' and gene1_sense == '+' and gene2_sense == '+') or \ - (dir == 'fr' and gene1_sense == '-' and gene2_sense == '+') or \ - (dir == 'rf' and gene1_sense == '+' and gene2_sense == '-'): - swap = True - - if swap: - if dir == 'ff': - dir = 'rr' - elif dir == 'rr': - dir = 'ff' - - info[0] = "%s-%s" % (chr2, chr1) - info[1:3] = info[1:3][::-1] - info[3] = dir - info[8:10] = info[8:10][::-1] - left_seq_org, right_seq_org = reverse_complement(right_seq_org), reverse_complement(left_seq_org) - left_dist, right_dist = right_dist, left_dist - gene1_name, gene1_where, gene1_sense, gene2_name, gene2_where, gene2_sense = \ - gene2_name, gene2_where, gene2_sense, gene1_name, gene1_where, gene1_sense - for j in range(len(pairs)): - pair = pairs[j].split(':') - pairs[j] = ':'.join(pair[::-1]) - - # check if this is due to trans-splicing. - if gene1_sense in "+-" and gene2_sense in "+-": - if ((dir == 'ff' or dir == 'rr') and gene1_sense != gene2_sense) or \ - (dir == 'fr' and (gene1_sense != '+' or gene2_sense != '-')) or \ - (dir == 'rf' and (gene1_sense != '-' or gene2_sense != '+')): - # print >> sys.stderr, "fusion due to trans-splicing", info - # print >> sys.stderr, gene1, gene1_name, gene1_where, gene1_sense - # print >> sys.stderr, gene2, gene2_name, gene2_where, gene2_sense - None - - fusion_gene = [] - info[5] = str(num_pair_ends) - fusion_gene.append(sample_name + ' ' + ' '.join(info[:10])) - fusion_gene.append(left_seq_org) - fusion_gene.append(right_seq_org) - fusion_gene.append("%s %s" % (''.join(left_dist[::-1]), ''.join(right_dist))) - - fusion_gene.append("%s %s %s %s" % (gene1_name, gene1_where, gene2_name, gene2_where)) - fusion_gene.append(" ".join(pairs)) - - fusion_gene_list.append(fusion_gene) - - fusion_file.close() - - print >> sys.stderr, "[%s] Filtering fusions" % right_now() - - seq_chr_dic = {} - seq_chr_file = open(output_dir + "fusion_seq.map", 'r') - for line in seq_chr_file: - seq, chrs = line[:-1].split('\t') - chrs = chrs.split(',') - seq_chr_dic[seq] = chrs - - seq_chr_file.close() - - re_mir = re.compile(r'^(MIR)') - def read_genes(gene_file_name, offset = 1, id = -4): - gene_list, temp_gene_list = [], [] - if not os.path.exists(gene_file_name): - return gene_list - - gene_file = open(gene_file_name, 'r') - for line in gene_file: - line = line[:-1].split('\t')[offset:] - num_exons = int(line[7]) - left_coords = line[8].split(',')[:num_exons] - right_coords = line[9].split(',')[:num_exons] - - if line[1] in chr_order and not re_mir.findall(line[id]): - temp_gene_list.append([line[0], line[1], int(line[3]), int(line[4]), line[id], left_coords, right_coords, line[2]]) - - gene_file.close() - - def my_cmp(a, b): - if a[1] != b[1]: - if chr_order[a[1]] < chr_order[b[1]]: - return -1 - else: - return 1 - else: - if a[2] != b[2]: - if a[2] < b[2]: - return -1 - else: - return 1 - else: - if a[3] > b[3]: - return -1 - else: - return 1 - - temp_gene_list = sorted(temp_gene_list, cmp=my_cmp) - gene_list = [] - if len(temp_gene_list) >= 1: - gene_list.append(temp_gene_list[0]) - - # remove overlapping genes, that is, the longest genes are preferred - for i in range(1, len(temp_gene_list)): - gene1 = temp_gene_list[i-1] - gene2 = temp_gene_list[i] - - if gene1[1] == gene2[1] and gene1[3] >= gene2[3]: - continue - - gene_list.append(gene2) - - return gene_list - - refGene_list, ensGene_list = [], [] - if os.path.exists("refGene.txt"): - refGene_list = read_genes("refGene.txt") - if os.path.exists("ensGene.txt"): - ensGene_list = read_genes("ensGene.txt") - - fusion_gene_list = [] - for file in sorted(os.listdir(".")): - if string.find(file, "tophat_") != 0: - continue - - fusion_file = file + "/fusions.out" - if not os.path.exists(fusion_file): - continue - - juncs_file = file + '/junctions.bed' - if not os.path.exists(juncs_file): - juncs_file = None - print >> sys.stderr, 'Warning: could not find juctions.bed (%s).' % (juncs_file) - - ref_file = "refGene.txt" - if not os.path.exists(ref_file): - ref_file = None - - ens_file = "ensGene.txt" - if not os.path.exists(ens_file): - ens_file = None - - if juncs_file is None and ref_file is None and ens_file is None: - print >> sys.stderr, 'Warning: neither junctions.bed nor ref/ensGene.txt found.' - - junctions = load_junctions(ref_file, ens_file, juncs_file) - - print >> sys.stderr, "\tProcessing:", fusion_file - filter_fusion_impl(fusion_file, junctions, refGene_list, ensGene_list, seq_chr_dic, fusion_gene_list) - - fusion_out_file = output_dir + "potential_fusion.txt" - output_file = open(fusion_out_file, 'w') - for fusion_gene in fusion_gene_list: - for line in fusion_gene: - print >> output_file, line - output_file.close() - print >> sys.stderr, '\t%d fusions are output in %s' % (len(fusion_gene_list), fusion_out_file) - - - -def parallel_work(pids, work): - child = -1 - for i in range(len(pids)): - if pids[i] == 0: - child = i - break - - while child == -1: - status = os.waitpid(0, 0) - for i in range(len(pids)): - if status[0] == pids[i]: - child = i - pids[i] = 0 - break - - child_id = os.fork() - if child_id == 0: - work() - os._exit(os.EX_OK) - else: - # print >> sys.stderr, '\t\t>> thread %d: %d' % (child, child_id) - pids[child] = child_id - - -def wait_pids(pids): - for pid in pids: - if pid > 0: - os.waitpid(pid, 0) - - -def do_blast(params): - print >> sys.stderr, "[%s] Blasting 50-mers around fusions" % right_now() - - file_name = output_dir + "potential_fusion.txt" - blast_genomic_out = output_dir + "blast_genomic" - blast_nt_out = output_dir + "blast_nt" - - if not os.path.exists(blast_genomic_out): - os.system("mkdir %s" % blast_genomic_out); - - if not os.path.exists(blast_nt_out): - os.system("mkdir %s" % blast_nt_out) - - pids = [0 for i in range(params.num_threads)] - - count = 0 - line_no = 0 - - check_list = [] - output_list = [] - output = "" - file = open(file_name, 'r') - for line in file: - if line_no % 6 == 0: - count += 1 - - if line_no % 6 == 1: - left_seq = line[:-1].split(" ")[0] - - if line_no % 6 == 2: - right_seq = line[:-1].split(" ")[1] - - if line_no % 6 == 4: - def blast(database, seq, outdir): - file_name = "%s/%s" % (outdir, seq) - if os.path.exists(file_name): - return - - blast_cmd = "echo %s | blastn -db %s -evalue 1e-10 -word_size 28" - # blast_cmd = "echo %s | blastall -p blastn -d %s -e 1e-10 -W 28" - output = os.popen(blast_cmd % (seq, database)).read() - - if str.find(output, "No hits found") != -1: - blast_cmd = "echo %s | blastn -db %s -evalue 1e-5" - # blast_cmd = "echo %s | blastall -p blastn -d %s -e 1e-5" - output = os.popen(blast_cmd % (seq, database)).read() - - pos1 = str.find(output, ">ref|") - pos2 = str.find(output, "Database: ", pos1) - - if pos1 != -1 and pos1 < pos2: - output = output[pos1:pos2].rstrip() - else: - output = "" - - file = open(file_name, "w") - file.write(output) - file.close() - - seq = left_seq + right_seq - - def work(): - if params.is_human: - blast_genomic = "blast/human_genomic" - else: - blast_genomic = "blast/other_genomic" - - blast_nt = "blast/nt" - - blast(blast_genomic, left_seq, blast_genomic_out) - blast(blast_genomic, right_seq, blast_genomic_out) - blast(blast_genomic, seq, blast_genomic_out) - blast(blast_nt, left_seq, blast_nt_out) - blast(blast_nt, right_seq, blast_nt_out) - blast(blast_nt, seq, blast_nt_out) - - if not os.path.exists(output_dir + "blast_nt/" + seq ): - print >> sys.stderr, "\t%d. %s" % (count, line[:-1]) - if params.num_threads <= 1: - work() - else: - parallel_work(pids, work) - - line_no += 1 - - if params.num_threads > 1: - wait_pids(pids) - - -def read_dist(params): - def alignments_region(sample_name, bam, alignments_list): - def output_region(sample_name, fusion, reads): - chr_ref, pos1_ref, pos2_ref, dir_ref = fusion - chr1_ref, chr2_ref = chr_ref.split('-') - - if len(reads) <= 0: - return - - alignments_file_name = "%s/%s_%s_%d_%d_%s" % (alignments_dir, sample_name, chr_ref, pos1_ref, pos2_ref, dir_ref) - alignments_file = open(alignments_file_name, "w") - - def my_cmp(a, b): - if a[3] and not b[3]: - return -1 - elif not a[3] and b[3]: - return 1 - - if a[3]: - if dir_ref[0] == "f": - return a[4] - b[4] - else: - return b[4] - a[4] - else: - if dir_ref[1] == "f": - return a[4] - b[4] - else: - return b[4] - a[4] - - reads = sorted(reads, cmp=my_cmp) - begin_pos = reads[0][4] - dist_to_breakpoint = abs(pos1_ref - begin_pos) - for read in reads: - seq_pos = 0 - read_id, chr1, chr2, before, left_pos, right_pos, cigars, seq, qual, mismatch, left = read[:-1] - space_len = 0 - if before: - space_len = abs(left_pos - begin_pos) - else: - space_len = abs(pos1_ref - begin_pos) + 2 - if dir_ref[1] == "f": - space_len += (left_pos - pos2_ref) - else: - space_len += (pos2_ref - left_pos) - - qual_str = seq_str = ' ' * space_len - - saw_fusion = False - for cigar in cigars: - length = int(cigar[:-1]) - cigar_op = cigar[-1] - - if cigar_op in "Mm": - seq_str += seq[seq_pos:seq_pos + length] - qual_str += qual[seq_pos:seq_pos + length] - seq_pos += length - - if cigar_op in "Nn": - seq_str += ('|' * length) - qual_str += ('|' * length) - - if cigar_op in "Dd": - seq_str += ("D" * length) - qual_str += ("D" * length) - - if cigar_op in "Ii": - seq_pos += length - - if cigar_op in "F": - seq_str += " " - qual_str += " " - - cigar_str = ''.join(cigars) - prefix_str = '%s %s %d %d %s' % (chr1, chr2, left_pos, right_pos, cigar_str) - prefix_str += ' ' * (60 - len(prefix_str)) - - L = "L" - if not left: - L = "R" - - begin, end = max(0, dist_to_breakpoint - within), dist_to_breakpoint + within - output = '%s %s\n' % (prefix_str, seq_str[begin:end]) - alignments_file.write(output) - - alignments_file.close() - - - cigar_re = re.compile('\d+\w') - within = 300 - - old_chr = "" - reads_list = [[] for i in range(len(alignments_list))] - reads_compress_list = [[1, 0] for i in range(len(alignments_list))] - - cmd = ['samtools', 'view', '-h', bam] - popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=open('/dev/null')) - - contigs = {} - for line in popen.stdout: - if line[0] == '@': - if line[1:3] == 'SQ': - line = line[:-1].split('\t') - contig = line[1].split(':', 1)[1] - if contig not in contigs: - contigs[contig] = len(contigs) - - continue - - line = line[:-1].split('\t') - - read_id, chr1, left_pos, cigar_str, seq, qual = line[0], line[2], int(line[3]) - 1, line[5], line[9], line[10] - - if chr1 != old_chr: - old_chr = chr1 - # print "%s - %s" % (sample_name, chr1) - - secondary_fusion_alignment = False - num_hits = 0 - for field in line[10:]: - if field[:5] == "XF:Z:": - if field[5] == '2': - secondary_fusion_alignment = True - - chr1, left_pos, cigar_str, seq, qual = field[7:].split() - left_pos = int(left_pos) - 1 - - if field[:5] == "NM:i:": - mismatch = int(field.split(':')[-1]) - - if field[:5] == "NH:i:": - num_hits = int(field.split(':')[-1]) - - if num_hits > params.fusion_multireads: - continue - - if secondary_fusion_alignment: - continue - - flag = int(line[1]) - left = (flag & 64 != 0) - antisense = (flag & 16 != 0) - - if '-' in chr1: - chr1, chr2 = chr1.split('-') - else: - chr2 = chr1 - - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - dir = "ff" - saw_fusion = False - for cigar in cigars: - length = int(cigar[:-1]) - cigar_op = cigar[-1] - - if cigar_op in "MDN": - right_pos += length - - if saw_fusion: - dir = dir[0] + "f" - else: - dir = "f" + dir[1] - - if cigar_op in "mdn": - right_pos -= length - - if saw_fusion: - dir = dir[0] + "r" - else: - dir = "r" + dir[1] - - if cigar_op in "F": - if dir[0] == "f": - pos1 = right_pos - 1 - else: - pos1 = right_pos + 1 - - pos2 = right_pos = length - 1 - saw_fusion = True - - if cigar_op in "iIdD": - mismatch -= length - - if not saw_fusion: - report_idx = -1 - for i in range(len(alignments_list)): - chr_ref, pos1_ref, pos2_ref = alignments_list[i][:3] - chr1_ref, chr2_ref = chr_ref.split('-') - - """ - # a fusion point can be given like "chr9-chr1" - if (contigs[chr1_ref] < contigs[chr1] or (contigs[chr1_ref] == contigs[chr1] and left_pos - pos1_ref > 1000000)) and \ - (contigs[chr2_ref] < contigs[chr1] or (contigs[chr2_ref] == contigs[chr1] and left_pos - pos2_ref > 1000000)): - report_idx = i - else: - break - """ - - if report_idx >= 0: - for j in range(report_idx+1): - output_region(sample_name, alignments_list[j], reads_list[j]) - - alignments_list = alignments_list[report_idx+1:] - reads_list = reads_list[report_idx+1:] - - if len(alignments_list) <= 0: - break - - fusion_read = saw_fusion - if mismatch > params.fusion_read_mismatches: - continue - - for i in range(len(alignments_list)): - chr_ref, pos1_ref, pos2_ref, dir_ref = alignments_list[i] - chr1_ref, chr2_ref = chr_ref.split('-') - - def reverse_read(fusion_read, dir, antisense, left_pos, right_pos, fusion_left, seq, qual, cigars): - if dir == 'ff' or not fusion_read: - left_pos, right_pos = right_pos - 1, left_pos - 1 - elif dir == 'fr': - left_pos, right_pos = right_pos + 1, left_pos - 1 - elif dir == 'rf': - left_pos, right_pos = right_pos - 1, left_pos + 1 - elif dir == 'rr': - left_pos, right_pos = right_pos + 1, left_pos + 1 - - reversed_cigars = [] - cigars.reverse() - for i in range(len(cigars)): - cigar = cigars[i] - opcode = cigar[-1] - - if opcode == 'F': - reversed_cigars.append("%dF" % fusion_left) - else: - if str.islower(opcode): - opcode = str.upper(opcode) - else: - opcode = str.lower(opcode) - - cigar = cigar[:-1] + opcode - reversed_cigars.append(cigar) - - if dir == 'ff': - dir = 'rr' - elif dir == 'rr': - dir = 'ff' - - return dir, antisense == False, left_pos, right_pos, reverse_complement(seq), qual[::-1], reversed_cigars - - if fusion_read and chr1 == chr2_ref and chr2 == chr1_ref and pos1 == pos2_ref and pos2 == pos1_ref: - dir, antisense, left_pos, right_pos, seq, qual, cigars = \ - reverse_read(fusion_read, dir, antisense, left_pos, right_pos, pos1, seq, qual, cigars) - - chr1, chr2 = chr2, chr1 - pos1, pos2 = pos2, pos1 - - if (fusion_read and chr1 == chr1_ref and chr2 == chr2_ref) or \ - (not fusion_read and (chr1 == chr1_ref or chr1 == chr2_ref)): - if not fusion_read: - do_not_continue = False - if chr1 == chr1_ref: - if abs(left_pos - pos1_ref) <= 10000 or abs(right_pos - pos1_ref) <= 10000: - do_not_continue = True - - if chr1 == chr2_ref: - if abs(left_pos - pos2_ref) <= 10000 or abs(right_pos - pos2_ref) <= 10000: - do_not_continue = True - - if not do_not_continue: - continue - - seq_temp = seq - qual_temp = qual - cigars_temp = cigars - left_pos_temp, right_pos_temp = left_pos, right_pos - - before = True - if fusion_read: - if dir != dir_ref: - continue - - if pos1 != pos1_ref or pos2 != pos2_ref: - continue - else: - do_not_continue = False - if chr1 == chr1_ref: - if dir_ref[0] == "f" and not (right_pos - pos1_ref >= 10 or pos1_ref - right_pos >= within): - do_not_continue = True - if dir_ref[0] == "r" and not (pos1_ref - left_pos >= 10 or left_pos - pos1_ref >= within): - do_not_continue = True - - if chr1 == chr2_ref: - if dir_ref[1] == "f" and not (pos2_ref - left_pos >= 10 or left_pos - pos2_ref >= within): - do_not_continue = True - before = False - if dir_ref[1] == "r" and not (right_pos - pos2_ref >= 10 or pos2_ref - right_pos >= within): - do_not_continue = True - before = False - - if not do_not_continue: - continue - - if (before and dir_ref[0] == "r") or (not before and dir_ref[1] == "r"): - dir, antisense, left_pos_temp, right_pos_temp, seq_temp, qual_temp, cigars_temp = \ - reverse_read(fusion_read, dir, antisense, left_pos_temp, right_pos_temp, 0, seq_temp, qual_temp, cigars_temp) - - reads_list[i].append([read_id, chr1, chr2, before, left_pos_temp, right_pos_temp, cigars_temp, seq_temp, qual_temp, mismatch, left, line]) - reads_compress_list[i][1] += 1 - - - for i in range(len(alignments_list)): - output_region(sample_name, alignments_list[i], reads_list[i]) - - print >> sys.stderr, "[%s] Generating read distributions around fusions" % right_now() - - file_name = output_dir + "potential_fusion.txt" - file = open(file_name, 'r') - - alignments_dir = output_dir + "read_alignments" - alignments_list = {} - line_no = 0 - for line in file: - if line_no % 6 == 0: - temp_list = line[:-1].split(' ') - sample_name = temp_list[0] - - if not sample_name in alignments_list: - alignments_list[sample_name] = [] - - if not os.path.exists("%s/%s_%s" % (alignments_dir, sample_name, "_".join(temp_list[1:5]))): - alignments_list[sample_name].append(' '.join(temp_list[1:5])) - - line_no += 1 - - file.close() - - if not os.path.exists(alignments_dir): - os.system("mkdir %s" % alignments_dir) - - pids = [0 for i in range(params.num_threads)] - - for sample_name, list in alignments_list.items(): - bam_file_name = 'tophat_%s/accepted_hits.bam' % sample_name - if not os.path.exists(bam_file_name): - continue - - increment = 50 - for i in range((len(list) + increment - 1) / increment): - temp_list = list[i*increment:(i+1)*increment] - print >> sys.stderr, '\t%s (%d-%d)' % (sample_name, i*increment + 1, min((i+1)*increment, len(list))) - - alignments_list = [] - for fusion in temp_list: - print >> sys.stderr, '\t\t%s' % fusion - fusion = fusion.split() - alignments_list.append([fusion[0], int(fusion[1]), int(fusion[2]), fusion[3]]) - - def work(): - if len(alignments_list) > 0: - alignments_region(sample_name, bam_file_name, alignments_list) - - if params.num_threads <= 1: - work() - else: - parallel_work(pids, work) - - if params.num_threads > 1: - wait_pids(pids) - - - -def generate_html(params): - def blast_output(database, seq): - blast_output_filename = "%s/%s" % (database, seq) - if os.path.exists(blast_output_filename): - file = open(blast_output_filename, "r") - return file.read() + "\n" - - return "" - - def read_fusion_genes(fusion_gene_list): - sample_names = [] - sample_list_filename = output_dir + 'sample_list.txt' - if not os.path.exists(sample_list_filename): - return - - sample_list_file = open(sample_list_filename, "r") - for line in sample_list_file: - sample_names.append(line[:-1]) - sample_list_file.close() - - for sample_name in sample_names: - sample_isoform_filename = "tophat_" + sample_name + "/transfuse.txt" - if not os.path.exists(sample_isoform_filename): - continue - - fusion_gene_begin_index = len(fusion_gene_list) - sample_isoform_file = open(sample_isoform_filename, "r") - for line in sample_isoform_file: - line = line[:-1].lstrip() - fields = line.split('\t') - - def read_coords(value_dic, value_str): - chr1, chr2 = "", "" - if ' ' in value_str: - chr1, chr2 = value_str.split(' ') - else: - chr1 = value_str - - chr1, chr1_coord = chr1.rsplit(':', 1) - chr1_left, chr1_right = chr1_coord.split('-') - chr1_left, chr1_right = int(chr1_left), int(chr1_right) - value_dic["chr1"] = chr1 - value_dic["chr1_left"] = chr1_left - value_dic["chr1_right"] = chr1_right - - if chr2: - chr2, chr2_coord = chr2.rsplit(':', 1) - chr2_left, chr2_right = chr2_coord.split('-') - chr2_left, chr2_right = int(chr2_left), int(chr2_right) - value_dic["chr2"] = chr2 - value_dic["chr2_left"] = chr2_left - value_dic["chr2_right"] = chr2_right - - def read_values(value_dic, value_str): - values = fields[-1].split(';') - - for value in values: - value = value.strip() - if not value: - continue - - name, value = value.split(' ') - - value_dic[name] = value[1:-1] - - type = fields[0] - if type == "gene": - value_dic = {} - read_coords(value_dic, fields[1]) - read_values(value_dic, fields[-1]) - - fusion_gene_list.append([sample_name, value_dic, []]) - - elif type == "transcript": - value_dic = {} - read_coords(value_dic, fields[1]) - read_values(value_dic, fields[-1]) - - # daehwan - for debugging purposes - if float(value_dic["FPKM"]) >= 1.0: - skip_transcript = False - curr_gene = fusion_gene_list[-1] - transcript_list = curr_gene[-1] - transcript_list.append([value_dic, []]) - else: - skip_transcript = True - - elif type in ["exon", "fusion"]: - if skip_transcript: - continue - - curr_gene = fusion_gene_list[-1] - transcript_list = curr_gene[-1] - curr_transcript = transcript_list[-1] - exon_list = curr_transcript[-1] - - if type == "exon": - chr, chr_coord = fields[1].rsplit(':', 1) - chr_left, chr_right = chr_coord.split('-') - chr_left, chr_right = int(chr_left), int(chr_right) - exon_list.append(["exon", chr, chr_left, chr_right]) - else: - fusion_left, fusion_right = fields[1].split(' ') - chr1, chr1_pos = fusion_left.rsplit(':', 1) - chr2, chr2_pos = fusion_right.rsplit(':', 1) - - chr1_pos, chr2_pos = int(chr1_pos), int(chr2_pos) - exon_list.append(["fusion", chr1, chr1_pos, chr2, chr2_pos]) - - transcript_coverage_dic = {} - sample_isoform_cov_filename = "tophat_" + sample_name + "/transfuse_cov.txt" - if os.path.exists(sample_isoform_cov_filename): - sample_isoform_cov_file = open(sample_isoform_cov_filename, "r") - for line in sample_isoform_cov_file: - if line[0] == "C": - transcript_id = line.split("\t")[0] - transcript_coverage_dic[transcript_id] = [] - else: - last_value = line.strip().split()[-1] - base_cov_float = float(last_value) - if math.isnan(base_cov_float): - base_cov_float = 0.1 - base_cov = int(base_cov_float + 0.99) - transcript_coverage_dic[transcript_id].append(base_cov) - - sample_isoform_cov_file.close() - - - for fusion_gene in fusion_gene_list[fusion_gene_begin_index:]: - sample_name, gene_value_dic, transcripts = fusion_gene - max_base_cov = 0 - - saw_fusion_transcript = False - for transcript in transcripts: - transcript_value_dic, exons = transcript - transcript_id = transcript_value_dic["transcript_id"] - - if transcript_id in transcript_coverage_dic: - transcript_coverage = transcript_coverage_dic[transcript_id] - else: - transcript_coverage = None - - saw_fusion = False - exon_offset = 0 - for exon in exons: - if exon[0] == "fusion": - saw_fusion = True - continue - - exon_length = exon[3] - exon[2] + 1 - - if transcript_coverage: - coverage = transcript_coverage[exon_offset:exon_offset + exon_length] - else: - coverage = [0 for z in range(exon_length)] - - exon.append(coverage) - - for base_cov in exon[-1]: - if base_cov > max_base_cov: - max_base_cov = base_cov - - exon_offset += exon_length - - transcript_value_dic["is_fusion_transcript"] = saw_fusion - if saw_fusion: - saw_fusion_transcript = True - - if not saw_fusion and saw_fusion_transcript: - transcript_value_dic["is_after_fusion"] = True - else: - transcript_value_dic["is_after_fusion"] = False - - for transcript in transcripts: - transcript_value_dic, exons = transcript - transcript_value_dic["max_base_cov"] = max_base_cov - - sample_isoform_file.close() - - def find_fusion_gene(fusion_gene_list, sample_name, chr1, chr1_coord, chr2, chr2_coord): - # slow linear search through fusion_gene_list - fusion_gene_info = "" - for fusion_gene in fusion_gene_list: - _sample_name = fusion_gene[0] - value_dic = fusion_gene[1] - _chr1, _chr1_left, _chr1_right, _chr2, _chr2_left, _chr2_right = \ - value_dic["chr1"], value_dic["chr1_left"], value_dic["chr1_right"], \ - value_dic["chr2"], value_dic["chr2_left"], value_dic["chr2_right"] - - if sample_name != _sample_name: - continue - - if chr1 != _chr1 or chr2 != _chr2: - continue - - if chr1_coord < _chr1_left - 2 or chr1_coord > _chr1_right + 2: - continue - - if chr2_coord < _chr2_left -2 or chr2_coord > _chr2_right + 2: - continue - - transcripts = fusion_gene[-1] - for transcript in transcripts: - exons = transcript[-1] - for exon in exons: - if exon[0] != "fusion": - continue - - if chr1_coord == exon[2] and chr2_coord == exon[4]: - return fusion_gene - - return None - - - def read_fusion_list(fusion_list): - re_find_chromosomes = re.compile(r'Homo sapiens chromosome (\d+|[XY])') - re_find_identities = re.compile(r'Identities = (\d+)\/\d+ \((\d+)%\)') - re_find_exon = re.compile(r'exon\d+\((\d+-\d+)\)') - - line_no = 0 - do_not_add = False - output = {} - file_name = output_dir + "potential_fusion.txt" - file = open(file_name, 'r') - for line in file: - if line_no % 6 == 0: - if output and not do_not_add: - fusion_list.append(output) - - do_not_add = False - temp_list = line[:-1].split(' ') - sample_name = temp_list[0] - - chr = temp_list[1] - chr1, chr2 = chr.split('-') - left = int(temp_list[2]) - right = int(temp_list[3]) - dir = temp_list[4] - - output = {} - output["sample_name"] = sample_name - output["chr"] = chr - output["chr1"] = chr1 - output["chr2"] = chr2 - output["left_coord"] = left - output["right_coord"] = right - output["dir"] = dir - output["stats"] = temp_list[5:] - - elif line_no % 6 == 1: - output["left_seq"] = line[:-1].split(" ")[0] - - elif line_no % 6 == 2: - output["right_seq"] = line[:-1].split(" ")[1] - - left_seq = output["left_seq"] - right_seq = output["right_seq"] - seq = left_seq + right_seq - - temp_output = blast_output(output_dir + "blast_genomic", left_seq) - left_chromosomes = set(re_find_chromosomes.findall(temp_output)) - - temp_output = blast_output(output_dir + "blast_genomic", right_seq) - right_chromosomes = set(re_find_chromosomes.findall(temp_output)) - - if string.find(output["sample_name"], "single") == -1: - chr1, chr2 = output["chr"].split("-") - if chr1 != chr2 and left_chromosomes & right_chromosomes: - do_not_add = True - - temp_output = blast_output(output_dir + "blast_genomic", seq) + blast_output(output_dir + "blast_nt", seq) - for identity in re_find_identities.findall(temp_output): - query = int(identity[0]) - percent = int(identity[1]) - if query + percent > 160: - do_not_add = True - break - - elif line_no % 6 == 3: - output["depth"] = line[:-1] - - elif line_no % 6 == 4: - output["gene"] = line[:-1] - temp = output["gene"].split() - output["gene1"], output["gene2"] = temp[0], temp[2] - - else: - pair_str = line.strip() - if len(pair_str) > 0: - pairs = pair_str.split(" ") - else: - pairs = [] - - output["pair_coords"] = pairs - - read_output = "" - read_output_file = output_dir + "read_alignments/%s_%s_%d_%d_%s" % (output["sample_name"], output["chr"], - output["left_coord"], output["right_coord"], output["dir"]) - - if not do_not_add and os.path.exists(read_output_file): - read_output = open(read_output_file, "r").read() - - def unique(list, left, right): - color_len = 300 - result, lcolor, rcolor = [], [0 for i in range(color_len)], [0 for i in range(color_len)] - keys = [] - F_start, F_end = 1000000, 0 - for item in list: - key = item.split(" ") - if len(key) < 6: - continue - - chr1, chr2, pos1, pos2, cigars = key[:5] - pos1, pos2 = int(pos1), int(pos2) - - def color(list, left, right): - for i in range(left, right): - if i >= len(list): - break - - list[i] += 1 - - if 'F' in cigars: - dist1 = abs(pos1 - left) - dist2 = abs(pos2 - right) - - color(lcolor, 0, dist1) - color(rcolor, 0, dist2) - - elif abs(pos2 - left) < color_len: - color(lcolor, abs(pos2 - left), abs(pos1 - left)) - - elif abs(pos1 - right) < color_len: - color(rcolor, abs(pos1 - right), abs(pos2 - right)) - - if "F" in key[4] or len(keys) == 0 or key[:2] != keys[-1][:2] or abs(int(key[2]) - int(keys[-1][2])) >= 3: - keys.append(key[:3]) - result.append(item) - - if "F" in key[4]: - index = len(result) - 1 - if index < F_start: - F_start = index - - if F_end < index: - F_end = index - - if F_end == 0: - F_start, F_end = 0, len(result) - - temp_result = result[max(0, F_start - 50):(F_end+50)] - result = [] - for i in range(len(temp_result)): - item = temp_result[i].split(" ") - if i == 0: - trim = 0 - for ch in item[40:-1]: - trim += 1 - - item = item[:40] + item[40 + trim:] - result.append(" ".join(item)) - - return result, lcolor, rcolor - - if len(read_output) < 1024 * 1024 * 1024: - read_output = read_output.split("\n") - read_output, lcolor, rcolor = unique(read_output, output["left_coord"], output["right_coord"]) - - def stat(lcolor, rcolor, dist): - lcount, lsum, rcount, rsum = 1, 0, 1, 0 - lgap, lpass, rgap, rpass = 0, False, 0, False - for i in range(dist): - if lcolor[i] > 0: - lcount += 1 - lsum += lcolor[i] - - if lgap > 0: - lpass = True - else: - if not lpass: - lgap += 1 - - if rcolor[i] > 0: - rcount += 1 - rsum += rcolor[i] - - if rgap > 0: - rpass = True - else: - if not rpass: - rgap += 1 - - if not lpass: - lgap = 0 - - if not rpass: - rgap = 0 - - return lcount, lsum / lcount, lgap, rcount, rsum / rcount, rgap - - temp = output["gene"].split() - gene1_loc, gene2_loc = temp[1], temp[3] - - def find_exon_len(gene_loc, coord, dir, is_left): - exon_len = 1000000 - for loc in re_find_exon.findall(gene_loc): - start, end = loc.split('-') - start, end = int(start), int(end) - if (is_left and dir == 'f') or (not is_left and dir == 'r'): - exon_len = coord - start + 1 - else: - exon_len = end - coord + 1 - return exon_len - - return exon_len - - lcount_min, rcount_min, diff_max = 150, 150, 120 - - lcount_exon_min = find_exon_len(gene1_loc, output["left_coord"], output["dir"][0], True) - rcount_exon_min = find_exon_len(gene2_loc, output["right_coord"], output["dir"][1], False) - - lcount_min = min(lcount_min, lcount_exon_min - 20) - rcount_min = min(rcount_min, rcount_exon_min - 20) - diff_max = min(diff_max, abs(lcount_min - rcount_min) + 20) - - if lcount_exon_min < 1000 and rcount_exon_min < 1000: - diff_max = max(diff_max, abs(lcount_exon_min - rcount_exon_min) + 20) - - lcount, lavg, lgap, rcount, ravg, rgap = stat(lcolor, rcolor, len(lcolor)) - if lcount <= lcount_min or rcount <= rcount_min or lgap / lcount > 0.1 or rgap / rcount > 0.1: - if abs(min(lcount, lcount_exon_min) - min(rcount, rcount_exon_min)) > diff_max or lcount < 60 or rcount < 60: - do_not_add = True - - def derivation(color, length, avg): - der = 0.0 - for i in range(length): - diff = 1.0 - float(color[i]) / float(max(1, avg)) - der += diff * diff - - return math.sqrt(der / max(1, length)) - - output["read_output"] = read_output - output["lcount"] = lcount - output["lavg"] = lavg - output["lgap"] = lgap - output["lder"] = lder = derivation(lcolor, lcount_min, lavg) - output["rcount"] = rcount - output["ravg"] = ravg - output["rgap"] = rgap - output["rder"] = rder = derivation(rcolor, rcount_min, ravg) - - stats = output["stats"] - num_read = int(stats[0]) - pair = int(stats[1]) - pair_fusion = int(stats[2]) - anti_read = int(stats[3]) - anti_pair = int(stats[4]) - pair_coords = output["pair_coords"] - - dist = 1000000 - if len(pair_coords) > 0: - pair = 0 - for pair_coord in pair_coords: - pair_coord = pair_coord.split(":") - temp_dist = abs(int(pair_coord[0])) + abs(int(pair_coord[1])) - if temp_dist < dist: - dist = temp_dist - - if temp_dist < 2000: - pair += 1 - - - anti_read += 0.5 - if pair == 0: - rate = num_read / anti_read - else: - rate = pair / anti_read - - fusion_gene = find_fusion_gene(fusion_gene_list, \ - output["sample_name"], \ - output["chr1"], output["left_coord"], \ - output["chr2"], output["right_coord"]) - - output["assembled"] = False - fusion_dic = {} - left_normal_transcript, right_normal_transcript = 0, 0 - if fusion_gene: - sample_name, gene_value_dic, transcripts = fusion_gene - for transcript in transcripts: - transcript_value_dic, exons = transcript - if not transcript_value_dic["is_fusion_transcript"]: - if transcript_value_dic["is_after_fusion"]: - right_normal_transcript = 1 - else: - left_normal_transcript = 1 - - continue - - for exon in exons: - if exon[0] == "fusion": - fusion_item = "%d-%d" % (exon[2], exon[4]) - - if output["left_coord"] == exon[2] and output["right_coord"] == exon[4]: - assembled = True - output["assembled"] = True - - if fusion_item not in fusion_dic: - fusion_dic[fusion_item] = 0 - - fusion_dic[fusion_item] += 1 - - - num_fusions = 0 - num_alternative_splicing_with_same_fusion = 0 - - for key, value in fusion_dic.items(): - num_fusions += 1 - num_alternative_splicing_with_same_fusion += (value - 1) - - # lcount and rcount both are in [0, 300] - max_avg = 300 - score = lcount + rcount + (min(max_avg, lavg) + min(max_avg, ravg)) - abs(lcount - rcount) \ - - min(max_avg, abs(lavg - ravg)) - (lgap + rgap) - (lder + rder) * max_avg - min(dist, 1000) \ - + rate - - if output["assembled"]: - score += 300 + min(num_fusions - 1, 5) * 200 + min(num_alternative_splicing_with_same_fusion, 5) * 100 \ - + left_normal_transcript * 50 + right_normal_transcript * 50 - - output["score"] = score - output["test"] = "lcount: %d, lavg: %d, lgap: %d, lder: %f, rcount: %d, ravg: %d, rgap: %d, rder: %f, dist: %d, score: %f num_fusions: %d num_alternative_splicing_with_same_fusion: %d left_normal_transcript: %d right_normal_transcript: %d" % \ - (lcount, lavg, lgap, lder, rcount, ravg, rgap, rder, dist, score, num_fusions, num_alternative_splicing_with_same_fusion, left_normal_transcript, right_normal_transcript) - - else: - read_output = [r"too many - not shown"] - - if not "read_output" in output: - output["read_output"] = "" - output["lcount"] = 0 - output["lavg"] = 0 - output["lgap"] = 0 - output["lder"] = 0 - output["rcount"] = 0 - output["ravg"] = 0 - output["rgap"] = 0 - output["rder"] = 0 - output["score"] = 0 - output["assembled"] = False - - line_no += 1 - - if output and not do_not_add: - fusion_list.append(output) - - file.close() - - - def cluster_fusion(fusion_list, fusion_gene_list, cluster_list): - cluster_dist = 500000 - - cluster_temp_list = [] - parent = [i for i in range(len(fusion_list))] - - for i in range(len(fusion_list)): - fusion = fusion_list[i] - - cluster = {} - cluster["index"] = [i] - cluster["chr"] = fusion["chr"] - cluster["left1"] = cluster["left2"] = fusion["left_coord"] - cluster["right1"] = cluster["right2"] = fusion["right_coord"] - cluster["dir"] = fusion["dir"] - - cluster_temp_list.append(cluster) - - def parent_index(parent, i): - parent_i = parent[i] - if i == parent_i: - return i - else: - parent[i] = parent[parent_i] - return parent_index(parent, parent[i]) - - for i in range(len(fusion_list) - 1): - parent_i = parent_index(parent, i) - for j in range(i+1, len(fusion_list)): - parent_j = parent_index(parent, j) - - if parent_i == parent_j: - continue - - cluster_i = cluster_temp_list[parent_i] - cluster_j = cluster_temp_list[parent_j] - - if cluster_i["chr"] != cluster_j["chr"] or cluster_i["dir"] != cluster_j["dir"]: - continue - - i_left1 = cluster_i["left1"] - i_left2 = cluster_i["left2"] - - j_left1 = cluster_j["left1"] - j_left2 = cluster_j["left2"] - - if abs(i_left1 - j_left1) > cluster_dist or abs(i_left2 - j_left2) > cluster_dist or \ - abs(i_left2 - j_left1) > cluster_dist or abs(i_left1 - j_left2) > cluster_dist: - continue - - i_right1 = cluster_i["right1"] - i_right2 = cluster_i["right2"] - - j_right1 = cluster_j["right1"] - j_right2 = cluster_j["right2"] - - if abs(i_right1 - j_right1) > cluster_dist or abs(i_right2 - j_right2) > cluster_dist or\ - abs(i_right2 - j_right1) > cluster_dist or abs(i_right1 - j_right2) > cluster_dist: - continue - - cluster_i["left1"] = min(i_left1, j_left1) - cluster_i["left2"] = max(i_left2, j_left2) - - cluster_i["right1"] = min(i_right1, j_right1) - cluster_i["right2"] = max(i_right2, j_right2) - - parent[j] = i - cluster_i["index"].extend(cluster_j["index"]) - - - cluster_temp_list2 = [] - for i in range(len(parent)): - if i == parent[i]: - cluster_temp_list2.append(cluster_temp_list[i]) - - def num_samples(indices): - samples = [] - for i in indices: - sample = fusion_list[i]["sample_name"] - if not sample in samples: - samples.append(sample) - - return len(samples) - - def cmp(a, b): - def pair_count(indices): - final_score = -1000000.0 - for index in indices: - score = fusion_list[index]["score"] - if score > final_score: - final_score = score - - return int(final_score) - - a_indices = a["index"] - b_indices = b["index"] - - a_gene, b_gene = 0, 0 - def known_genes(indices): - num = 0 - for index in indices: - temp_num = 0 - fusion = fusion_list[index] - - if fusion["gene1"] != "N/A": - temp_num += 1 - - if fusion["gene2"] != "N/A": - temp_num += 1 - - if temp_num > num: - num = temp_num - - return num - - a_gene = known_genes(a_indices) - b_gene = known_genes(b_indices) - - if a_gene != b_gene: - return b_gene - a_gene - - a_num_sample, b_num_sample = num_samples(a_indices), num_samples(b_indices) - a_score = pair_count(a_indices) - b_score = pair_count(b_indices) - - return b_score - a_score - - cluster_temp_list = sorted(cluster_temp_list2, cmp=cmp) - - for i in range(min(params.max_num_fusions, len(cluster_temp_list))): - do_not_add = False - indices = cluster_temp_list[i]["index"] - if not do_not_add: - def cmp(a, b): - return int(fusion_list[b]["score"] - fusion_list[a]["score"]) - - cluster_temp_list[i]["index"] = sorted(cluster_temp_list[i]["index"], cmp=cmp) - cluster_list.append(cluster_temp_list[i]) - - def generate_html_impl(fusion_list, cluster_list, fusion_gene_list): - html_file_name = output_dir + 'result.html' - txt_file_name = output_dir + 'result.txt' - html_file = open(html_file_name, 'w') - txt_file = open(txt_file_name, 'w') - - html_prev = [] - javascript = [] - html_body = [] - html_post = [] - - def cmp(i, j): - i_left = fusion_list[i]["left_coord"] - j_left = fusion_list[j]["left_coord"] - if i_left == j_left: - return i - j - else: - return i_left - j_left - - indices_list = [] - for c in cluster_list: - indices_list += c["index"] - - indices_list = sorted(indices_list) - print >> sys.stderr, "\tnum of fusions: %d" % len(cluster_list) - - if params.tex_table: - tex_table_file_name = output_dir + 'table.tex' - tex_table_file = open(tex_table_file_name, 'w') - print >> tex_table_file, r'\documentclass{article}' - print >> tex_table_file, r'\usepackage{graphicx}' - print >> tex_table_file, r'\begin{document}' - print >> tex_table_file, r'\pagestyle{empty}' - print >> tex_table_file, r"\center{\scalebox{0.7}{" - print >> tex_table_file, r"\begin{tabular}{| c | c | c | c | c | c | c |}" - print >> tex_table_file, r"\hline" - print >> tex_table_file, r"SAMPLE ID & Fusion genes (left-right) & Chromosomes (left-right) & " + \ - r"5$'$ position & 3$'$ position & Spanning reads & Spanning pairs \\" - - html_prev.append(r'') - html_prev.append(r'') - html_prev.append(r'result') - html_prev.append(r'') - html_prev.append(r'') - html_prev.append(r'') - html_prev.append(r'') - html_prev.append(r'') - html_prev.append(r'') - html_prev.append(r'') - html_body.append(r'') - - html_post.append(r'


Candidate fusion list

') - html_post.append(r'The following tables show fusion candidates where fusions are grouped based on their genomic locations (table description).
') - - num_fusions = 0 - for c in range(len(cluster_list)): - cluster = cluster_list[c] - indices = cluster["index"] - - num_fusions += len(indices) - - top_index = indices[0] - indices = sorted(indices, cmp=cmp) - - chr1, chr2 = cluster["chr"].split('-') - - if params.tex_table: - print >> tex_table_file, r'\hline' - - html_post.append(r'


') - html_post.append(r'%d. %s %s' % (c+1, cluster["chr"], cluster["dir"])) - - html_post.append(r'') - for i in indices: - fusion = fusion_list[i] - sample_name = fusion["sample_name"] - # sample_name = string.split(sample_name, '_')[0] - - output = "" - stats = fusion["stats"] - stats = [int(stat) for stat in stats] - - if stats[1] > 0: - pair_support_html = r"\htmlref{%d}{pair_%d}" % (stats[1], i) - else: - pair_support_html = r"0" - - if params.tex_table: - if sample_name == "lane1": - sample_name2 = "thyroid" - else: - sample_name2 = "testes" - - print >> tex_table_file, r'%s & %s & %s & %d & %d & %d & %d \\' % \ - (sample_name2, - fusion["gene1"] + "-" + fusion["gene2"], - chr1[3:] + "-" + chr2[3:], - fusion["left_coord"], - fusion["right_coord"], - stats[0], - stats[1] + stats[2]) - - print >> txt_file, '%s\t%s\t%s\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%.2f' % \ - (sample_name, - fusion["gene1"], - chr1, - fusion["left_coord"], - fusion["gene2"], - chr2, - fusion["right_coord"], - stats[0], - stats[1], - stats[2], - fusion["score"]) - - html_post.append(r'' % (i, sample_name)) - html_post.append(r'' % fusion["gene1"]) - html_post.append(r'' % chr1) - html_post.append(r'' % fusion["left_coord"]) - html_post.append(r'' % fusion["gene2"]) - html_post.append(r'' % chr2) - html_post.append(r'' % fusion["right_coord"]) - html_post.append(r'' % (i, stats[0])) - html_post.append(r'' % (i, stats[1])) - html_post.append(r'' % stats[2]) - - # daehwan - for debugging purposes - # html_post.append(r'' % fusion["score"]) - # html_post.append(r'' % fusion["test"]) - - fusion_gene = find_fusion_gene(fusion_gene_list, sample_name, chr1, fusion["left_coord"], chr2, fusion["right_coord"]) - fusion["fusion_gene"] = fusion_gene - if fusion_gene and fusion["assembled"]: - html_post.append(r'' % i) - - html_post.append(r'') - - - html_post.append(r'
%s%s%s%d%s%s%d%d%d%d%.2f%sisoforms
') - - """ - # sample - html_post.append(r'


sample list

') - html_post.append(r'') - html_post.append(r'') - html_post.append(r'') - html_post.append(r'') - html_post.append(r'') - - if os.path.exists("sample_info.txt"): - sample_file = open("sample_info.txt", "r") - for line in sample_file: - line = line[:-1].split("\t") - html_post.append(r'' % line[0]) - html_post.append(r'' % line[1]) - html_post.append(r'' % line[2]) - html_post.append(r'' % line[3]) - - sample_file.close() - - html_post.append(r'
sample namefragment lengthread length# of fragments
%s%s%s%s
') - """ - - # description - html_post.append(r'


table description

') - html_post.append(r'1. Sample name in which a fusion is identified
') - html_post.append(r'2. Gene on the "left" side of the fusion
') - html_post.append(r'3. Chromosome ID on the left
') - html_post.append(r'4. Coordinates on the left
') - html_post.append(r'5. Gene on the "right" side
') - html_post.append(r'6. Chromosome ID on the right
') - html_post.append(r'7. Coordinates on the right
') - html_post.append(r'8. Number of spanning reads - this is the number of reads that span a fusion point all on their own. In other words, the read itself has a fusion break point within it.
') - html_post.append(r'9. Number of spanning mate pairs - this is the number of pairs of reads where one read maps entirely on the left and the other read maps entirely on the right of the fusion break point. Neither read is split, so these pairs are not counted at all in (8).
') - html_post.append(r'10. Number of spanning mate pairs where one end spans a fusion (reads spanning fusion with only a few bases are included).
') - html_post.append(r'If you follow the the 9th column, it shows coordinates "number1:number2" where one end is located at a distance of "number1" bases from the left genomic coordinate of a fusion and "number2" is similarly defined.') - html_post.append(r'


') - - fusion_gene_drawn = [False for i in range(len(cluster_list))] - - fusion_gene_indices = [] - for i in indices_list: - cluster_index = -1 - for c in range(len(cluster_list)): - cluster_indices = cluster_list[c]["index"] - if i in cluster_indices: - cluster_index = c - break - - fusion = fusion_list[i] - sample_name = fusion["sample_name"] - chr = fusion["chr"] - left = fusion["left_coord"] - right = fusion["right_coord"] - dir = fusion["dir"] - - chr1, chr2 = chr.split("-") - - left_seq = fusion["left_seq"] - right_seq = fusion["right_seq"] - seq = left_seq + right_seq - - pair_coords = fusion["pair_coords"] - stats = fusion["stats"] - stats = [int(stat) for stat in stats] - - html_post.append(r'' % i) - html_post.append(r'' % (sample_name)) - html_post.append(r'' % fusion["gene1"]) - html_post.append(r'' % chr1) - html_post.append(r'' % fusion["left_coord"]) - html_post.append(r'' % fusion["gene2"]) - html_post.append(r'' % chr2) - html_post.append(r'' % fusion["right_coord"]) - html_post.append(r'' % (i, stats[0])) - html_post.append(r'' % (i, stats[1])) - html_post.append(r'' % stats[2]) - html_post.append(r'') - - dir_str = "" - if dir[0] == "f": - dir_str += "-------->" - else: - dir_str += "<--------" - - dir_str += " " - - if dir[1] == "f": - dir_str += "-------->" - else: - dir_str += "<--------" - - html_post.append(r'' % dir_str) - html_post.append(r'

%s

%s

%s

%d

%s

%s

%d

%d

%d

%d

%s

') - html_post.append(r'

') - - """ - html_post.append(r'
')
-            html_post.append(r'%s %s' % (left_seq, right_seq))
-            html_post.append(r'%s' % fusion["depth"])
-            html_post.append(r'%s' % fusion["gene"])
-            html_post.append(r'
') - """ - - fusion_gene = fusion["fusion_gene"] - if fusion_gene: - fusion_gene_indices.append([i, cluster_index]) - - fg_value_dic, transcripts = fusion_gene[1], fusion_gene[2] - html_post.append(r'

%s-%s

' % (i, fusion["gene1"], fusion["gene2"])) - - if not fusion_gene_drawn[cluster_index]: - javascript.append(r'') - - fusion_gene_drawn[cluster_index] = True - - left_blast_genomic = blast_output(output_dir + "blast_genomic", left_seq) - right_blast_genomic = blast_output(output_dir + "blast_genomic", right_seq) - blast_genomic = blast_output(output_dir + "blast_genomic", seq) - - left_blast_nt = blast_output(output_dir + "blast_nt", left_seq) - right_blast_nt = blast_output(output_dir + "blast_nt", right_seq) - blast_nt = blast_output(output_dir + "blast_nt", seq) - - html_post.append(r'') - html_post.append(r'
Left flanking sequenceRight flanking sequence
%s%s
' % (left_seq, right_seq)) - - html_post.append(r'

blast search - genome

') - html_post.append(r'

left flanking sequence - %s

' % left_seq) - html_post.append(r'
%s
' % left_blast_genomic) - html_post.append(r'

right flanking sequence - %s

' % right_seq) - html_post.append(r'
%s
' % right_blast_genomic) - - html_post.append(r'

blast search - nt

') - html_post.append(r'

left flanking sequence - %s

' % left_seq) - html_post.append(r'
%s
' % left_blast_nt) - html_post.append(r'

right flanking sequence - %s

' % right_seq) - html_post.append(r'
%s
' % right_blast_nt) - - html_post.append(r'


reads

' % i) - read_output = fusion["read_output"] - html_post.append(r'
%s
' % '\n'.join(read_output)) - - html_post.append(r'


%d pairs

' % (i, len(pair_coords))) - html_post.append(r'
%s
' % '\n'.join(pair_coords)) - html_post.append(r'


') - - html_post.append(r'') - html_post.append(r'') - - fusion_gene_draw_str = "" - for index in fusion_gene_indices: - isoform_index, cluster_index = index - fusion_gene_draw_str += "fusion_gene_draw%d(%d);" % (cluster_index, isoform_index) - - html_body.append(r'' % fusion_gene_draw_str) - - for line in html_prev + javascript + html_body + html_post: - print >> html_file, line - - html_file.close() - txt_file.close() - - if params.tex_table: - print >> tex_table_file, r"\hline" - print >> tex_table_file, r"\end{tabular}}}" - print >> tex_table_file, r'\end{document}' - tex_table_file.close() - os.system("pdflatex --output-directory=%s %s" % (output_dir, tex_table_file_name)) - - print >> sys.stderr, "[%s] Reporting final fusion candidates in html format" % right_now() - - # Cufflinks-Fusion - fusion_gene_list = [] - read_fusion_genes(fusion_gene_list) - - fusion_list = [] - read_fusion_list(fusion_list) - - cluster_list = [] - cluster_fusion(fusion_list, fusion_gene_list, cluster_list) - - generate_html_impl(fusion_list, cluster_list, fusion_gene_list) - - - -# Format a DateTime as a pretty string. -# FIXME: Currently doesn't support days! -def formatTD(td): - hours = td.seconds // 3600 - minutes = (td.seconds % 3600) // 60 - seconds = td.seconds % 60 - return '%02d:%02d:%02d' % (hours, minutes, seconds) - - -# Generate a new temporary filename in the user's tmp directory -def tmp_name(): - tmp_root = tmp_dir - if os.path.exists(tmp_root): - pass - else: - os.mkdir(tmp_root) - return tmp_root + os.tmpnam().split('/')[-1] - - -def die(msg=None): - if msg is not None: - print >> sys.stderr, msg - sys.exit(1) - - -# rough equivalent of the 'which' command to find external programs -# (current script path is tested first, then PATH envvar) -def which(program): - def is_executable(fpath): - return os.path.exists(fpath) and os.access(fpath, os.X_OK) - fpath, fname = os.path.split(program) - if fpath: - if is_executable(program): - return program - else: - progpath = os.path.join(bin_dir, program) - if is_executable(progpath): - return progpath - for path in os.environ["PATH"].split(os.pathsep): - progpath = os.path.join(path, program) - if is_executable(progpath): - return progpath - return None - - -def prog_path(program): - progpath = which(program) - if progpath == None: - die("Error locating program: "+program) - return progpath - - -def get_version(): - return "2.1.0-cgpPatched" - - -def main(argv=None): - warnings.filterwarnings("ignore", "tmpnam is a potential security risk") - - # Initialize default parameter values - params = TopHatFusionParams() - try: - if argv is None: - argv = sys.argv - args = params.parse_options(argv) - params.check() - - bwt_idx_prefix = args[0] - - print >> sys.stderr, "[%s] Beginning TopHat-Fusion post-processing run (v%s)" % (right_now(), get_version()) - print >> sys.stderr, "-----------------------------------------------" - - start_time = datetime.now() - prepare_output_dir() - sample_updated = check_samples() - - if not params.skip_fusion_kmer: - map_fusion_kmer(bwt_idx_prefix, params, sample_updated) - - if not params.skip_filter_fusion: - filter_fusion(bwt_idx_prefix, params) - - if not params.skip_blast: - do_blast(params) - - if not params.skip_read_dist: - read_dist(params) - - if not params.skip_html: - generate_html(params) - - global run_log - run_log = open(logging_dir + "run.log", "w", 0) - global run_cmd - run_cmd = " ".join(argv) - print >> run_log, run_cmd - - finish_time = datetime.now() - duration = finish_time - start_time - print >> sys.stderr,"-----------------------------------------------" - print >> sys.stderr, "[%s] Run complete [%s elapsed]" % (right_now(), formatTD(duration)) - - except Usage, err: - print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) - print >> sys.stderr, "\tfor detailed help see http://tophat-fusion.sourceforge.net/manual.html" - return 2 - - -if __name__ == "__main__": - sys.exit(main()) +--- tophat-fusion-post 2019-10-17 16:06:48.338589724 +0000 ++++ tophat-fusion-post.new 2019-10-17 16:08:30.131237030 +0000 +@@ -788,7 +788,7 @@ + + same = False + for chr_coord in left_chrs: +- chr, coord = chr_coord.split(':') ++ chr, coord = chr_coord.rsplit(':', 1) + coord = int(coord) + if chr == chr2 and abs(coord - coord2) < max_intron_len: + same = True +@@ -798,7 +798,7 @@ + continue + + for chr_coord in right_chrs: +- chr, coord = chr_coord.split(':') ++ chr, coord = chr_coord.rsplit(':', 1) + coord = int(coord) + if chr == chr1 and abs(coord - coord1) < max_intron_len: + same = True +@@ -1225,7 +1225,7 @@ + if line[0] == '@': + if line[1:3] == 'SQ': + line = line[:-1].split('\t') +- contig = line[1].split(':')[1] ++ contig = line[1].split(':', 1)[1] + if contig not in contigs: + contigs[contig] = len(contigs) + +@@ -1533,7 +1533,7 @@ + else: + chr1 = value_str + +- chr1, chr1_coord = chr1.split(':') ++ chr1, chr1_coord = chr1.rsplit(':', 1) + chr1_left, chr1_right = chr1_coord.split('-') + chr1_left, chr1_right = int(chr1_left), int(chr1_right) + value_dic["chr1"] = chr1 +@@ -1541,7 +1541,7 @@ + value_dic["chr1_right"] = chr1_right + + if chr2: +- chr2, chr2_coord = chr2.split(':') ++ chr2, chr2_coord = chr2.rsplit(':', 1) + chr2_left, chr2_right = chr2_coord.split('-') + chr2_left, chr2_right = int(chr2_left), int(chr2_right) + value_dic["chr2"] = chr2 +@@ -1592,14 +1592,14 @@ + exon_list = curr_transcript[-1] + + if type == "exon": +- chr, chr_coord = fields[1].split(':') ++ chr, chr_coord = fields[1].rsplit(':', 1) + chr_left, chr_right = chr_coord.split('-') + chr_left, chr_right = int(chr_left), int(chr_right) + exon_list.append(["exon", chr, chr_left, chr_right]) + else: + fusion_left, fusion_right = fields[1].split(' ') +- chr1, chr1_pos = fusion_left.split(':') +- chr2, chr2_pos = fusion_right.split(':') ++ chr1, chr1_pos = fusion_left.rsplit(':', 1) ++ chr2, chr2_pos = fusion_right.rsplit(':', 1) + + chr1_pos, chr2_pos = int(chr1_pos), int(chr2_pos) + exon_list.append(["fusion", chr1, chr1_pos, chr2, chr2_pos]) +@@ -2865,7 +2865,7 @@ + + + def get_version(): +- return "2.1.0" ++ return "2.1.0-cgpPatched" + + + def main(argv=None):