-
Notifications
You must be signed in to change notification settings - Fork 35
/
setup_nextgen_tuple_counts.py
117 lines (99 loc) · 4.78 KB
/
setup_nextgen_tuple_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from basic import *
from all_genes import all_genes
with Parser(locals()) as p:
p.int('maxlines').default(10000000)
p.flag('show_vj_counts')
dbfile = pipeline_params['db_file']
if 'alphabeta' in dbfile:
all_logfiles = {
'mouse':
{ 'A':
[ '/home/pbradley/csdat/tcr/paul_thomas/raw_data/sra/SRX1074287_SRX1074288_TCR_LA_MC_PCR/tmp.read_sra_matches.alpha.log',
'/home/pbradley/csdat/tcr/paul_thomas/scjobs_tcrseq_tmp3.read_sra_matches.log' ],
'B':
[ '/home/pbradley/csjobs/tcrseq/tmp1.read_sra_matches.log',
'/home/pbradley/csdat/tcr/paul_thomas/raw_data/sra/SRX1074287_SRX1074288_TCR_LA_MC_PCR/tmp.read_sra_matches.beta.log',
'/home/pbradley/csdat/tcr/paul_thomas/raw_data/sra/friedman_SRP015131_TCR_beta_repertoire/tmp.read_sra_matches.log' ],
},
'human':
{ 'B':
[ '/home/pbradley/scjobs/tcrseq/tmp7.read_sra_matches.log.subjectX_gdna_data_TCRB',
'/home/pbradley/scjobs/tcrseq/tmp7.read_sra_matches.log.subjectY_gdna_data_TCRB',
'/home/pbradley/scjobs/tcrseq/tmp7.read_sra_matches.log.experiment5_gdna_data_BreastPBMC2_TCRB',
'/home/pbradley/scjobs/tcrseq/tmp7.read_sra_matches.log.experiment5_gdna_data_KidneyPBMC3_TCRB' ],
'A':
[ '/home/pbradley/scjobs/tcrseq/tmp8.read_sra_matches.log.subjectX_gdna_data_TCRA',
'/home/pbradley/scjobs/tcrseq/tmp8.read_sra_matches.log.subjectY_gdna_data_TCRA',
'/home/pbradley/scjobs/tcrseq/tmp8.read_sra_matches.log.experiment5_gdna_data_BreastPBMC2_TCRA',
'/home/pbradley/scjobs/tcrseq/tmp8.read_sra_matches.log.experiment5_gdna_data_KidneyPBMC3_TCRA', ],
}
}
elif 'gammadelta' in dbfile:
all_logfiles = {
'human':
{ 'B':
[ '/home/pbradley/scjobs/tcrseq/tmp57.read_sra_matches.log' ],
'A':
[ '/home/pbradley/scjobs/tcrseq/tmp59.read_sra_matches.log' ]
}
}
else: # nothing else yet
exit(1)
for organism in all_logfiles:
outfile = '{}/redo_nextgen_tuple_counts_v2_{}_max10M.log'.format( path_to_current_db_files(), organism )
out = open(outfile,'w')
for chain in all_logfiles[organism]:
print 'making', outfile
for logfile in all_logfiles[organism][chain]:
assert exists(logfile)
counter=0
v_counts = {}
j_counts = {}
vj_counts = {}
srcfile=''
seen = set()
for line in open(logfile,'r'):
l = line.split()
nucseq = l[-2]
if nucseq in seen: continue
seen.add( nucseq )
srcfile = l[-1]
counter+=1
if counter>maxlines:break
if counter%100000==0:
Log(`(srcfile,counter,organism,chain,len(v_counts.keys()),len(j_counts.keys()),logfile)`)
try:
# v_genes = [ x for x in l[-4].split(',')
# if all_genes[organism][x].chain == chain and all_genes[organism][x].region == 'V' ]
# j_genes = [ x for x in l[-3].split(',')
# if all_genes[organism][x].chain == chain and all_genes[organism][x].region == 'J' ]
#j_genes = l[-3].split(',')
v_genes = tuple( sorted( set( all_genes[organism][x].count_rep for x in l[-4].split(',') ) ) )
j_genes = tuple( sorted( set( all_genes[organism][x].count_rep for x in l[-3].split(',') ) ) )
except:
print 'bad line:',l[-4:],line
continue
v_counts[v_genes] = v_counts.get(v_genes,0)+1
j_counts[j_genes] = j_counts.get(j_genes,0)+1
vj_genes = ( v_genes, j_genes )
vj_counts[ vj_genes ] = vj_counts.get( vj_genes,0 )+1
l = [ (y,x) for x,y in v_counts.iteritems() ]
l.sort()
l.reverse()
for count,genes in l:
out.write( 'TUPLE_COUNT {} {} {} {}\n'.format( count, ','.join( genes ), organism, logfile ) )
l = [ (y,x) for x,y in j_counts.iteritems() ]
l.sort()
l.reverse()
for count,genes in l:
out.write( 'TUPLE_COUNT {} {} {} {}\n'.format( count, ','.join( genes ), organism, logfile ) )
if show_vj_counts:
l = [ (y,x) for x,y in vj_counts.iteritems() ]
l.sort()
l.reverse()
for count,(v_genes,j_genes) in l:
out.write( 'TUPLE_COUNT {} {} {} {} {}\n'\
.format( count, ','.join( v_genes ), ','.join( j_genes ), organism,
logfile ) )
out.close()
#exit()