forked from WEHI-ResearchComputing/wehi-gdc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_pairs.py
95 lines (81 loc) · 2.58 KB
/
count_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import sys
import re
import os
import pickle
import glob
#-----------------------------------------------------------------------------
'''
A simple container for files associated with an individual patient
'''
class CaseFileSet:
    """Parallel lists describing the files recorded for a single case.

    Entry ``i`` of ``file_ids``, ``file_names``, ``md5s``, ``sizes`` and
    ``submitter_ids`` all describe the same file, because ``add`` appends
    to every list exactly once per call.
    """

    def __init__(self, output_dir, case_id):
        """Create an empty set for ``case_id``.

        ``output_dir`` is the directory that every file name passed to
        ``add`` is joined onto.
        """
        self.output_dir = output_dir
        self.case_id = case_id
        # One list per metadata field; all start empty and grow in lockstep.
        self.file_ids = []
        self.file_names = []
        self.md5s = []
        self.sizes = []
        self.submitter_ids = []

    def add(self, file_id, file_name, md5, size, submitter_id):
        """Record one file; ``file_name`` is stored joined onto ``output_dir``."""
        self.md5s.append(md5)
        self.file_ids.append(file_id)
        full_path = os.path.join(self.output_dir, file_name)
        self.file_names.append(full_path)
        self.submitter_ids.append(submitter_id)
        self.sizes.append(size)
#-----------------------------------------------------------------------------
# Two-character codes captured by REGEX group 1 that mark a file as tumour
# or normal. They appear to be TCGA barcode sample-type codes.
tumour_flags = {'01', '02', '03', '04', '05', '06', '07', '08', '09', '50', '60', '61'}
# NOTE(review): the GDC sample-type table lists code 40 as a recurrent
# blood-derived cancer sample -- confirm it is meant to count as a normal.
normal_flags = {'10', '11', '12', '13', '14', '40'}

# Raw string: '\-' is not a valid escape in an ordinary string literal and
# triggers a warning on current Python versions.
#   group(1): 4th dash-separated field after "TCGA", minus its final letter
#   group(2): final letter of the 5th field
REGEX = re.compile(r'.*TCGA\-[^-]+\-[^-]+\-([^-]+)[A-Z]\-[^-]+([A-Z])\-.*')


def make_pairs(files, tags, case_id):
    """Pair every tumour file of a case with every normal file of that case.

    Args:
        files: file paths; only the basename of each is kept.
        tags: identifiers parallel to ``files`` (one per file) that are
            matched against ``REGEX`` to classify the file.
        case_id: value copied unchanged into every returned tuple.

    Returns:
        A list of ``(tumour_file, normal_file, tumour_tag, normal_tag,
        case_id)`` tuples. Normals form the outer loop, so all tumours are
        listed for the first normal before the second normal appears.
        Entries whose tag does not match ``REGEX``, whose second captured
        group is ``'W'``, or whose code is in neither flag set are ignored.
    """
    tumour_files = []
    normal_files = []
    for path, tag in zip(files, tags):
        m = REGEX.match(tag)
        # Skip unparseable tags and those flagged 'W' (presumably
        # whole-genome-amplified analytes -- confirm against barcode docs).
        if not m or m.group(2) == 'W':
            continue
        entry = (os.path.basename(path), tag)
        code = m.group(1)
        if code in tumour_flags:
            tumour_files.append(entry)
        elif code in normal_flags:
            normal_files.append(entry)

    return [
        (tf, nf, tt, nt, case_id)
        for nf, nt in normal_files
        for tf, tt in tumour_files
    ]
# Report, for one cancer type, how many expected tumour/normal pair
# directories contain a '*seqz.gz' file, and list directories on disk that
# do not correspond to any expected pair.
#
# Command line: count_pairs.py <cancer>
if len(sys.argv) < 2:
    sys.exit('usage: count_pairs.py <cancer>')
cancer = sys.argv[1]

# Root under which one '<tumour tag>--<normal tag>' directory per pair is
# expected.
project_dir = f'/stornext/HPCScratch/PapenfussLab/projects/tcga-data/{cancer}'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load query files written by a trusted run of this pipeline.
# The loaded objects are read via .file_names / .submitter_ids / .case_id
# (presumably CaseFileSet instances -- confirm against the pickle writer).
with open(f'{cancer}-query.pkl', 'rb') as f:
    case_files = pickle.load(f)

# Every tumour/normal combination across all cases.
pairs = []
for cf in case_files:
    pairs += make_pairs(cf.file_names, cf.submitter_ids, cf.case_id)
pair_cnt = len(pairs)

# Directories actually present, excluding the two log directories.
ignored_dirs = {f'{project_dir}/logs', f'{project_dir}/old-logs'}
actual_dirs = [
    d
    for d in glob.glob(f'{project_dir}/*')
    if d not in ignored_dirs and os.path.isdir(d)
]

expected_dirs = set()
completed = 0
for _, _, tt, nt, cid in pairs:
    dn = f'{project_dir}/{tt}--{nt}'
    expected_dirs.add(dn)
    # A pair counts as completed once its directory holds a '*seqz.gz' file;
    # otherwise the owning case id is printed.
    if os.path.exists(dn) and glob.glob(os.path.join(dn, '*seqz.gz')):
        completed += 1
    else:
        print(f'{cid}')

print(f'expected: {pair_cnt} completed: {completed}')
print('==>> Unexpected dirs:')
for d in actual_dirs:
    if d not in expected_dirs:
        print(d)