-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_md5.py
40 lines (29 loc) · 913 Bytes
/
check_md5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import glob
from helpers import md5sum
import multiprocessing as mp
import sys
# First command-line argument; it is substituted as the sub-directory name
# under the gdc_download root below.
cancer = sys.argv[1]
# Glob pattern matching every ``.bam`` file directly inside that sub-directory.
bam_pattern = f'/stornext/HPCScratch/PapenfussLab/projects/gdc_download/{cancer}/*.bam'
bam_files = glob.glob(bam_pattern)
class FileChecker:
    """Callable that verifies one file against its ``.md5`` sidecar file.

    The sidecar path is the checked file's path with its extension replaced
    by ``.md5``.  When the sidecar is missing, or its checksum differs from
    the one computed by ``md5sum``, a ``#`` comment line explaining the
    problem and an ``rm <file>`` line are printed to stdout.  Nothing is
    deleted by this class itself.
    """

    def __init__(self, fn):
        """Store the path of the file to verify.

        Args:
            fn: Path of the file whose checksum should be checked.
        """
        self.fn = fn

    def __call__(self, *args, **kwargs):
        """Run the check; print a comment plus ``rm`` line on failure.

        Positional and keyword arguments are accepted and ignored.

        Returns:
            None in every case; the outcome is reported on stdout only.
        """
        sum_file = os.path.splitext(self.fn)[0] + '.md5'
        if not os.path.exists(sum_file):
            self._report(f'# no sum file: {sum_file}')
            return

        with open(sum_file, 'r') as f:
            tokens = f.read().split()
        # Only the first whitespace-delimited token is the digest: this accepts
        # both a bare hash and the ``<hash>  <filename>`` layout written by
        # md5sum-style tools.  An empty sum file yields '' and so never matches.
        md5_in_sum_file = tokens[0] if tokens else ''

        # NOTE(review): md5sum comes from the project's helpers module;
        # presumably it returns the hex digest as a str -- confirm there.
        md5_in_bam_file = md5sum(self.fn)
        if md5_in_bam_file != md5_in_sum_file:
            self._report(
                f'# bam md5: {md5_in_bam_file} sum file: {md5_in_sum_file}'
            )

    def _report(self, reason):
        """Print the ``reason`` comment line followed by the ``rm`` line."""
        # One flushed print for both lines: the output is not left sitting in
        # a process-local buffer, and the comment stays next to its rm line.
        print(f'{reason}\nrm {self.fn}', flush=True)
# Guarding the driver keeps worker processes from re-running it when the main
# module is re-imported (as happens with the "spawn" start method).
if __name__ == '__main__':
    pool = mp.Pool(20)
    try:
        # Queue one check per BAM file; each FileChecker runs in a worker.
        pending = [
            pool.apply_async(FileChecker(bam_file)) for bam_file in bam_files
        ]
        # get() blocks until the task is done and re-raises any exception
        # raised inside the worker, so failures are not silently dropped.
        for result in pending:
            result.get()
    except BaseException:
        # On any error (including Ctrl-C) stop outstanding work immediately.
        pool.terminate()
        raise
    else:
        # Normal completion: let the workers finish and exit on their own
        # rather than being killed by the pool's exit-time finalizer.
        pool.close()
    finally:
        pool.join()