# py_mongo_VCF.py
from pymongo import MongoClient
from pymongo import errors
import glob
import sys
import math
#import cPickle

ref_genome = "GRCm38.p5"  # name of the reference genome
program = "Platypus"      # name of the program that produced the VCF files

# Connect to MongoDB: assumes the server is already running (try port 27020 otherwise)
client = MongoClient('localhost', 27018)  # change this for running through slurm
db = client.mytestdb
vcf_collection = db.test_vcf_collection

# Read variant records from every VCF file in this directory
v = "variants"  # key under which the variant records are stored in each document
vcf_drc = "/home/clark96/MongoDB/test_VCF/*"
#vcf_drc = "/home/clark96/MongoDB/test_VCF/SRR3746168.sangerGeneSnps.vcf"
numPrints = 0               # progress dots printed on the current line
large_files = []            # files whose documents were still too large to insert
recordThreshold = 475000.0  # maximum number of variant records per MongoDB document

for f in glob.glob(vcf_drc):
    # print one progress dot per file, wrapping the line every 20 files
    if numPrints < 20:
        sys.stdout.write(".")
        numPrints = numPrints + 1
    else:
        sys.stdout.write(".\n")
        numPrints = 0
    sys.stdout.flush()

    # collect the PASS records from this VCF file
    expData = []
    recordCount = 0
    with open(f, "r") as fopen:
        for line in fopen:
            if line[0] == "#":
                continue  # skip header lines
            parsed = line.rstrip().split("\t")
            if parsed[6] == "PASS":
                recordCount = recordCount + 1
                # keep CHROM, POS, ID, REF, ALT, QUAL plus the genotype (GT) of the first sample
                data = [parsed[0], parsed[1], parsed[2], parsed[3], parsed[4], parsed[5]]
                alleleIndices = parsed[9].split(":")[0]
                data.append(alleleIndices)
                expData.append(data)

    # run accession is the file name up to the first "."
    parts = f.split("/")
    run_id = parts[len(parts) - 1].split(".")[0]

    # split the variants across enough documents to stay under MongoDB's document size limit
    numFiles_needed = int(math.ceil(recordCount / recordThreshold))
    if numFiles_needed == 0:
        # no PASS records: insert a single document with an empty variant list
        f_data = {}
        f_data["run_id"] = run_id
        f_data["referenceGenome"] = ref_genome
        f_data["program"] = program
        f_data["fileNum"] = 1
        f_data["totalFiles"] = numFiles_needed
        f_data[v] = []
        vcf_collection.insert_one(f_data)
        continue

    currFile = 1
    startIndex = 0
    endIndex = int(recordThreshold)
    # loop through and insert the necessary number of documents for this run
    while currFile <= numFiles_needed:
        file_data = {}
        file_data["run_id"] = run_id
        file_data["referenceGenome"] = ref_genome
        file_data["program"] = program
        file_data["fileNum"] = currFile
        file_data["totalFiles"] = numFiles_needed
        # add this chunk of the run's variant records to the document
        file_data[v] = expData[startIndex:endIndex]
        startIndex = endIndex
        endIndex = endIndex + int(recordThreshold)
        currFile = currFile + 1
        # add the document to mongo
        try:
            #print("inserting document")
            vcf_collection.insert_one(file_data)
        except errors.DocumentTooLarge:
            large_files.append(f)
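

# Note: the loop above accumulates file names in large_files when a chunk still
# exceeds MongoDB's document size limit but never reports them; this summary is
# an addition, not part of the original script.
if large_files:
    sys.stdout.write("\nfiles with documents too large to insert:\n")
    for name in large_files:
        sys.stdout.write(name + "\n")


# Example (a sketch, not part of the original script): reassembling the variant
# chunks for one run. The collection and field names match those used above;
# the run accession in the commented usage line is only an illustrative
# placeholder taken from the commented-out path near the top of the script.
def load_variants(collection, run_id):
    """Return the concatenated variants list for run_id, in fileNum order."""
    variants = []
    for doc in collection.find({"run_id": run_id}).sort("fileNum", 1):
        variants.extend(doc["variants"])
    return variants

#print(len(load_variants(vcf_collection, "SRR3746168")))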