-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_variants.py
139 lines (99 loc) · 3.41 KB
/
clean_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/python
"""
#
# Keep only single mutation SNPs, Remove all deletions (*) and SNPs with >1 ALT from a variants table (converted from a vcf using GATK).
#
# Usage: clean_variants.py -i <genotype_table> -o <output_name>
#
# Where:
# genotype_table = table generated by the VariantsToTable tool from GATK (https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_variantutils_VariantsToTable.php)
# output_name = name to save the resulting file
#
# Options: -h for usage help
#
"""
import re, sys, getopt
# Check for the arguments, open the inputs and print useful help messages
try:
opts, args = getopt.getopt(sys.argv[1:],"hi:o:")
except getopt.GetoptError:
print '\n', '#### Invalid use ####', '\n'
print 'Usage: clean_variants.py -i <genotype_table> -o <output_name>'
print 'For help use clean_variants.py -h'
sys.exit(99)
for opt, arg in opts:
if opt == '-h':
print '\n', 'Remove all deletions (*) and SNPs with >1 ALT from a variants table (converted from a vcf using GATK). ', '\n'
print 'Usage: clean_variants.py -i <genotype_table> -o <output_name>', '\n'
print 'Where: genotype_table = table generated by the VariantsToTable tool from GATK (https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_variantutils_VariantsToTable.php)'
print 'output_name = name to save the resulting file'
print 'Options: -h for help'
sys.exit()
elif len(arg) >= 2:
if opt in ("-i"):
raw_variants = open(arg)
if opt in ("-o"):
output = open(arg,"w")
else:
assert False, "unhandled option"
### Create useful variables
h = 0
alt_pos = 0
lst_ind_pos = []
ind_gen = ""
a1 = ""
already = False
duplicated = False
### Cleaning the genotype file
for line in raw_variants:
# Get position of the individuals GT and ALT
if h == 0:
header = line.split("\t")
alt_pos = header.index("ALT")
ref_pos = header.index("REF")
for col in header:
if col.endswith(".GT"):
ind_pos = header.index(col)
lst_ind_pos.append(ind_pos)
output.write(line)
h = 1
# Check if there is a deletion in line
elif "*" not in line:
leave = False
info = line.split("\n")
info = info[0]
info = info.split("\t")
alt = info[alt_pos]
alt_len = alt.split(",")
ref = info[ref_pos]
# Check if all possible alterations are SNPs
if (len(ref) != 1):
continue
for variants in alt_len:
alt_n = variants
if (len(alt_n) != 1) :
leave = True
continue
if leave is True:
continue
# Check if there are more than 2 ALT
elif len(alt) == 1 : # if it is only one ALT and it's a single mutation
output.write(line)
continue
else:
alt = alt.split(",")
duplicated = False
already = False
for pos in lst_ind_pos: # For all individuals in the file (for each line)
ind_gen = info[pos]
for i in alt: # For all possible ALT markers
if (i in ind_gen) and (already is False) :
already = True
a1 = i
elif (i in ind_gen) and (already is True) and ( i != a1) :
duplicated = True # mark if more than two ALT are found
if duplicated is False: # if only one alternative is found
info[alt_pos] = a1
new_line = "\t".join(info)
output.write(new_line)
output.close()