forked from arq5x/gemini
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
utility code to support multiple alts (by splitting) and simplifying …
- Loading branch information
Showing
1 changed file
with
165 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
""" | ||
This implements: | ||
1. allele splitting: ref=A, alt=AT,AAT,ACT becomes 3 separate variants. | ||
Adjusts PL (or GL) and removes AF and DP fields as needed. | ||
2. simplify variants: pos=123, ref=AAT, alt=AT becomes pos=124, ref=AT, alt=T | ||
""" | ||
|
||
from collections import OrderedDict | ||
from os.path import commonprefix | ||
import sys | ||
|
||
INFO_EXCLUDE = ('AF',) | ||
GT_EXCLUDE = ('DP',) | ||
|
||
def infostr(infodict, exclude=INFO_EXCLUDE): | ||
if isinstance(infodict, basestring): | ||
return infodict | ||
s = ";".join(k + (('=' + v) if v is not None else '') for | ||
k, v in infodict.iteritems() if not k in exclude) | ||
return s | ||
|
||
def fix_genos(genos, alt_idx, exclude=GT_EXCLUDE): | ||
""" | ||
>>> genos = [{'GT': ['1', '2'], 'PL': '281,5,9,58,0,115,338,46,116,809', 'DP': '81'}, | ||
... {'GT': ['0', '0'], 'PL': '0,30,323,31,365,483,38,291,325,567', 'DP': '86'}] | ||
>>> fix_genos(genos, 3) | ||
[{'GT': ['.', '.'], 'PL': '281,338,809'}, {'GT': ['0', '0'], 'PL': '0,38,567'}] | ||
>>> fix_genos(genos, 2) | ||
[{'GT': ['.', '1'], 'PL': '281,58,115'}, {'GT': ['0', '0'], 'PL': '0,31,483'}] | ||
>>> fix_genos(genos, 1) | ||
[{'GT': ['1', '.'], 'PL': '281,5,9'}, {'GT': ['0', '0'], 'PL': '0,30,323'}] | ||
""" | ||
assert int(alt_idx) > 0 | ||
# this copy needed since we are modifying in place. | ||
genos = [x.copy() for x in genos] | ||
|
||
alt_idx = str(alt_idx) | ||
for n, geno in enumerate(genos): | ||
# this copy *also* needed since we are modifying in place. | ||
geno['GT'] = geno['GT'][:] | ||
for ex in exclude: | ||
geno.pop(ex, None) | ||
|
||
# pull out only the appropriate likelihoods. | ||
# according to VCF spec, these will be in the order: | ||
# 0/0, 0/1, 1/1, 0/2, 1/2, 2/2 | ||
# ordering given by: Order(j/k) = (k*(k+1)/2)+j. | ||
|
||
a, b = 0, int(alt_idx) | ||
for key in ('PL', 'GL'): | ||
if not key in geno: continue | ||
|
||
vals = geno[key].split(",") | ||
# so do this for (a, a), (a, b), (b, b) | ||
likelihood = [] | ||
for j, k in ((a, a), (a, b), (b, b)): | ||
order = (k * (k + 1) / 2) + j | ||
#print >>sys.stderr, a, b, (j, k), 'order:' + str(order), vals#, 'value:' + str(vals[order]) | ||
likelihood.append(vals[order]) | ||
geno[key] = ",".join(likelihood) | ||
|
||
# set the current allele to 1, keep 0 at 0 and set all others to '.' | ||
for i, allele in enumerate(geno['GT']): | ||
if allele == alt_idx: | ||
geno['GT'][i] = '1' | ||
elif allele != '0': | ||
geno['GT'][i] = '.' | ||
return genos | ||
|
||
def fmt_genos(genos): | ||
for g in genos: | ||
g['GT'] = "/".join(g['GT']) | ||
vals = [":".join(f.values()) for f in genos] | ||
return vals | ||
|
||
|
||
def varsplit(ref, alts, info, frmt, gts): | ||
""" | ||
# from Vt decompose example: | ||
>>> n = varsplit('TA', ['TAA', 'TAAA', 'T'], 'AF=0.2,0.3,0.1;KKKK', | ||
... 'GT:DP:PL', ['1/2:81:281,5,9,58,0,115,338,46,116,809', | ||
... '0/0:86:0,30,323,31,365,483,38,291,325,567']) | ||
>>> next(n) | ||
('TA', 'TAA', 'KKKK', 'GT:PL', ['1/.:281,5,9', '0/0:0,30,323']) | ||
>>> next(n) | ||
('TA', 'TAAA', 'KKKK', 'GT:PL', ['./1:281,58,115', '0/0:0,31,483']) | ||
>>> next(n) | ||
('TA', 'T', 'KKKK', 'GT:PL', ['./.:281,338,809', '0/0:0,38,567']) | ||
""" | ||
if len(alts) == 1: | ||
yield ref, alts[0], infostr(info, ()), frmt, gts | ||
raise StopIteration | ||
|
||
if not isinstance(info, dict): | ||
info = OrderedDict((kv[0], (kv[1] if len(kv) > 1 else None)) for kv in (x.split('=') for x in info.split(';'))) | ||
|
||
fmts = frmt.split(':') | ||
gts = [OrderedDict(zip(fmts, x.split(':'))) for x in gts] | ||
|
||
for i, g in enumerate(gts): | ||
gts[i]['GT'] = g['GT'].split('/') if '/' in g['GT'] else g['GT'].split('|') | ||
|
||
for i, alt in enumerate(alts, start=1): | ||
fixed_genos = fix_genos([x for x in gts], i) | ||
fields = fixed_genos[0].keys() # ordered | ||
yield ref, alt, infostr(info), ":".join(fields), fmt_genos(fixed_genos) | ||
|
||
def normalize(pos, ref, alt): | ||
"""simplify a ref/alt a la vt normalize so that ref=CC, alt=CCT becomes | ||
ref=C, alt=CT. this helps in annotating variants. | ||
>>> normalize(123, 'T', 'C') | ||
(123, 'T', 'C') | ||
>>> normalize(123, 'CC', 'CCT') | ||
(124, 'C', 'CT') | ||
>>> normalize(123, 'TCCCCT', 'CCCCT') | ||
(123, 'TC', 'C') | ||
>>> normalize(123, 'TCCCCTA', 'CCCCT') | ||
(123, 'TCCCCTA', 'CCCCT') | ||
>>> normalize(123, 'TCCCCTA', 'CCCCTA') | ||
(123, 'TC', 'C') | ||
>>> normalize(123, 'AAATCCCCTA', 'AAACCCCTA') | ||
(125, 'AT', 'A') | ||
>>> normalize(123, 'CAAATCCCCTAG', 'AAACCCCTA') | ||
(123, 'CAAATCCCCTAG', 'AAACCCCTA') | ||
""" | ||
if len(ref) == len(alt) == 1: | ||
return pos, ref, alt | ||
|
||
# logic for trimming from either end is the same so we just reverse the | ||
# string to trim the suffix (i == 0) and correct position when doing prefix | ||
# (i == 1). | ||
for i in range(2): | ||
if i == 0: # suffix so flip | ||
ref, alt = ref[::-1], alt[::-1] | ||
|
||
|
||
n, min_len = 0, min(len(ref), len(alt)) | ||
while n + 1 < min_len and alt[n] == ref[n]: | ||
n += 1 | ||
alt, ref = alt[n:], ref[n:] | ||
if i == 0: # flip back | ||
ref, alt = ref[::-1], alt[::-1] | ||
else: # add to position since we stripped the prefix | ||
pos += n | ||
|
||
return pos, ref, alt | ||
|
||
if __name__ == "__main__": | ||
import doctest | ||
doctest.testmod() |