Skip to content

Commit

Permalink
utility code to support multiple alts (by splitting) and simplifying …
Browse files Browse the repository at this point in the history
…ref/alt representation. see arq5x#193 and arq5x#161.
  • Loading branch information
brentp committed Mar 5, 2015
1 parent c63e1ad commit 58f6aca
Showing 1 changed file with 165 additions and 0 deletions.
165 changes: 165 additions & 0 deletions gemini/vcfutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""
This implements:
1. allele splitting: ref=A, alt=AT,AAT,ACT becomes 3 separate variants.
Adjusts PL (or GL) and removes AF and DP fields as needed.
2. simplify variants: pos=123, ref=AAT, alt=AT becomes pos=124, ref=AT, alt=T
"""

from collections import OrderedDict
from os.path import commonprefix
import sys

INFO_EXCLUDE = ('AF',)
GT_EXCLUDE = ('DP',)

def infostr(infodict, exclude=INFO_EXCLUDE):
if isinstance(infodict, basestring):
return infodict
s = ";".join(k + (('=' + v) if v is not None else '') for
k, v in infodict.iteritems() if not k in exclude)
return s

def fix_genos(genos, alt_idx, exclude=GT_EXCLUDE):
"""
>>> genos = [{'GT': ['1', '2'], 'PL': '281,5,9,58,0,115,338,46,116,809', 'DP': '81'},
... {'GT': ['0', '0'], 'PL': '0,30,323,31,365,483,38,291,325,567', 'DP': '86'}]
>>> fix_genos(genos, 3)
[{'GT': ['.', '.'], 'PL': '281,338,809'}, {'GT': ['0', '0'], 'PL': '0,38,567'}]
>>> fix_genos(genos, 2)
[{'GT': ['.', '1'], 'PL': '281,58,115'}, {'GT': ['0', '0'], 'PL': '0,31,483'}]
>>> fix_genos(genos, 1)
[{'GT': ['1', '.'], 'PL': '281,5,9'}, {'GT': ['0', '0'], 'PL': '0,30,323'}]
"""
assert int(alt_idx) > 0
# this copy needed since we are modifying in place.
genos = [x.copy() for x in genos]

alt_idx = str(alt_idx)
for n, geno in enumerate(genos):
# this copy *also* needed since we are modifying in place.
geno['GT'] = geno['GT'][:]
for ex in exclude:
geno.pop(ex, None)

# pull out only the appropriate likelihoods.
# according to VCF spec, these will be in the order:
# 0/0, 0/1, 1/1, 0/2, 1/2, 2/2
# ordering given by: Order(j/k) = (k*(k+1)/2)+j.

a, b = 0, int(alt_idx)
for key in ('PL', 'GL'):
if not key in geno: continue

vals = geno[key].split(",")
# so do this for (a, a), (a, b), (b, b)
likelihood = []
for j, k in ((a, a), (a, b), (b, b)):
order = (k * (k + 1) / 2) + j
#print >>sys.stderr, a, b, (j, k), 'order:' + str(order), vals#, 'value:' + str(vals[order])
likelihood.append(vals[order])
geno[key] = ",".join(likelihood)

# set the current allele to 1, keep 0 at 0 and set all others to '.'
for i, allele in enumerate(geno['GT']):
if allele == alt_idx:
geno['GT'][i] = '1'
elif allele != '0':
geno['GT'][i] = '.'
return genos

def fmt_genos(genos):
for g in genos:
g['GT'] = "/".join(g['GT'])
vals = [":".join(f.values()) for f in genos]
return vals


def varsplit(ref, alts, info, frmt, gts):
"""
# from Vt decompose example:
>>> n = varsplit('TA', ['TAA', 'TAAA', 'T'], 'AF=0.2,0.3,0.1;KKKK',
... 'GT:DP:PL', ['1/2:81:281,5,9,58,0,115,338,46,116,809',
... '0/0:86:0,30,323,31,365,483,38,291,325,567'])
>>> next(n)
('TA', 'TAA', 'KKKK', 'GT:PL', ['1/.:281,5,9', '0/0:0,30,323'])
>>> next(n)
('TA', 'TAAA', 'KKKK', 'GT:PL', ['./1:281,58,115', '0/0:0,31,483'])
>>> next(n)
('TA', 'T', 'KKKK', 'GT:PL', ['./.:281,338,809', '0/0:0,38,567'])
"""
if len(alts) == 1:
yield ref, alts[0], infostr(info, ()), frmt, gts
raise StopIteration

if not isinstance(info, dict):
info = OrderedDict((kv[0], (kv[1] if len(kv) > 1 else None)) for kv in (x.split('=') for x in info.split(';')))

fmts = frmt.split(':')
gts = [OrderedDict(zip(fmts, x.split(':'))) for x in gts]

for i, g in enumerate(gts):
gts[i]['GT'] = g['GT'].split('/') if '/' in g['GT'] else g['GT'].split('|')

for i, alt in enumerate(alts, start=1):
fixed_genos = fix_genos([x for x in gts], i)
fields = fixed_genos[0].keys() # ordered
yield ref, alt, infostr(info), ":".join(fields), fmt_genos(fixed_genos)

def normalize(pos, ref, alt):
"""simplify a ref/alt a la vt normalize so that ref=CC, alt=CCT becomes
ref=C, alt=CT. this helps in annotating variants.
>>> normalize(123, 'T', 'C')
(123, 'T', 'C')
>>> normalize(123, 'CC', 'CCT')
(124, 'C', 'CT')
>>> normalize(123, 'TCCCCT', 'CCCCT')
(123, 'TC', 'C')
>>> normalize(123, 'TCCCCTA', 'CCCCT')
(123, 'TCCCCTA', 'CCCCT')
>>> normalize(123, 'TCCCCTA', 'CCCCTA')
(123, 'TC', 'C')
>>> normalize(123, 'AAATCCCCTA', 'AAACCCCTA')
(125, 'AT', 'A')
>>> normalize(123, 'CAAATCCCCTAG', 'AAACCCCTA')
(123, 'CAAATCCCCTAG', 'AAACCCCTA')
"""
if len(ref) == len(alt) == 1:
return pos, ref, alt

# logic for trimming from either end is the same so we just reverse the
# string to trim the suffix (i == 0) and correct position when doing prefix
# (i == 1).
for i in range(2):
if i == 0: # suffix so flip
ref, alt = ref[::-1], alt[::-1]


n, min_len = 0, min(len(ref), len(alt))
while n + 1 < min_len and alt[n] == ref[n]:
n += 1
alt, ref = alt[n:], ref[n:]
if i == 0: # flip back
ref, alt = ref[::-1], alt[::-1]
else: # add to position since we stripped the prefix
pos += n

return pos, ref, alt

if __name__ == "__main__":
import doctest
doctest.testmod()

0 comments on commit 58f6aca

Please sign in to comment.