utility code to support multiple alts (by splitting) and simplifying …

…ref/alt representation. see arq5x#193 and arq5x#161.
brentp · Mar 5, 2015 · 58f6aca · 58f6aca
1 parent c63e1ad
commit 58f6aca
Showing 1 changed file with 165 additions and 0 deletions.
diff --git a/gemini/vcfutils.py b/gemini/vcfutils.py
@@ -0,0 +1,165 @@
+"""
+This implements:
+1. allele splitting: ref=A, alt=AT,AAT,ACT becomes 3 separate variants.
+Adjusts PL (or GL) and removes AF and DP fields as needed.
+
+2. simplify variants: pos=123, ref=AAT, alt=AT becomes pos=124, ref=AT, alt=T
+
+"""
+
+from collections import OrderedDict
+from os.path import commonprefix
+import sys
+
+INFO_EXCLUDE = ('AF',)
+GT_EXCLUDE = ('DP',)
+
+def infostr(infodict, exclude=INFO_EXCLUDE):
+    if isinstance(infodict, basestring):
+        return infodict
+    s = ";".join(k + (('=' + v) if v is not None else '') for
+                    k, v in infodict.iteritems() if not k in exclude)
+    return s
+
+def fix_genos(genos, alt_idx, exclude=GT_EXCLUDE):
+    """
+    >>> genos = [{'GT': ['1', '2'], 'PL': '281,5,9,58,0,115,338,46,116,809', 'DP': '81'},
+    ...          {'GT': ['0', '0'], 'PL': '0,30,323,31,365,483,38,291,325,567', 'DP': '86'}]
+
+    >>> fix_genos(genos, 3)
+    [{'GT': ['.', '.'], 'PL': '281,338,809'}, {'GT': ['0', '0'], 'PL': '0,38,567'}]
+
+    >>> fix_genos(genos, 2)
+    [{'GT': ['.', '1'], 'PL': '281,58,115'}, {'GT': ['0', '0'], 'PL': '0,31,483'}]
+
+    >>> fix_genos(genos, 1)
+    [{'GT': ['1', '.'], 'PL': '281,5,9'}, {'GT': ['0', '0'], 'PL': '0,30,323'}]
+    """
+    assert int(alt_idx) > 0
+    # this copy needed since we are modifying in place.
+    genos = [x.copy() for x in genos]
+
+    alt_idx = str(alt_idx)
+    for n, geno in enumerate(genos):
+        # this copy *also* needed since we are modifying in place.
+        geno['GT'] = geno['GT'][:]
+        for ex in exclude:
+            geno.pop(ex, None)
+
+        # pull out only the appropriate likelihoods.
+        # according to VCF spec, these will be in the order:
+        # 0/0, 0/1, 1/1, 0/2, 1/2, 2/2
+        # ordering given by: Order(j/k) = (k*(k+1)/2)+j.
+
+        a, b = 0, int(alt_idx)
+        for key in ('PL', 'GL'):
+            if not key in geno: continue
+
+            vals = geno[key].split(",")
+            # so do this for (a, a), (a, b), (b, b)
+            likelihood = []
+            for j, k in ((a, a), (a, b), (b, b)):
+                order = (k * (k + 1) / 2) + j
+                #print >>sys.stderr, a, b, (j, k), 'order:' + str(order), vals#, 'value:' + str(vals[order])
+                likelihood.append(vals[order])
+            geno[key] = ",".join(likelihood)
+
+        # set the current allele to 1, keep 0 at 0 and set all others to '.'
+        for i, allele in enumerate(geno['GT']):
+            if allele == alt_idx:
+                geno['GT'][i] = '1'
+            elif allele != '0':
+                geno['GT'][i] = '.'
+    return genos
+
+def fmt_genos(genos):
+    for g in genos:
+        g['GT'] = "/".join(g['GT'])
+    vals = [":".join(f.values()) for f in genos]
+    return vals
+
+
+def varsplit(ref, alts, info, frmt, gts):
+    """
+    # from Vt decompose example:
+    >>> n = varsplit('TA', ['TAA', 'TAAA', 'T'], 'AF=0.2,0.3,0.1;KKKK',
+    ...              'GT:DP:PL', ['1/2:81:281,5,9,58,0,115,338,46,116,809',
+    ...              '0/0:86:0,30,323,31,365,483,38,291,325,567'])
+    >>> next(n)
+    ('TA', 'TAA', 'KKKK', 'GT:PL', ['1/.:281,5,9', '0/0:0,30,323'])
+
+    >>> next(n)
+    ('TA', 'TAAA', 'KKKK', 'GT:PL', ['./1:281,58,115', '0/0:0,31,483'])
+
+    >>> next(n)
+    ('TA', 'T', 'KKKK', 'GT:PL', ['./.:281,338,809', '0/0:0,38,567'])
+
+
+    """
+    if len(alts) == 1:
+        yield ref, alts[0], infostr(info, ()), frmt, gts
+        raise StopIteration
+
+    if not isinstance(info, dict):
+        info = OrderedDict((kv[0], (kv[1] if len(kv) > 1 else None)) for kv in (x.split('=') for x in info.split(';')))
+
+    fmts = frmt.split(':')
+    gts = [OrderedDict(zip(fmts, x.split(':'))) for x in gts]
+
+    for i, g in enumerate(gts):
+        gts[i]['GT'] = g['GT'].split('/') if '/' in g['GT'] else g['GT'].split('|')
+
+    for i, alt in enumerate(alts, start=1):
+        fixed_genos = fix_genos([x for x in gts], i)
+        fields = fixed_genos[0].keys() # ordered
+        yield ref, alt, infostr(info), ":".join(fields), fmt_genos(fixed_genos)
+
+def normalize(pos, ref, alt):
+    """simplify a ref/alt a la vt normalize so that ref=CC, alt=CCT becomes
+    ref=C, alt=CT. this helps in annotating variants.
+    >>> normalize(123, 'T', 'C')
+    (123, 'T', 'C')
+
+    >>> normalize(123, 'CC', 'CCT')
+    (124, 'C', 'CT')
+
+    >>> normalize(123, 'TCCCCT', 'CCCCT')
+    (123, 'TC', 'C')
+
+    >>> normalize(123, 'TCCCCTA', 'CCCCT')
+    (123, 'TCCCCTA', 'CCCCT')
+
+    >>> normalize(123, 'TCCCCTA', 'CCCCTA')
+    (123, 'TC', 'C')
+
+    >>> normalize(123, 'AAATCCCCTA', 'AAACCCCTA')
+    (125, 'AT', 'A')
+
+    >>> normalize(123, 'CAAATCCCCTAG', 'AAACCCCTA')
+    (123, 'CAAATCCCCTAG', 'AAACCCCTA')
+    """
+    if len(ref) == len(alt) == 1:
+        return pos, ref, alt
+
+    # logic for trimming from either end is the same so we just reverse the
+    # string to trim the suffix (i == 0) and correct position when doing prefix
+    # (i == 1).
+    for i in range(2):
+        if i == 0: # suffix so flip
+            ref, alt = ref[::-1], alt[::-1]
+
+
+        n, min_len = 0, min(len(ref), len(alt))
+        while n + 1 < min_len and alt[n] == ref[n]:
+            n += 1
+        alt, ref = alt[n:], ref[n:]
+        if i == 0: # flip back
+            ref, alt = ref[::-1], alt[::-1]
+        else: # add to position since we stripped the prefix
+            pos += n
+
+    return pos, ref, alt
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()