From f22fe53af5d3aba0420d1a50297b3bc5cb7808dc Mon Sep 17 00:00:00 2001
From: Hongxin <jackson.zhang.828@gmail.com>
Date: Tue, 5 Apr 2022 17:10:06 -0400
Subject: [PATCH] Support structural variant annotator (#161)

---
 AnnotatorCore.py              | 137 +++++++++++++++++++++++++++-------
 FusionAnnotator.py            |   2 +-
 README.md                     |  11 ++-
 StructuralVariantAnnotator.py |  64 ++++++++++++++++
 data/example_sv.txt           |   8 ++
 example.sh                    |   6 +-
 test_Annotation.py            |   2 +-
 7 files changed, 199 insertions(+), 31 deletions(-)
 create mode 100644 StructuralVariantAnnotator.py
 create mode 100644 data/example_sv.txt

diff --git a/AnnotatorCore.py b/AnnotatorCore.py
index 8e66d80..411b14e 100644
--- a/AnnotatorCore.py
+++ b/AnnotatorCore.py
@@ -145,6 +145,13 @@ def setsampleidsfileterfile(f):
 GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
 GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]
 
+# columns for structural variant annotation
+SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
+SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
+SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE']
+SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN']
+
+UNKNOWN = 'UNKNOWN'
 
 class QueryType(Enum):
     HGVSP_SHORT = 'HGVSP_SHORT'
@@ -649,19 +656,19 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol
 def getgenesfromfusion(fusion, nameregex=None):
     GENES_REGEX = "([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex
     searchresult = re.search(GENES_REGEX, fusion, flags=re.IGNORECASE)
-    gene1=None
-    gene2=None
+    geneA=None
+    geneB=None
     if searchresult:
         parts = searchresult.group(1).split("-")
-        gene1 = parts[0]
-        gene2 = gene1
+        geneA = parts[0]
+        geneB = geneA
         if len(parts) > 1 and parts[1] != "intragenic":
-            gene2 = parts[1]
+            geneB = parts[1]
     else:
-        gene1=gene2=fusion
-    return gene1, gene2
+        geneA=geneB=fusion
+    return geneA, geneB
 
-def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
+def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
     if os.path.isfile(previousoutfile):
         cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
     outf = open(outfile, 'w+')
@@ -683,8 +690,8 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
 
         newcols = ncols + len(oncokb_annotation_headers)
 
-        igene1 = geIndexOfHeader(headers, ['GENE1'])
-        igene2 = geIndexOfHeader(headers, ['GENE2'])
+        igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)
         ifusion = geIndexOfHeader(headers, FUSION_HEADERS)
         isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
         icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
@@ -704,20 +711,92 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
             if sampleidsfilter and sample not in sampleidsfilter:
                 continue
 
-            gene1 = None
-            gene2 = None
-            if igene1 >= 0:
-                gene1 = row[igene1]
-            if igene2 >= 0:
-                gene2 = row[igene2]
-            if igene1 < 0 and igene2 < 0 and ifusion >= 0:
+            geneA = None
+            geneB = None
+            if igeneA >= 0:
+                geneA = row[igeneA]
+            if igeneB >= 0:
+                geneB = row[igeneB]
+            if igeneA < 0 and igeneB < 0 and ifusion >= 0:
                 fusion = row[ifusion]
-                gene1, gene2 = getgenesfromfusion(fusion, nameregex)
+                geneA, geneB = getgenesfromfusion(fusion, nameregex)
 
             cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
 
 
-            queries.append(StructuralVariantQuery(gene1, gene2, 'FUSION', cancertype))
+            queries.append(StructuralVariantQuery(geneA, geneB, 'FUSION', cancertype))
+            rows.append(row)
+
+            if len(queries) == POST_QUERIES_THRESHOLD:
+                annotations = pull_structural_variant_info(queries)
+                append_annotation_to_file(outf, newcols, rows, annotations)
+                queries = []
+                rows = []
+
+        if len(queries) > 0:
+            annotations = pull_structural_variant_info(queries)
+            append_annotation_to_file(outf, newcols, rows, annotations)
+    outf.close()
+    
+def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap):
+    if os.path.isfile(previousoutfile):
+        cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
+    outf = open(outfile, 'w+')
+    with open(svdata, 'rU') as infile:
+        reader = csv.reader(infile, delimiter='\t')
+
+        headers = readheaders(reader)
+
+        ncols = headers["length"]
+
+        if ncols == 0:
+            return
+
+        outf.write(headers['^-$'])
+        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
+        outf.write("\t")
+        outf.write("\t".join(oncokb_annotation_headers))
+        outf.write("\n")
+
+        newcols = ncols + len(oncokb_annotation_headers)
+
+        igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)
+        isvtype = geIndexOfHeader(headers, SV_TYPE_HEADER)
+        isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
+        icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
+
+        i = 0
+        queries = []
+        rows = []
+        for row in reader:
+            i = i + 1
+            if i % POST_QUERIES_THRESHOLD == 0:
+                log.info(i)
+
+            row = padrow(row, ncols)
+
+            sample = row[isample]
+
+            if sampleidsfilter and sample not in sampleidsfilter:
+                continue
+                
+            if igeneA < 0 or igeneB < 0:
+                log.warning("Please specify two genes")
+                continue
+                
+            svtype = None
+            if isvtype >= 0:
+                svtype = row[isvtype].upper()
+                if svtype not in SV_TYPES:
+                    svtype = None
+            if svtype is None:
+                svtype = UNKNOWN
+
+            cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
+            
+            sv_query = StructuralVariantQuery(row[igeneA], row[igeneB], svtype, cancertype)
+            queries.append(sv_query)
             rows.append(row)
 
             if len(queries) == POST_QUERIES_THRESHOLD:
@@ -868,8 +947,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             if ncols == 0:
                 return
 
-            igene1 = geIndexOfHeader(headers, ['GENE1'] + HUGO_HEADERS)  # fusion
-            igene2 = geIndexOfHeader(headers, ['GENE2'] + HUGO_HEADERS)  # fusion
+            igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)  # fusion
+            igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)  # fusion
             ifusion = geIndexOfHeader(headers, ['FUSION'])
 
             ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
@@ -882,7 +961,7 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             # imutationeffect = headers['MUTATION_EFFECT']
             ioncogenic = headers['ONCOGENIC']
 
-            isfusion = (igene1 != -1 & igene2 != -1) or ifusion != -1
+            isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1
             ismutorcna = ihugo != -1 & ihgvs != -1
 
             if not isfusion and not ismutorcna:
@@ -919,8 +998,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
 
                 hugo = row[ihugo]
                 alteration = row[ihgvs]
-                gene1 = row[igene1]
-                gene2 = row[igene2]
+                geneA = row[igeneA]
+                geneB = row[igeneB]
 
                 variant = "NA"
                 if ismutorcna:
@@ -929,10 +1008,10 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
                     if ifusion != -1:
                         variant = row[ifusion]
                     else:
-                        if gene1 == gene2:
-                            variant = gene1 + " intragenic deletion"
+                        if geneA == geneB:
+                            variant = geneA + " intragenic deletion"
                         else:
-                            variant = gene1 + "-" + gene2 + " fusion"
+                            variant = geneA + "-" + geneB + " fusion"
 
                 if oncogenic == "oncogenic" or oncogenic == "likely oncogenic" or oncogenic == "predicted oncogenic":
                     sampledrivers[sample].append(variant)
@@ -1381,6 +1460,8 @@ def appendoncokbcitations(citations, pmids, abstracts):
 class Gene:
     def __init__(self, hugo):
         self.hugoSymbol = hugo
+    def __str__(self):
+        return self.hugoSymbol
 
 
 class ProteinChangeQuery:
@@ -1459,6 +1540,8 @@ def __init__(self, hugoA, hugoB, structural_variant_type, cancertype):
         self.functionalFusion = is_functional_fusion
         self.structuralVariantType = structural_variant_type.upper()
         self.tumorType = cancertype
+    def __str__(self):
+        return "\t".join([self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, self.tumorType])
 
 
 def pull_protein_change_info(queries, annotate_hotspot):
diff --git a/FusionAnnotator.py b/FusionAnnotator.py
index 1f11af2..40c327f 100644
--- a/FusionAnnotator.py
+++ b/FusionAnnotator.py
@@ -40,7 +40,7 @@ def main(argv):
         readCancerTypes(argv.input_clinical_file, cancertypemap)
 
     log.info('annotating %s ...' % argv.input_file)
-    processsv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
+    process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
               cancertypemap, argv.structural_variant_name_format)
 
     log.info('done!')
diff --git a/README.md b/README.md
index feb68da..d2fcf07 100644
--- a/README.md
+++ b/README.md
@@ -38,12 +38,21 @@ We use GISTIC 2.0 format. For more information, please see https://docs.cbioport
 Get more details on the command line using `python CnaAnnotator.py -h`.  
 
 ### Fusion
-OncoKB offers to anntoate the strucutal variant. But in annotator, we only annotate the functional fusion.
+OncoKB offers to annotate functional fusions.
 The fusion format for intragenic deletion is `GENE-intragenic` or `GENE-GENE`.
 For other fusions, please use `GENEA-GENEB` or `GENEA-GENEB Fusion`.  
 
 Get more details on the command line using `python FusionAnnotator.py -h`.  
 
+### Structural Variant
+OncoKB offers to annotate structural variant.
+The types supported are DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN.
+All other types will be converted to UNKNOWN.
+
+All structural variants with two different gene partners, they will be considered as functional fusions.
+
+Get more details on the command line using `python StructuralVariantAnnotator.py -h`.
+
 ### Clinical Data (Combine MAF+CNA+Fusion)
 You can comebine all annotation on sample/patient level using the clinical data annotator.  
 
diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py
new file mode 100644
index 0000000..f7edfca
--- /dev/null
+++ b/StructuralVariantAnnotator.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+
+import argparse
+from AnnotatorCore import *
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger('StructuralVariantAnnotator')
+
+def main(argv):
+    if argv.help:
+        log.info('\n'
+        'StructuralVariantAnnotator.py -i <input structural variant file> -o <output structural variant file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u <oncokb api url>] [-b <oncokb api bear token>]\n'
+        '  Essential structural variant columns (case insensitive):\n'
+        '    GENEA: Hugo gene symbol for gene A\n'
+        '    GENEB: Hugo gene symbol for gene B\n'
+        '    SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n'
+        '    TUMOR_SAMPLE_BARCODE: sample ID\n'
+        '  Essential clinical columns:\n'
+        '    SAMPLE_ID: sample ID\n'
+        '    ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n'
+        '  Cancer type will be assigned based on the following priority:\n'
+        '     1) ONCOTREE_CODE in clinical data file\n'
+        '     2) ONCOTREE_CODE exist in structural variant\n'
+        '     3) default tumor type (-t)\n'
+        '  Default OncoKB base url is https://www.oncokb.org')
+        sys.exit()
+    if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
+        log.info('for help: python StructuralVariantAnnotator.py -h')
+        sys.exit(2)
+    if argv.sample_ids_filter:
+        setsampleidsfileterfile(argv.sample_ids_filter)
+    if argv.cancer_hotspots_base_url:
+        setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
+    if argv.oncokb_api_url:
+        setoncokbbaseurl(argv.oncokb_api_url)
+    setoncokbapitoken(argv.oncokb_api_bearer_token)
+
+    cancertypemap = {}
+    if argv.input_clinical_file:
+        readCancerTypes(argv.input_clinical_file, cancertypemap)
+
+    log.info('annotating %s ...' % argv.input_file)
+    process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap)
+
+    log.info('done!')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(add_help=False)
+    # ArgumentParser doesn't accept "store_true" and "type=" at the same time.
+    parser.add_argument('-h', dest='help', action="store_true", default=False)
+    parser.add_argument('-i', dest='input_file', default='', type=str)
+    parser.add_argument('-o', dest='output_file', default='', type=str)
+    parser.add_argument('-p', dest='previous_result_file', default='', type=str)
+    parser.add_argument('-c', dest='input_clinical_file', default='', type=str)
+    parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str)
+    parser.add_argument('-t', dest='default_cancer_type', default='', type=str)
+    parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
+    parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
+    parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
+    parser.set_defaults(func=main)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/data/example_sv.txt b/data/example_sv.txt
new file mode 100644
index 0000000..526466a
--- /dev/null
+++ b/data/example_sv.txt
@@ -0,0 +1,8 @@
+Tumor_Sample_Barcode	GeneA	GeneB	Sv_Type
+TCGA-02-0033-01	MLL2	MLL2	DELETION
+TCGA-05-4417-01	ALK	EML4	FUSION
+TCGA-06-0155-01	EGFR	EGFR	DELETION
+TCGA-06-0155-01	TMPRSS2	ERG	FUSION
+TCGA-05-4417-01	TMPRSS2	ERG	FUSION
+TCGA-06-0155-01	ERBB2	ERBB2	DELETION
+TCGA-02-0033-01	ETV6	RUNX1	FUSION
diff --git a/example.sh b/example.sh
index fa9379e..a141091 100644
--- a/example.sh
+++ b/example.sh
@@ -16,6 +16,9 @@ OATYPICALALT="data/example_atypical_alterations.oncokb.txt"
 IF="data/example_fusions.txt"
 OF="data/example_fusions.oncokb.txt"
 
+ISV="data/example_sv.txt"
+OSV="data/example_sv.oncokb.txt"
+
 ICNA="data/example_cna.txt"
 OCNA="data/example_cna.oncokb.txt"
 
@@ -38,7 +41,8 @@ python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN"
 python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN"
 
 python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
+python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
 python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
-python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF"
+python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
 python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10
 python GenerateReadMe.py -o "$README"
diff --git a/test_Annotation.py b/test_Annotation.py
index 21a4953..22d6a8f 100644
--- a/test_Annotation.py
+++ b/test_Annotation.py
@@ -193,7 +193,7 @@ def test_check_genomic_change():
     assert annotation[HIGHEST_LEVEL_INDEX] == ''
 
 @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
-def test_check_fusions():
+def test_check_structural_variants():
     queries = [
         StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'NSCLC'),
         StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'Melanoma'),