From 19cf21d8ea6c11991cb074c230dfbb8f3626a6eb Mon Sep 17 00:00:00 2001
From: Hongxin <jackson.zhang.828@gmail.com>
Date: Wed, 30 Mar 2022 16:49:51 -0500
Subject: [PATCH 1/5] Consolidate oncokb annotation column headers

---
 AnnotatorCore.py | 106 +++++++++++++++--------------------------------
 1 file changed, 34 insertions(+), 72 deletions(-)

diff --git a/AnnotatorCore.py b/AnnotatorCore.py
index 9784747..bfe1360 100644
--- a/AnnotatorCore.py
+++ b/AnnotatorCore.py
@@ -387,39 +387,12 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy
             outf.write("\tIS-A-HOTSPOT")
             outf.write("\tIS-A-3D-HOTSPOT")
             newncols += 2
+            
+        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
 
-        outf.write("\t" + GENE_IN_ONCOKB_HEADER)
-        outf.write("\t" + VARIANT_IN_ONCOKB_HEADER)
-
-        outf.write("\tMUTATION_EFFECT")
-        outf.write("\tMUTATION_EFFECT_CITATIONS")
-        outf.write("\tONCOGENIC")
-
-        newncols += 5
-
-        for l in levels:
-            outf.write('\t' + l)
-        newncols += len(levels)
-
-        outf.write("\tHIGHEST_LEVEL")
-        outf.write("\tTX_CITATIONS")
-        newncols += 2
-
-        for l in dxLevels:
-            outf.write('\t' + l)
-        newncols += len(dxLevels)
-
-        outf.write("\tHIGHEST_DX_LEVEL")
-        outf.write("\tDX_CITATIONS")
-        newncols += 2
-
-        for l in pxLevels:
-            outf.write('\t' + l)
-        newncols += len(pxLevels)
-
-        outf.write("\tHIGHEST_PX_LEVEL")
-        outf.write("\tPX_CITATIONS")
-        newncols += 2
+        outf.write("\t")
+        outf.write("\t".join(oncokb_annotation_headers)
+        newncols += len(oncokb_annotation_headers)
 
         outf.write("\n")
 
@@ -451,6 +424,26 @@ def get_cell_content(row, index, return_empty_string=False):
     else:
         return None
 
+
+def get_oncokb_annotation_column_headers():
+    headers = [GENE_IN_ONCOKB_HEADER, VARIANT_IN_ONCOKB_HEADER, "MUTATION_EFFECT", "MUTATION_EFFECT_CITATIONS",
+               "ONCOGENIC"]
+    for l in levels:
+        headers.append(l)
+    headers.append("HIGHEST_LEVEL")
+    headers.append("TX_CITATIONS")
+
+    for l in dxLevels:
+        headers.append(l)
+    headers.append("HIGHEST_DX_LEVEL")
+    headers.append("DX_CITATIONS")
+
+    for l in pxLevels:
+        headers.append(l)
+    headers.append("HIGHEST_PX_LEVEL")
+    headers.append("PX_CITATIONS")
+    return headers
+
 def process_alteration(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap,
                        annotatehotspots, default_reference_genome):
     ihugo = geIndexOfHeader(maf_headers, HUGO_HEADERS)
@@ -683,28 +676,12 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
             return
 
         outf.write(headers['^-$'])
-        outf.write("\t" + GENE_IN_ONCOKB_HEADER)
-        outf.write("\t" + VARIANT_IN_ONCOKB_HEADER)
-        outf.write("\tMUTATION_EFFECT")
-        outf.write("\tMUTATION_EFFECT_CITATIONS")
-        outf.write("\tONCOGENIC")
-        for l in levels:
-            outf.write('\t' + l)
-        outf.write("\tHIGHEST_LEVEL")
-        outf.write("\tTX_CITATIONS")
-
-        for l in dxLevels:
-            outf.write('\t' + l)
-        outf.write("\tHIGHEST_DX_LEVEL")
-        outf.write("\tDX_CITATIONS")
-
-        for l in pxLevels:
-            outf.write('\t' + l)
-        outf.write("\tHIGHEST_PX_LEVEL")
-        outf.write("\tPX_CITATIONS")
+        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
+        outf.write("\t")
+        outf.write("\t".join(oncokb_annotation_headers))
         outf.write("\n")
 
-        newcols = ncols + 11 + len(levels) + len(dxLevels) + len(pxLevels)
+        newcols = ncols + lens(oncokb_annotation_headers)
 
         igene1 = geIndexOfHeader(headers, ['GENE1'])
         igene2 = geIndexOfHeader(headers, ['GENE2'])
@@ -793,28 +770,13 @@ def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, c
                     samples))
 
         outf.write('SAMPLE_ID\tCANCER_TYPE\tHUGO_SYMBOL\tALTERATION')
-        outf.write("\t"+GENE_IN_ONCOKB_HEADER)
-        outf.write("\t"+VARIANT_IN_ONCOKB_HEADER)
-        outf.write("\tMUTATION_EFFECT")
-        outf.write("\tMUTATION_EFFECT_CITATIONS")
-        outf.write("\tONCOGENIC")
-        for l in levels:
-            outf.write('\t' + l)
-        outf.write("\tHIGHEST_LEVEL")
-        outf.write("\tTX_CITATIONS")
+        ncols = 4
 
-        for l in dxLevels:
-            outf.write('\t' + l)
-        outf.write("\tHIGHEST_DX_LEVEL")
-        outf.write("\tDX_CITATIONS")
-
-        for l in pxLevels:
-            outf.write('\t' + l)
-        outf.write("\tHIGHEST_PX_LEVEL")
-        outf.write("\tPX_CITATIONS")
-        outf.write("\n")
+        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
+        outf.write("\t")
+        outf.write("\t".join(oncokb_annotation_headers))
 
-        ncols = 15 + len(levels) + len(dxLevels) + len(pxLevels)
+        ncols += len(oncokb_annotation_headers)
 
         i = 0
         rows = []

From 2af6cd36fc7892ba3fdd8b92e6b2a554f8cb6a69 Mon Sep 17 00:00:00 2001
From: Hongxin <jackson.zhang.828@gmail.com>
Date: Thu, 31 Mar 2022 10:55:57 -0500
Subject: [PATCH 2/5] Support structural variant annotator

---
 AnnotatorCore.py              | 141 +++++++++++++++++++++++++++-------
 FusionAnnotator.py            |   2 +-
 StructuralVariantAnnotator.py |  64 +++++++++++++++
 data/example_sv.txt           |   8 ++
 example.sh                    |   6 +-
 5 files changed, 190 insertions(+), 31 deletions(-)
 create mode 100644 StructuralVariantAnnotator.py
 create mode 100644 data/example_sv.txt

diff --git a/AnnotatorCore.py b/AnnotatorCore.py
index bfe1360..48606e2 100644
--- a/AnnotatorCore.py
+++ b/AnnotatorCore.py
@@ -145,6 +145,13 @@ def setsampleidsfileterfile(f):
 GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
 GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]
 
+# columns for structural variant annotation
+SV_GENE_A_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
+SV_GENE_B_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
+SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE']
+SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN']
+
+UNKNOWN = 'UNKNOWN'
 
 class QueryType(Enum):
     HGVSP_SHORT = 'HGVSP_SHORT'
@@ -391,7 +398,7 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy
         oncokb_annotation_headers = get_oncokb_annotation_column_headers()
 
         outf.write("\t")
-        outf.write("\t".join(oncokb_annotation_headers)
+        outf.write("\t".join(oncokb_annotation_headers))
         newncols += len(oncokb_annotation_headers)
 
         outf.write("\n")
@@ -649,19 +656,19 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol
 def getgenesfromfusion(fusion, nameregex=None):
     GENES_REGEX = "([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex
     searchresult = re.search(GENES_REGEX, fusion, flags=re.IGNORECASE)
-    gene1=None
-    gene2=None
+    geneA=None
+    geneB=None
     if searchresult:
         parts = searchresult.group(1).split("-")
-        gene1 = parts[0]
-        gene2 = gene1
+        geneA = parts[0]
+        geneB = geneA
         if len(parts) > 1 and parts[1] != "intragenic":
-            gene2 = parts[1]
+            geneB = parts[1]
     else:
-        gene1=gene2=fusion
-    return gene1, gene2
+        geneA=geneB=fusion
+    return geneA, geneB
 
-def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
+def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
     if os.path.isfile(previousoutfile):
         cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
     outf = open(outfile, 'w+')
@@ -681,10 +688,10 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
         outf.write("\t".join(oncokb_annotation_headers))
         outf.write("\n")
 
-        newcols = ncols + lens(oncokb_annotation_headers)
+        newcols = ncols + len(oncokb_annotation_headers)
 
-        igene1 = geIndexOfHeader(headers, ['GENE1'])
-        igene2 = geIndexOfHeader(headers, ['GENE2'])
+        igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)
         ifusion = geIndexOfHeader(headers, FUSION_HEADERS)
         isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
         icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
@@ -704,20 +711,92 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
             if sampleidsfilter and sample not in sampleidsfilter:
                 continue
 
-            gene1 = None
-            gene2 = None
-            if igene1 >= 0:
-                gene1 = row[igene1]
-            if igene2 >= 0:
-                gene2 = row[igene2]
-            if igene1 < 0 and igene2 < 0 and ifusion >= 0:
+            geneA = None
+            geneB = None
+            if igeneA >= 0:
+                geneA = row[igeneA]
+            if igeneB >= 0:
+                geneB = row[igeneB]
+            if igeneA < 0 and igeneB < 0 and ifusion >= 0:
                 fusion = row[ifusion]
-                gene1, gene2 = getgenesfromfusion(fusion, nameregex)
+                geneA, geneB = getgenesfromfusion(fusion, nameregex)
 
             cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
 
 
-            queries.append(StructuralVariantQuery(gene1, gene2, 'FUSION', cancertype))
+            queries.append(StructuralVariantQuery(geneA, geneB, 'FUSION', cancertype))
+            rows.append(row)
+
+            if len(queries) == POST_QUERIES_THRESHOLD:
+                annotations = pull_structural_variant_info(queries)
+                append_annotation_to_file(outf, newcols, rows, annotations)
+                queries = []
+                rows = []
+
+        if len(queries) > 0:
+            annotations = pull_structural_variant_info(queries)
+            append_annotation_to_file(outf, newcols, rows, annotations)
+    outf.close()
+    
+def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap):
+    if os.path.isfile(previousoutfile):
+        cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
+    outf = open(outfile, 'w+')
+    with open(svdata, 'rU') as infile:
+        reader = csv.reader(infile, delimiter='\t')
+
+        headers = readheaders(reader)
+
+        ncols = headers["length"]
+
+        if ncols == 0:
+            return
+
+        outf.write(headers['^-$'])
+        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
+        outf.write("\t")
+        outf.write("\t".join(oncokb_annotation_headers))
+        outf.write("\n")
+
+        newcols = ncols + len(oncokb_annotation_headers)
+
+        igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)
+        isvtype = geIndexOfHeader(headers, SV_TYPE_HEADER)
+        isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
+        icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
+
+        i = 0
+        queries = []
+        rows = []
+        for row in reader:
+            i = i + 1
+            if i % POST_QUERIES_THRESHOLD == 0:
+                log.info(i)
+
+            row = padrow(row, ncols)
+
+            sample = row[isample]
+
+            if sampleidsfilter and sample not in sampleidsfilter:
+                continue
+                
+            if igeneA < 0 or igeneB < 0:
+                log.warning("Please specify two genes")
+                continue
+                
+            svtype = None
+            if isvtype >= 0:
+                svtype = row[isvtype].upper()
+                if svtype not in SV_TYPES:
+                    svtype = None
+            if svtype is None:
+                svtype = UNKNOWN
+
+            cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
+            
+            sv_query = StructuralVariantQuery(row[igeneA], row[igeneB], svtype, cancertype)
+            queries.append(sv_query)
             rows.append(row)
 
             if len(queries) == POST_QUERIES_THRESHOLD:
@@ -868,8 +947,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             if ncols == 0:
                 return
 
-            igene1 = geIndexOfHeader(headers, ['GENE1'] + HUGO_HEADERS)  # fusion
-            igene2 = geIndexOfHeader(headers, ['GENE2'] + HUGO_HEADERS)  # fusion
+            igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)  # fusion
+            igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)  # fusion
             ifusion = geIndexOfHeader(headers, ['FUSION'])
 
             ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
@@ -882,7 +961,7 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             # imutationeffect = headers['MUTATION_EFFECT']
             ioncogenic = headers['ONCOGENIC']
 
-            isfusion = (igene1 != -1 & igene2 != -1) or ifusion != -1
+            isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1
             ismutorcna = ihugo != -1 & ihgvs != -1
 
             if not isfusion and not ismutorcna:
@@ -919,8 +998,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
 
                 hugo = row[ihugo]
                 alteration = row[ihgvs]
-                gene1 = row[igene1]
-                gene2 = row[igene2]
+                geneA = row[igeneA]
+                geneB = row[igeneB]
 
                 variant = "NA"
                 if ismutorcna:
@@ -929,10 +1008,10 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
                     if ifusion != -1:
                         variant = row[ifusion]
                     else:
-                        if gene1 == gene2:
-                            variant = gene1 + " intragenic deletion"
+                        if geneA == geneB:
+                            variant = geneA + " intragenic deletion"
                         else:
-                            variant = gene1 + "-" + gene2 + " fusion"
+                            variant = geneA + "-" + geneB + " fusion"
 
                 if oncogenic == "oncogenic" or oncogenic == "likely oncogenic" or oncogenic == "predicted oncogenic":
                     sampledrivers[sample].append(variant)
@@ -1381,6 +1460,8 @@ def appendoncokbcitations(citations, pmids, abstracts):
 class Gene:
     def __init__(self, hugo):
         self.hugoSymbol = hugo
+    def __str__(self):
+        return self.hugoSymbol
 
 
 class ProteinChangeQuery:
@@ -1459,6 +1540,8 @@ def __init__(self, hugoA, hugoB, structural_variant_type, cancertype):
         self.functionalFusion = is_functional_fusion
         self.structuralVariantType = structural_variant_type.upper()
         self.tumorType = cancertype
+    def __str__(self):
+        return "\t".join([self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, self.tumorType])
 
 
 def pull_protein_change_info(queries, annotate_hotspot):
diff --git a/FusionAnnotator.py b/FusionAnnotator.py
index 1f11af2..40c327f 100644
--- a/FusionAnnotator.py
+++ b/FusionAnnotator.py
@@ -40,7 +40,7 @@ def main(argv):
         readCancerTypes(argv.input_clinical_file, cancertypemap)
 
     log.info('annotating %s ...' % argv.input_file)
-    processsv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
+    process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
               cancertypemap, argv.structural_variant_name_format)
 
     log.info('done!')
diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py
new file mode 100644
index 0000000..df2a13b
--- /dev/null
+++ b/StructuralVariantAnnotator.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+
+import argparse
+from AnnotatorCore import *
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger('StructuralVariantAnnotator')
+
+def main(argv):
+    if argv.help:
+        log.info('\n'
+        'StructuralVariantAnnotator.py -i <input structural variant file> -o <output structural variant file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u <oncokb api url>] [-b <oncokb api bear token>]\n'
+        '  Essential structural variant columns (case insensitive):\n'
+        '    GENE_A: Hugo gene symbol for gene A\n'
+        '    GENE_B: Hugo gene symbol for gene B\n'
+        '    SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n'
+        '    TUMOR_SAMPLE_BARCODE: sample ID\n'
+        '  Essential clinical columns:\n'
+        '    SAMPLE_ID: sample ID\n'
+        '    ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n'
+        '  Cancer type will be assigned based on the following priority:\n'
+        '     1) ONCOTREE_CODE in clinical data file\n'
+        '     2) ONCOTREE_CODE exist in structural variant\n'
+        '     3) default tumor type (-t)\n'
+        '  Default OncoKB base url is https://www.oncokb.org')
+        sys.exit()
+    if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
+        log.info('for help: python StructuralVariantAnnotator.py -h')
+        sys.exit(2)
+    if argv.sample_ids_filter:
+        setsampleidsfileterfile(argv.sample_ids_filter)
+    if argv.cancer_hotspots_base_url:
+        setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
+    if argv.oncokb_api_url:
+        setoncokbbaseurl(argv.oncokb_api_url)
+    setoncokbapitoken(argv.oncokb_api_bearer_token)
+
+    cancertypemap = {}
+    if argv.input_clinical_file:
+        readCancerTypes(argv.input_clinical_file, cancertypemap)
+
+    log.info('annotating %s ...' % argv.input_file)
+    process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap)
+
+    log.info('done!')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(add_help=False)
+    # ArgumentParser doesn't accept "store_true" and "type=" at the same time.
+    parser.add_argument('-h', dest='help', action="store_true", default=False)
+    parser.add_argument('-i', dest='input_file', default='', type=str)
+    parser.add_argument('-o', dest='output_file', default='', type=str)
+    parser.add_argument('-p', dest='previous_result_file', default='', type=str)
+    parser.add_argument('-c', dest='input_clinical_file', default='', type=str)
+    parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str)
+    parser.add_argument('-t', dest='default_cancer_type', default='', type=str)
+    parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
+    parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
+    parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
+    parser.set_defaults(func=main)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/data/example_sv.txt b/data/example_sv.txt
new file mode 100644
index 0000000..526466a
--- /dev/null
+++ b/data/example_sv.txt
@@ -0,0 +1,8 @@
+Tumor_Sample_Barcode	GeneA	GeneB	Sv_Type
+TCGA-02-0033-01	MLL2	MLL2	DELETION
+TCGA-05-4417-01	ALK	EML4	FUSION
+TCGA-06-0155-01	EGFR	EGFR	DELETION
+TCGA-06-0155-01	TMPRSS2	ERG	FUSION
+TCGA-05-4417-01	TMPRSS2	ERG	FUSION
+TCGA-06-0155-01	ERBB2	ERBB2	DELETION
+TCGA-02-0033-01	ETV6	RUNX1	FUSION
diff --git a/example.sh b/example.sh
index fa9379e..a141091 100644
--- a/example.sh
+++ b/example.sh
@@ -16,6 +16,9 @@ OATYPICALALT="data/example_atypical_alterations.oncokb.txt"
 IF="data/example_fusions.txt"
 OF="data/example_fusions.oncokb.txt"
 
+ISV="data/example_sv.txt"
+OSV="data/example_sv.oncokb.txt"
+
 ICNA="data/example_cna.txt"
 OCNA="data/example_cna.oncokb.txt"
 
@@ -38,7 +41,8 @@ python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN"
 python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN"
 
 python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
+python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
 python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
-python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF"
+python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
 python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10
 python GenerateReadMe.py -o "$README"

From 1bb64665b0edad42e20eb0e9f1702be14f8c4afe Mon Sep 17 00:00:00 2001
From: Hongxin <jackson.zhang.828@gmail.com>
Date: Thu, 31 Mar 2022 10:55:57 -0500
Subject: [PATCH 3/5] Support structural variant annotator

---
 AnnotatorCore.py              | 137 +++++++++++++++++++++++++++-------
 FusionAnnotator.py            |   2 +-
 StructuralVariantAnnotator.py |  64 ++++++++++++++++
 data/example_sv.txt           |   8 ++
 example.sh                    |   6 +-
 5 files changed, 188 insertions(+), 29 deletions(-)
 create mode 100644 StructuralVariantAnnotator.py
 create mode 100644 data/example_sv.txt

diff --git a/AnnotatorCore.py b/AnnotatorCore.py
index 8e66d80..9ac3e56 100644
--- a/AnnotatorCore.py
+++ b/AnnotatorCore.py
@@ -145,6 +145,13 @@ def setsampleidsfileterfile(f):
 GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
 GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]
 
+# columns for structural variant annotation
+SV_GENE_A_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
+SV_GENE_B_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
+SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE']
+SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN']
+
+UNKNOWN = 'UNKNOWN'
 
 class QueryType(Enum):
     HGVSP_SHORT = 'HGVSP_SHORT'
@@ -649,19 +656,19 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol
 def getgenesfromfusion(fusion, nameregex=None):
     GENES_REGEX = "([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex
     searchresult = re.search(GENES_REGEX, fusion, flags=re.IGNORECASE)
-    gene1=None
-    gene2=None
+    geneA=None
+    geneB=None
     if searchresult:
         parts = searchresult.group(1).split("-")
-        gene1 = parts[0]
-        gene2 = gene1
+        geneA = parts[0]
+        geneB = geneA
         if len(parts) > 1 and parts[1] != "intragenic":
-            gene2 = parts[1]
+            geneB = parts[1]
     else:
-        gene1=gene2=fusion
-    return gene1, gene2
+        geneA=geneB=fusion
+    return geneA, geneB
 
-def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
+def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
     if os.path.isfile(previousoutfile):
         cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
     outf = open(outfile, 'w+')
@@ -683,8 +690,8 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
 
         newcols = ncols + len(oncokb_annotation_headers)
 
-        igene1 = geIndexOfHeader(headers, ['GENE1'])
-        igene2 = geIndexOfHeader(headers, ['GENE2'])
+        igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)
         ifusion = geIndexOfHeader(headers, FUSION_HEADERS)
         isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
         icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
@@ -704,20 +711,92 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
             if sampleidsfilter and sample not in sampleidsfilter:
                 continue
 
-            gene1 = None
-            gene2 = None
-            if igene1 >= 0:
-                gene1 = row[igene1]
-            if igene2 >= 0:
-                gene2 = row[igene2]
-            if igene1 < 0 and igene2 < 0 and ifusion >= 0:
+            geneA = None
+            geneB = None
+            if igeneA >= 0:
+                geneA = row[igeneA]
+            if igeneB >= 0:
+                geneB = row[igeneB]
+            if igeneA < 0 and igeneB < 0 and ifusion >= 0:
                 fusion = row[ifusion]
-                gene1, gene2 = getgenesfromfusion(fusion, nameregex)
+                geneA, geneB = getgenesfromfusion(fusion, nameregex)
 
             cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
 
 
-            queries.append(StructuralVariantQuery(gene1, gene2, 'FUSION', cancertype))
+            queries.append(StructuralVariantQuery(geneA, geneB, 'FUSION', cancertype))
+            rows.append(row)
+
+            if len(queries) == POST_QUERIES_THRESHOLD:
+                annotations = pull_structural_variant_info(queries)
+                append_annotation_to_file(outf, newcols, rows, annotations)
+                queries = []
+                rows = []
+
+        if len(queries) > 0:
+            annotations = pull_structural_variant_info(queries)
+            append_annotation_to_file(outf, newcols, rows, annotations)
+    outf.close()
+    
+def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap):
+    if os.path.isfile(previousoutfile):
+        cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
+    outf = open(outfile, 'w+')
+    with open(svdata, 'rU') as infile:
+        reader = csv.reader(infile, delimiter='\t')
+
+        headers = readheaders(reader)
+
+        ncols = headers["length"]
+
+        if ncols == 0:
+            return
+
+        outf.write(headers['^-$'])
+        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
+        outf.write("\t")
+        outf.write("\t".join(oncokb_annotation_headers))
+        outf.write("\n")
+
+        newcols = ncols + len(oncokb_annotation_headers)
+
+        igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)
+        isvtype = geIndexOfHeader(headers, SV_TYPE_HEADER)
+        isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
+        icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
+
+        i = 0
+        queries = []
+        rows = []
+        for row in reader:
+            i = i + 1
+            if i % POST_QUERIES_THRESHOLD == 0:
+                log.info(i)
+
+            row = padrow(row, ncols)
+
+            sample = row[isample]
+
+            if sampleidsfilter and sample not in sampleidsfilter:
+                continue
+                
+            if igeneA < 0 or igeneB < 0:
+                log.warning("Please specify two genes")
+                continue
+                
+            svtype = None
+            if isvtype >= 0:
+                svtype = row[isvtype].upper()
+                if svtype not in SV_TYPES:
+                    svtype = None
+            if svtype is None:
+                svtype = UNKNOWN
+
+            cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
+            
+            sv_query = StructuralVariantQuery(row[igeneA], row[igeneB], svtype, cancertype)
+            queries.append(sv_query)
             rows.append(row)
 
             if len(queries) == POST_QUERIES_THRESHOLD:
@@ -868,8 +947,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             if ncols == 0:
                 return
 
-            igene1 = geIndexOfHeader(headers, ['GENE1'] + HUGO_HEADERS)  # fusion
-            igene2 = geIndexOfHeader(headers, ['GENE2'] + HUGO_HEADERS)  # fusion
+            igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)  # fusion
+            igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)  # fusion
             ifusion = geIndexOfHeader(headers, ['FUSION'])
 
             ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
@@ -882,7 +961,7 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             # imutationeffect = headers['MUTATION_EFFECT']
             ioncogenic = headers['ONCOGENIC']
 
-            isfusion = (igene1 != -1 & igene2 != -1) or ifusion != -1
+            isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1
             ismutorcna = ihugo != -1 & ihgvs != -1
 
             if not isfusion and not ismutorcna:
@@ -919,8 +998,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
 
                 hugo = row[ihugo]
                 alteration = row[ihgvs]
-                gene1 = row[igene1]
-                gene2 = row[igene2]
+                geneA = row[igeneA]
+                geneB = row[igeneB]
 
                 variant = "NA"
                 if ismutorcna:
@@ -929,10 +1008,10 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
                     if ifusion != -1:
                         variant = row[ifusion]
                     else:
-                        if gene1 == gene2:
-                            variant = gene1 + " intragenic deletion"
+                        if geneA == geneB:
+                            variant = geneA + " intragenic deletion"
                         else:
-                            variant = gene1 + "-" + gene2 + " fusion"
+                            variant = geneA + "-" + geneB + " fusion"
 
                 if oncogenic == "oncogenic" or oncogenic == "likely oncogenic" or oncogenic == "predicted oncogenic":
                     sampledrivers[sample].append(variant)
@@ -1381,6 +1460,8 @@ def appendoncokbcitations(citations, pmids, abstracts):
 class Gene:
     def __init__(self, hugo):
         self.hugoSymbol = hugo
+    def __str__(self):
+        return self.hugoSymbol
 
 
 class ProteinChangeQuery:
@@ -1459,6 +1540,8 @@ def __init__(self, hugoA, hugoB, structural_variant_type, cancertype):
         self.functionalFusion = is_functional_fusion
         self.structuralVariantType = structural_variant_type.upper()
         self.tumorType = cancertype
+    def __str__(self):
+        return "\t".join([self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, self.tumorType])
 
 
 def pull_protein_change_info(queries, annotate_hotspot):
diff --git a/FusionAnnotator.py b/FusionAnnotator.py
index 1f11af2..40c327f 100644
--- a/FusionAnnotator.py
+++ b/FusionAnnotator.py
@@ -40,7 +40,7 @@ def main(argv):
         readCancerTypes(argv.input_clinical_file, cancertypemap)
 
     log.info('annotating %s ...' % argv.input_file)
-    processsv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
+    process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
               cancertypemap, argv.structural_variant_name_format)
 
     log.info('done!')
diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py
new file mode 100644
index 0000000..df2a13b
--- /dev/null
+++ b/StructuralVariantAnnotator.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+
+import argparse
+from AnnotatorCore import *
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger('StructuralVariantAnnotator')
+
+def main(argv):
+    if argv.help:
+        log.info('\n'
+        'StructuralVariantAnnotator.py -i <input structural variant file> -o <output structural variant file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u <oncokb api url>] [-b <oncokb api bear token>]\n'
+        '  Essential structural variant columns (case insensitive):\n'
+        '    GENE_A: Hugo gene symbol for gene A\n'
+        '    GENE_B: Hugo gene symbol for gene B\n'
+        '    SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n'
+        '    TUMOR_SAMPLE_BARCODE: sample ID\n'
+        '  Essential clinical columns:\n'
+        '    SAMPLE_ID: sample ID\n'
+        '    ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n'
+        '  Cancer type will be assigned based on the following priority:\n'
+        '     1) ONCOTREE_CODE in clinical data file\n'
+        '     2) ONCOTREE_CODE exist in structural variant\n'
+        '     3) default tumor type (-t)\n'
+        '  Default OncoKB base url is https://www.oncokb.org')
+        sys.exit()
+    if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
+        log.info('for help: python StructuralVariantAnnotator.py -h')
+        sys.exit(2)
+    if argv.sample_ids_filter:
+        setsampleidsfileterfile(argv.sample_ids_filter)
+    if argv.cancer_hotspots_base_url:
+        setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
+    if argv.oncokb_api_url:
+        setoncokbbaseurl(argv.oncokb_api_url)
+    setoncokbapitoken(argv.oncokb_api_bearer_token)
+
+    cancertypemap = {}
+    if argv.input_clinical_file:
+        readCancerTypes(argv.input_clinical_file, cancertypemap)
+
+    log.info('annotating %s ...' % argv.input_file)
+    process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap)
+
+    log.info('done!')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(add_help=False)
+    # ArgumentParser doesn't accept "store_true" and "type=" at the same time.
+    parser.add_argument('-h', dest='help', action="store_true", default=False)
+    parser.add_argument('-i', dest='input_file', default='', type=str)
+    parser.add_argument('-o', dest='output_file', default='', type=str)
+    parser.add_argument('-p', dest='previous_result_file', default='', type=str)
+    parser.add_argument('-c', dest='input_clinical_file', default='', type=str)
+    parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str)
+    parser.add_argument('-t', dest='default_cancer_type', default='', type=str)
+    parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
+    parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
+    parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
+    parser.set_defaults(func=main)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/data/example_sv.txt b/data/example_sv.txt
new file mode 100644
index 0000000..526466a
--- /dev/null
+++ b/data/example_sv.txt
@@ -0,0 +1,8 @@
+Tumor_Sample_Barcode	GeneA	GeneB	Sv_Type
+TCGA-02-0033-01	MLL2	MLL2	DELETION
+TCGA-05-4417-01	ALK	EML4	FUSION
+TCGA-06-0155-01	EGFR	EGFR	DELETION
+TCGA-06-0155-01	TMPRSS2	ERG	FUSION
+TCGA-05-4417-01	TMPRSS2	ERG	FUSION
+TCGA-06-0155-01	ERBB2	ERBB2	DELETION
+TCGA-02-0033-01	ETV6	RUNX1	FUSION
diff --git a/example.sh b/example.sh
index fa9379e..a141091 100644
--- a/example.sh
+++ b/example.sh
@@ -16,6 +16,9 @@ OATYPICALALT="data/example_atypical_alterations.oncokb.txt"
 IF="data/example_fusions.txt"
 OF="data/example_fusions.oncokb.txt"
 
+ISV="data/example_sv.txt"
+OSV="data/example_sv.oncokb.txt"
+
 ICNA="data/example_cna.txt"
 OCNA="data/example_cna.oncokb.txt"
 
@@ -38,7 +41,8 @@ python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN"
 python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN"
 
 python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
+python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
 python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
-python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF"
+python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
 python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10
 python GenerateReadMe.py -o "$README"

From 2ad6fbb12d84ead3e99e3ff2c9f1855b361f7298 Mon Sep 17 00:00:00 2001
From: Hongxin <jackson.zhang.828@gmail.com>
Date: Tue, 5 Apr 2022 15:57:44 -0500
Subject: [PATCH 4/5] Update README.md

---
 README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index feb68da..d2fcf07 100644
--- a/README.md
+++ b/README.md
@@ -38,12 +38,21 @@ We use GISTIC 2.0 format. For more information, please see https://docs.cbioport
 Get more details on the command line using `python CnaAnnotator.py -h`.  
 
 ### Fusion
-OncoKB offers to anntoate the strucutal variant. But in annotator, we only annotate the functional fusion.
+OncoKB offers to annotate functional fusions.
 The fusion format for intragenic deletion is `GENE-intragenic` or `GENE-GENE`.
 For other fusions, please use `GENEA-GENEB` or `GENEA-GENEB Fusion`.  
 
 Get more details on the command line using `python FusionAnnotator.py -h`.  
 
+### Structural Variant
+OncoKB offers to annotate structural variant.
+The types supported are DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN.
+All other types will be converted to UNKNOWN.
+
+All structural variants with two different gene partners, they will be considered as functional fusions.
+
+Get more details on the command line using `python StructuralVariantAnnotator.py -h`.
+
 ### Clinical Data (Combine MAF+CNA+Fusion)
 You can comebine all annotation on sample/patient level using the clinical data annotator.  
 

From 7719642dbe1eb6a372ef7f8fe80e20703b19351a Mon Sep 17 00:00:00 2001
From: Hongxin <jackson.zhang.828@gmail.com>
Date: Tue, 5 Apr 2022 16:08:31 -0500
Subject: [PATCH 5/5] update

---
 AnnotatorCore.py              | 16 ++++++++--------
 StructuralVariantAnnotator.py |  4 ++--
 test_Annotation.py            |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/AnnotatorCore.py b/AnnotatorCore.py
index 9ac3e56..411b14e 100644
--- a/AnnotatorCore.py
+++ b/AnnotatorCore.py
@@ -146,8 +146,8 @@ def setsampleidsfileterfile(f):
 GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]
 
 # columns for structural variant annotation
-SV_GENE_A_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
-SV_GENE_B_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
+SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
+SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
 SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE']
 SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN']
 
@@ -690,8 +690,8 @@ def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy
 
         newcols = ncols + len(oncokb_annotation_headers)
 
-        igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)
-        igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)
+        igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)
         ifusion = geIndexOfHeader(headers, FUSION_HEADERS)
         isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
         icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
@@ -760,8 +760,8 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa
 
         newcols = ncols + len(oncokb_annotation_headers)
 
-        igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)
-        igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)
+        igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)
+        igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)
         isvtype = geIndexOfHeader(headers, SV_TYPE_HEADER)
         isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
         icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
@@ -947,8 +947,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
             if ncols == 0:
                 return
 
-            igeneA = geIndexOfHeader(headers, SV_GENE_A_HEADER)  # fusion
-            igeneB = geIndexOfHeader(headers, SV_GENE_B_HEADER)  # fusion
+            igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)  # fusion
+            igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)  # fusion
             ifusion = geIndexOfHeader(headers, ['FUSION'])
 
             ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py
index df2a13b..f7edfca 100644
--- a/StructuralVariantAnnotator.py
+++ b/StructuralVariantAnnotator.py
@@ -11,8 +11,8 @@ def main(argv):
         log.info('\n'
         'StructuralVariantAnnotator.py -i <input structural variant file> -o <output structural variant file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u <oncokb api url>] [-b <oncokb api bear token>]\n'
         '  Essential structural variant columns (case insensitive):\n'
-        '    GENE_A: Hugo gene symbol for gene A\n'
-        '    GENE_B: Hugo gene symbol for gene B\n'
+        '    GENEA: Hugo gene symbol for gene A\n'
+        '    GENEB: Hugo gene symbol for gene B\n'
         '    SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n'
         '    TUMOR_SAMPLE_BARCODE: sample ID\n'
         '  Essential clinical columns:\n'
diff --git a/test_Annotation.py b/test_Annotation.py
index 21a4953..22d6a8f 100644
--- a/test_Annotation.py
+++ b/test_Annotation.py
@@ -193,7 +193,7 @@ def test_check_genomic_change():
     assert annotation[HIGHEST_LEVEL_INDEX] == ''
 
 @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
-def test_check_fusions():
+def test_check_structural_variants():
     queries = [
         StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'NSCLC'),
         StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'Melanoma'),