From f22fe53af5d3aba0420d1a50297b3bc5cb7808dc Mon Sep 17 00:00:00 2001 From: Hongxin Date: Tue, 5 Apr 2022 17:10:06 -0400 Subject: [PATCH] Support structural variant annotator (#161) --- AnnotatorCore.py | 137 +++++++++++++++++++++++++++------- FusionAnnotator.py | 2 +- README.md | 11 ++- StructuralVariantAnnotator.py | 64 ++++++++++++++++ data/example_sv.txt | 8 ++ example.sh | 6 +- test_Annotation.py | 2 +- 7 files changed, 199 insertions(+), 31 deletions(-) create mode 100644 StructuralVariantAnnotator.py create mode 100644 data/example_sv.txt diff --git a/AnnotatorCore.py b/AnnotatorCore.py index 8e66d80..411b14e 100644 --- a/AnnotatorCore.py +++ b/AnnotatorCore.py @@ -145,6 +145,13 @@ def setsampleidsfileterfile(f): GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2' GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER] +# columns for structural variant annotation +SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1'] +SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2'] +SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE'] +SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN'] + +UNKNOWN = 'UNKNOWN' class QueryType(Enum): HGVSP_SHORT = 'HGVSP_SHORT' @@ -649,19 +656,19 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol def getgenesfromfusion(fusion, nameregex=None): GENES_REGEX = "([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex searchresult = re.search(GENES_REGEX, fusion, flags=re.IGNORECASE) - gene1=None - gene2=None + geneA=None + geneB=None if searchresult: parts = searchresult.group(1).split("-") - gene1 = parts[0] - gene2 = gene1 + geneA = parts[0] + geneB = geneA if len(parts) > 1 and parts[1] != "intragenic": - gene2 = parts[1] + geneB = parts[1] else: - gene1=gene2=fusion - return gene1, gene2 + geneA=geneB=fusion + return geneA, geneB -def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex): +def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex): if os.path.isfile(previousoutfile): cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) outf = open(outfile, 'w+') @@ -683,8 +690,8 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap newcols = ncols + len(oncokb_annotation_headers) - igene1 = geIndexOfHeader(headers, ['GENE1']) - igene2 = geIndexOfHeader(headers, ['GENE2']) + igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER) + igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER) ifusion = geIndexOfHeader(headers, FUSION_HEADERS) isample = geIndexOfHeader(headers, SAMPLE_HEADERS) icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS) @@ -704,20 +711,92 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap if sampleidsfilter and sample not in sampleidsfilter: continue - gene1 = None - gene2 = None - if igene1 >= 0: - gene1 = row[igene1] - if igene2 >= 0: - gene2 = row[igene2] - if igene1 < 0 and igene2 < 0 and ifusion >= 0: + geneA = None + geneB = None + if igeneA >= 0: + geneA = row[igeneA] + if igeneB >= 0: + geneB = row[igeneB] + if igeneA < 0 and igeneB < 0 and ifusion >= 0: fusion = row[ifusion] - gene1, gene2 = getgenesfromfusion(fusion, nameregex) + geneA, geneB = getgenesfromfusion(fusion, nameregex) cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) - queries.append(StructuralVariantQuery(gene1, gene2, 'FUSION', cancertype)) + queries.append(StructuralVariantQuery(geneA, geneB, 'FUSION', cancertype)) + rows.append(row) + + if len(queries) == POST_QUERIES_THRESHOLD: + annotations = pull_structural_variant_info(queries) + append_annotation_to_file(outf, newcols, rows, annotations) + queries = [] + rows = [] + + if len(queries) > 0: + annotations = pull_structural_variant_info(queries) + append_annotation_to_file(outf, newcols, rows, annotations) + outf.close() + +def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap): + if os.path.isfile(previousoutfile): + cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) + outf = open(outfile, 'w+') + with open(svdata, 'rU') as infile: + reader = csv.reader(infile, delimiter='\t') + + headers = readheaders(reader) + + ncols = headers["length"] + + if ncols == 0: + return + + outf.write(headers['^-$']) + oncokb_annotation_headers = get_oncokb_annotation_column_headers() + outf.write("\t") + outf.write("\t".join(oncokb_annotation_headers)) + outf.write("\n") + + newcols = ncols + len(oncokb_annotation_headers) + + igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER) + igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER) + isvtype = geIndexOfHeader(headers, SV_TYPE_HEADER) + isample = geIndexOfHeader(headers, SAMPLE_HEADERS) + icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS) + + i = 0 + queries = [] + rows = [] + for row in reader: + i = i + 1 + if i % POST_QUERIES_THRESHOLD == 0: + log.info(i) + + row = padrow(row, ncols) + + sample = row[isample] + + if sampleidsfilter and sample not in sampleidsfilter: + continue + + if igeneA < 0 or igeneB < 0: + log.warning("Please specify two genes") + continue + + svtype = None + if isvtype >= 0: + svtype = row[isvtype].upper() + if svtype not in SV_TYPES: + svtype = None + if svtype is None: + svtype = UNKNOWN + + cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) + + sv_query = StructuralVariantQuery(row[igeneA], row[igeneB], svtype, cancertype) + queries.append(sv_query) rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD: @@ -868,8 +947,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile): if ncols == 0: return - igene1 = geIndexOfHeader(headers, ['GENE1'] + HUGO_HEADERS) # fusion - igene2 = geIndexOfHeader(headers, ['GENE2'] + HUGO_HEADERS) # fusion + igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER) # fusion + igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER) # fusion ifusion = geIndexOfHeader(headers, ['FUSION']) ihugo = geIndexOfHeader(headers, HUGO_HEADERS) @@ -882,7 +961,7 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile): # imutationeffect = headers['MUTATION_EFFECT'] ioncogenic = headers['ONCOGENIC'] - isfusion = (igene1 != -1 & igene2 != -1) or ifusion != -1 + isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1 ismutorcna = ihugo != -1 & ihgvs != -1 if not isfusion and not ismutorcna: @@ -919,8 +998,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile): hugo = row[ihugo] alteration = row[ihgvs] - gene1 = row[igene1] - gene2 = row[igene2] + geneA = row[igeneA] + geneB = row[igeneB] variant = "NA" if ismutorcna: @@ -929,10 +1008,10 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile): if ifusion != -1: variant = row[ifusion] else: - if gene1 == gene2: - variant = gene1 + " intragenic deletion" + if geneA == geneB: + variant = geneA + " intragenic deletion" else: - variant = gene1 + "-" + gene2 + " fusion" + variant = geneA + "-" + geneB + " fusion" if oncogenic == "oncogenic" or oncogenic == "likely oncogenic" or oncogenic == "predicted oncogenic": sampledrivers[sample].append(variant) @@ -1381,6 +1460,8 @@ def appendoncokbcitations(citations, pmids, abstracts): class Gene: def __init__(self, hugo): self.hugoSymbol = hugo + def __str__(self): + return self.hugoSymbol class ProteinChangeQuery: @@ -1459,6 +1540,8 @@ def __init__(self, hugoA, hugoB, structural_variant_type, cancertype): self.functionalFusion = is_functional_fusion self.structuralVariantType = structural_variant_type.upper() self.tumorType = cancertype + def __str__(self): + return "\t".join([self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, self.tumorType]) def pull_protein_change_info(queries, annotate_hotspot): diff --git a/FusionAnnotator.py b/FusionAnnotator.py index 1f11af2..40c327f 100644 --- a/FusionAnnotator.py +++ b/FusionAnnotator.py @@ -40,7 +40,7 @@ def main(argv): readCancerTypes(argv.input_clinical_file, cancertypemap) log.info('annotating %s ...' % argv.input_file) - processsv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, + process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.structural_variant_name_format) log.info('done!') diff --git a/README.md b/README.md index feb68da..d2fcf07 100644 --- a/README.md +++ b/README.md @@ -38,12 +38,21 @@ We use GISTIC 2.0 format. For more information, please see https://docs.cbioport Get more details on the command line using `python CnaAnnotator.py -h`. ### Fusion -OncoKB offers to anntoate the strucutal variant. But in annotator, we only annotate the functional fusion. +OncoKB offers to annotate functional fusions. The fusion format for intragenic deletion is `GENE-intragenic` or `GENE-GENE`. For other fusions, please use `GENEA-GENEB` or `GENEA-GENEB Fusion`. Get more details on the command line using `python FusionAnnotator.py -h`. +### Structural Variant +OncoKB offers to annotate structural variant. +The types supported are DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. +All other types will be converted to UNKNOWN. + +All structural variants with two different gene partners, they will be considered as functional fusions. + +Get more details on the command line using `python StructuralVariantAnnotator.py -h`. + ### Clinical Data (Combine MAF+CNA+Fusion) You can comebine all annotation on sample/patient level using the clinical data annotator. diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py new file mode 100644 index 0000000..f7edfca --- /dev/null +++ b/StructuralVariantAnnotator.py @@ -0,0 +1,64 @@ +#!/usr/bin/python + +import argparse +from AnnotatorCore import * +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger('StructuralVariantAnnotator') + +def main(argv): + if argv.help: + log.info('\n' + 'StructuralVariantAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ]\n' + ' Essential structural variant columns (case insensitive):\n' + ' GENEA: Hugo gene symbol for gene A\n' + ' GENEB: Hugo gene symbol for gene B\n' + ' SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n' + ' TUMOR_SAMPLE_BARCODE: sample ID\n' + ' Essential clinical columns:\n' + ' SAMPLE_ID: sample ID\n' + ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' + ' Cancer type will be assigned based on the following priority:\n' + ' 1) ONCOTREE_CODE in clinical data file\n' + ' 2) ONCOTREE_CODE exist in structural variant\n' + ' 3) default tumor type (-t)\n' + ' Default OncoKB base url is https://www.oncokb.org') + sys.exit() + if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': + log.info('for help: python StructuralVariantAnnotator.py -h') + sys.exit(2) + if argv.sample_ids_filter: + setsampleidsfileterfile(argv.sample_ids_filter) + if argv.cancer_hotspots_base_url: + setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url) + if argv.oncokb_api_url: + setoncokbbaseurl(argv.oncokb_api_url) + setoncokbapitoken(argv.oncokb_api_bearer_token) + + cancertypemap = {} + if argv.input_clinical_file: + readCancerTypes(argv.input_clinical_file, cancertypemap) + + log.info('annotating %s ...' % argv.input_file) + process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap) + + log.info('done!') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(add_help=False) + # ArgumentParser doesn't accept "store_true" and "type=" at the same time. + parser.add_argument('-h', dest='help', action="store_true", default=False) + parser.add_argument('-i', dest='input_file', default='', type=str) + parser.add_argument('-o', dest='output_file', default='', type=str) + parser.add_argument('-p', dest='previous_result_file', default='', type=str) + parser.add_argument('-c', dest='input_clinical_file', default='', type=str) + parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str) + parser.add_argument('-t', dest='default_cancer_type', default='', type=str) + parser.add_argument('-u', dest='oncokb_api_url', default='', type=str) + parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str) + parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) + parser.set_defaults(func=main) + + args = parser.parse_args() + args.func(args) diff --git a/data/example_sv.txt b/data/example_sv.txt new file mode 100644 index 0000000..526466a --- /dev/null +++ b/data/example_sv.txt @@ -0,0 +1,8 @@ +Tumor_Sample_Barcode GeneA GeneB Sv_Type +TCGA-02-0033-01 MLL2 MLL2 DELETION +TCGA-05-4417-01 ALK EML4 FUSION +TCGA-06-0155-01 EGFR EGFR DELETION +TCGA-06-0155-01 TMPRSS2 ERG FUSION +TCGA-05-4417-01 TMPRSS2 ERG FUSION +TCGA-06-0155-01 ERBB2 ERBB2 DELETION +TCGA-02-0033-01 ETV6 RUNX1 FUSION diff --git a/example.sh b/example.sh index fa9379e..a141091 100644 --- a/example.sh +++ b/example.sh @@ -16,6 +16,9 @@ OATYPICALALT="data/example_atypical_alterations.oncokb.txt" IF="data/example_fusions.txt" OF="data/example_fusions.oncokb.txt" +ISV="data/example_sv.txt" +OSV="data/example_sv.oncokb.txt" + ICNA="data/example_cna.txt" OCNA="data/example_cna.oncokb.txt" @@ -38,7 +41,8 @@ python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN" python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN" python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN" +python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN" python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN" -python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF" +python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV" python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10 python GenerateReadMe.py -o "$README" diff --git a/test_Annotation.py b/test_Annotation.py index 21a4953..22d6a8f 100644 --- a/test_Annotation.py +++ b/test_Annotation.py @@ -193,7 +193,7 @@ def test_check_genomic_change(): assert annotation[HIGHEST_LEVEL_INDEX] == '' @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") -def test_check_fusions(): +def test_check_structural_variants(): queries = [ StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'NSCLC'), StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'Melanoma'),