Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support structural variant annotator #161

Merged
merged 6 commits into from
Apr 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 110 additions & 27 deletions AnnotatorCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,13 @@ def setsampleidsfileterfile(f):
GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]

# columns for structural variant annotation
SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
SV_TYPE_HEADER = ['SV_CLASS_NAME', 'SV_TYPE']
SV_TYPES = ['DELETION', 'TRANSLOCATION', 'DUPLICATION', 'INSERTION', 'INVERSION', 'FUSION', 'UNKNOWN']

UNKNOWN = 'UNKNOWN'

class QueryType(Enum):
HGVSP_SHORT = 'HGVSP_SHORT'
Expand Down Expand Up @@ -649,19 +656,19 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol
def getgenesfromfusion(fusion, nameregex=None):
GENES_REGEX = "([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex
searchresult = re.search(GENES_REGEX, fusion, flags=re.IGNORECASE)
gene1=None
gene2=None
geneA=None
geneB=None
if searchresult:
parts = searchresult.group(1).split("-")
gene1 = parts[0]
gene2 = gene1
geneA = parts[0]
geneB = geneA
if len(parts) > 1 and parts[1] != "intragenic":
gene2 = parts[1]
geneB = parts[1]
else:
gene1=gene2=fusion
return gene1, gene2
geneA=geneB=fusion
return geneA, geneB

def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex):
if os.path.isfile(previousoutfile):
cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
outf = open(outfile, 'w+')
Expand All @@ -683,8 +690,8 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap

newcols = ncols + len(oncokb_annotation_headers)

igene1 = geIndexOfHeader(headers, ['GENE1'])
igene2 = geIndexOfHeader(headers, ['GENE2'])
igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)
igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)
ifusion = geIndexOfHeader(headers, FUSION_HEADERS)
isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
Expand All @@ -704,20 +711,92 @@ def processsv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap
if sampleidsfilter and sample not in sampleidsfilter:
continue

gene1 = None
gene2 = None
if igene1 >= 0:
gene1 = row[igene1]
if igene2 >= 0:
gene2 = row[igene2]
if igene1 < 0 and igene2 < 0 and ifusion >= 0:
geneA = None
geneB = None
if igeneA >= 0:
geneA = row[igeneA]
if igeneB >= 0:
geneB = row[igeneB]
if igeneA < 0 and igeneB < 0 and ifusion >= 0:
fusion = row[ifusion]
gene1, gene2 = getgenesfromfusion(fusion, nameregex)
geneA, geneB = getgenesfromfusion(fusion, nameregex)

cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)


queries.append(StructuralVariantQuery(gene1, gene2, 'FUSION', cancertype))
queries.append(StructuralVariantQuery(geneA, geneB, 'FUSION', cancertype))
rows.append(row)

if len(queries) == POST_QUERIES_THRESHOLD:
annotations = pull_structural_variant_info(queries)
append_annotation_to_file(outf, newcols, rows, annotations)
queries = []
rows = []

if len(queries) > 0:
annotations = pull_structural_variant_info(queries)
append_annotation_to_file(outf, newcols, rows, annotations)
outf.close()

def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap):
if os.path.isfile(previousoutfile):
cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
outf = open(outfile, 'w+')
with open(svdata, 'rU') as infile:
reader = csv.reader(infile, delimiter='\t')

headers = readheaders(reader)

ncols = headers["length"]

if ncols == 0:
return

outf.write(headers['^-$'])
oncokb_annotation_headers = get_oncokb_annotation_column_headers()
outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
outf.write("\n")

newcols = ncols + len(oncokb_annotation_headers)

igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER)
igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER)
isvtype = geIndexOfHeader(headers, SV_TYPE_HEADER)
isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)

i = 0
queries = []
rows = []
for row in reader:
i = i + 1
if i % POST_QUERIES_THRESHOLD == 0:
log.info(i)

row = padrow(row, ncols)

sample = row[isample]

if sampleidsfilter and sample not in sampleidsfilter:
continue

if igeneA < 0 or igeneB < 0:
log.warning("Please specify two genes")
continue

svtype = None
if isvtype >= 0:
svtype = row[isvtype].upper()
if svtype not in SV_TYPES:
svtype = None
if svtype is None:
svtype = UNKNOWN

cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)

sv_query = StructuralVariantQuery(row[igeneA], row[igeneB], svtype, cancertype)
queries.append(sv_query)
rows.append(row)

if len(queries) == POST_QUERIES_THRESHOLD:
Expand Down Expand Up @@ -868,8 +947,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
if ncols == 0:
return

igene1 = geIndexOfHeader(headers, ['GENE1'] + HUGO_HEADERS) # fusion
igene2 = geIndexOfHeader(headers, ['GENE2'] + HUGO_HEADERS) # fusion
igeneA = geIndexOfHeader(headers, SV_GENEA_HEADER) # fusion
igeneB = geIndexOfHeader(headers, SV_GENEB_HEADER) # fusion
ifusion = geIndexOfHeader(headers, ['FUSION'])

ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
Expand All @@ -882,7 +961,7 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
# imutationeffect = headers['MUTATION_EFFECT']
ioncogenic = headers['ONCOGENIC']

isfusion = (igene1 != -1 & igene2 != -1) or ifusion != -1
isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1
ismutorcna = ihugo != -1 & ihgvs != -1

if not isfusion and not ismutorcna:
Expand Down Expand Up @@ -919,8 +998,8 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):

hugo = row[ihugo]
alteration = row[ihgvs]
gene1 = row[igene1]
gene2 = row[igene2]
geneA = row[igeneA]
geneB = row[igeneB]

variant = "NA"
if ismutorcna:
Expand All @@ -929,10 +1008,10 @@ def processclinicaldata(annotatedmutfiles, clinicalfile, outfile):
if ifusion != -1:
variant = row[ifusion]
else:
if gene1 == gene2:
variant = gene1 + " intragenic deletion"
if geneA == geneB:
variant = geneA + " intragenic deletion"
else:
variant = gene1 + "-" + gene2 + " fusion"
variant = geneA + "-" + geneB + " fusion"

if oncogenic == "oncogenic" or oncogenic == "likely oncogenic" or oncogenic == "predicted oncogenic":
sampledrivers[sample].append(variant)
Expand Down Expand Up @@ -1381,6 +1460,8 @@ def appendoncokbcitations(citations, pmids, abstracts):
class Gene:
def __init__(self, hugo):
self.hugoSymbol = hugo
def __str__(self):
return self.hugoSymbol


class ProteinChangeQuery:
Expand Down Expand Up @@ -1459,6 +1540,8 @@ def __init__(self, hugoA, hugoB, structural_variant_type, cancertype):
self.functionalFusion = is_functional_fusion
self.structuralVariantType = structural_variant_type.upper()
self.tumorType = cancertype
def __str__(self):
return "\t".join([self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, self.tumorType])


def pull_protein_change_info(queries, annotate_hotspot):
Expand Down
2 changes: 1 addition & 1 deletion FusionAnnotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def main(argv):
readCancerTypes(argv.input_clinical_file, cancertypemap)

log.info('annotating %s ...' % argv.input_file)
processsv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
cancertypemap, argv.structural_variant_name_format)

log.info('done!')
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,21 @@ We use GISTIC 2.0 format. For more information, please see https://docs.cbioport
Get more details on the command line using `python CnaAnnotator.py -h`.

### Fusion
OncoKB offers to anntoate the strucutal variant. But in annotator, we only annotate the functional fusion.
OncoKB offers to annotate functional fusions.
The fusion format for intragenic deletion is `GENE-intragenic` or `GENE-GENE`.
For other fusions, please use `GENEA-GENEB` or `GENEA-GENEB Fusion`.

Get more details on the command line using `python FusionAnnotator.py -h`.

### Structural Variant
OncoKB offers to annotate structural variant.
The types supported are DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN.
All other types will be converted to UNKNOWN.

All structural variants with two different gene partners, they will be considered as functional fusions.

Get more details on the command line using `python StructuralVariantAnnotator.py -h`.

### Clinical Data (Combine MAF+CNA+Fusion)
You can comebine all annotation on sample/patient level using the clinical data annotator.

Expand Down
64 changes: 64 additions & 0 deletions StructuralVariantAnnotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/python

import argparse
from AnnotatorCore import *
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('StructuralVariantAnnotator')

def main(argv):
if argv.help:
log.info('\n'
'StructuralVariantAnnotator.py -i <input structural variant file> -o <output structural variant file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u <oncokb api url>] [-b <oncokb api bear token>]\n'
' Essential structural variant columns (case insensitive):\n'
' GENEA: Hugo gene symbol for gene A\n'
' GENEB: Hugo gene symbol for gene B\n'
' SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n'
' TUMOR_SAMPLE_BARCODE: sample ID\n'
' Essential clinical columns:\n'
' SAMPLE_ID: sample ID\n'
' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n'
' Cancer type will be assigned based on the following priority:\n'
' 1) ONCOTREE_CODE in clinical data file\n'
' 2) ONCOTREE_CODE exist in structural variant\n'
' 3) default tumor type (-t)\n'
' Default OncoKB base url is https://www.oncokb.org')
sys.exit()
if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
log.info('for help: python StructuralVariantAnnotator.py -h')
sys.exit(2)
if argv.sample_ids_filter:
setsampleidsfileterfile(argv.sample_ids_filter)
if argv.cancer_hotspots_base_url:
setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
if argv.oncokb_api_url:
setoncokbbaseurl(argv.oncokb_api_url)
setoncokbapitoken(argv.oncokb_api_bearer_token)

cancertypemap = {}
if argv.input_clinical_file:
readCancerTypes(argv.input_clinical_file, cancertypemap)

log.info('annotating %s ...' % argv.input_file)
process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap)

log.info('done!')


if __name__ == "__main__":
parser = argparse.ArgumentParser(add_help=False)
# ArgumentParser doesn't accept "store_true" and "type=" at the same time.
parser.add_argument('-h', dest='help', action="store_true", default=False)
parser.add_argument('-i', dest='input_file', default='', type=str)
parser.add_argument('-o', dest='output_file', default='', type=str)
parser.add_argument('-p', dest='previous_result_file', default='', type=str)
parser.add_argument('-c', dest='input_clinical_file', default='', type=str)
parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str)
parser.add_argument('-t', dest='default_cancer_type', default='', type=str)
parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
parser.set_defaults(func=main)

args = parser.parse_args()
args.func(args)
8 changes: 8 additions & 0 deletions data/example_sv.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Tumor_Sample_Barcode GeneA GeneB Sv_Type
TCGA-02-0033-01 MLL2 MLL2 DELETION
TCGA-05-4417-01 ALK EML4 FUSION
TCGA-06-0155-01 EGFR EGFR DELETION
TCGA-06-0155-01 TMPRSS2 ERG FUSION
TCGA-05-4417-01 TMPRSS2 ERG FUSION
TCGA-06-0155-01 ERBB2 ERBB2 DELETION
TCGA-02-0033-01 ETV6 RUNX1 FUSION
6 changes: 5 additions & 1 deletion example.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ OATYPICALALT="data/example_atypical_alterations.oncokb.txt"
IF="data/example_fusions.txt"
OF="data/example_fusions.oncokb.txt"

ISV="data/example_sv.txt"
OSV="data/example_sv.oncokb.txt"

ICNA="data/example_cna.txt"
OCNA="data/example_cna.oncokb.txt"

Expand All @@ -38,7 +41,8 @@ python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN"
python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN"

python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF"
python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10
python GenerateReadMe.py -o "$README"
2 changes: 1 addition & 1 deletion test_Annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def test_check_genomic_change():
assert annotation[HIGHEST_LEVEL_INDEX] == ''

@pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
def test_check_fusions():
def test_check_structural_variants():
queries = [
StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'NSCLC'),
StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'Melanoma'),
Expand Down