diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index d51cb3f..5a0f582 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -10,7 +10,24 @@ on: branches: [ master, next-minor-release ] jobs: - build: + lint: + name: Linting using flake8 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: Run flake8 + uses: julianwachholz/flake8-action@v2 + with: + checkName: "Python Lint" + path: . + config: flake8.ini + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + pytest: + needs: lint runs-on: ${{ matrix.os }} strategy: matrix: @@ -27,15 +44,9 @@ jobs: PYTHON_VERSION: ${{ matrix.python-version }} run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install pytest if [[ $PYTHON_VERSION =~ ^2\.[0-9]$ ]]; then pip install -r requirements/common.txt -r requirements/pip2.7.txt; fi if [[ $PYTHON_VERSION =~ ^3\.[0-9]$ ]]; then pip install -r requirements/common.txt -r requirements/pip3.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest env: ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }} @@ -58,7 +69,7 @@ jobs: PYTHON_VERSION: ${{ matrix.python-version }} run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install pytest if ( $env:PYTHON_VERSION -match '^2\.[0-9]$' ) { pip install -r requirements/common.txt -r requirements/pip2.7.txt @@ -67,12 +78,6 @@ jobs: { pip install -r requirements/common.txt -r requirements/pip3.txt } - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest env: ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }} diff --git a/.version-level b/.version-level index acb503f..9eb7b90 100644 --- a/.version-level +++ b/.version-level @@ -1 +1 @@ -minor +patch diff --git a/AnnotatorCore.py b/AnnotatorCore.py index 38bfe2e..4fe038f 100644 --- a/AnnotatorCore.py +++ b/AnnotatorCore.py @@ -1,22 +1,18 @@ #!/usr/bin/python import datetime import json -import sys import csv -from enum import Enum - import requests import os.path import logging import re -import matplotlib +import ctypes as ct + +from enum import Enum from requests.adapters import HTTPAdapter from urllib3 import Retry - -matplotlib.use('Agg') -import matplotlib.pyplot as plt from datetime import date -import ctypes as ct + logging.basicConfig(level=logging.INFO) logging.getLogger("requests").setLevel(logging.WARNING) @@ -29,9 +25,9 @@ API_REQUEST_RETRY_STATUS_FORCELIST = [429, 500, 502, 503, 504] -csv.field_size_limit(int(ct.c_ulong(-1).value // 2)) # Deal with overflow problem on Windows, https://stackoverflow.co/120m/questions/15063936/csv-error-field-larger-than-field-limit-131072 +csv.field_size_limit(int(ct.c_ulong(-1).value // 2)) # Deal with overflow problem on Windows, https://stackoverflow.co/120m/questions/15063936/csv-error-field-larger-than-field-limit-131072 sizeLimit = csv.field_size_limit() -csv.field_size_limit(sizeLimit) # for reading large files +csv.field_size_limit(sizeLimit) # for reading large files oncokb_api_url = "https://www.oncokb.org/api" oncokb_annotation_api_url = oncokb_api_url + "/v1" @@ -45,21 +41,31 @@ def setoncokbbaseurl(u): oncokb_api_url = u.rstrip('/') + '/api' oncokb_annotation_api_url = oncokb_api_url + '/v1' + def setoncokbapitoken(t): global oncokb_api_bearer_token oncokb_api_bearer_token = t.strip() + cancerhotspotsbaseurl = "http://www.cancerhotspots.org" + + def setcancerhotspotsbaseurl(u): global cancerhotspotsbaseurl cancerhotspotsbaseurl = u + _3dhotspotsbaseurl = "http://www.3dhotspots.org" + + def set3dhotspotsbaseurl(u): global _3dhotspotsbaseurl _3dhotspotsbaseurl = u + sampleidsfilter = None + + def setsampleidsfileterfile(f): global sampleidsfilter content = [line.rstrip() for line in open(f)] @@ -186,7 +192,8 @@ def setsampleidsfileterfile(f): GC_REF_ALLELE_HEADER = 'REFERENCE_ALLELE' GC_VAR_ALLELE_1_HEADER = 'TUMOR_SEQ_ALLELE1' GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2' -GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER] +GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, + GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER] # columns for structural variant annotation SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1'] @@ -196,6 +203,7 @@ def setsampleidsfileterfile(f): UNKNOWN = 'UNKNOWN' + class QueryType(Enum): HGVSP_SHORT = 'HGVSP_SHORT' HGVSP = 'HGVSP' @@ -218,15 +226,18 @@ class ReferenceGenome(Enum): POST_QUERIES_THRESHOLD = 200 POST_QUERIES_THRESHOLD_GC_HGVSG = 100 + def getOncokbInfo(): - ret = ['Files annotated on ' + date.today().strftime('%m/%d/%Y') + "\nOncoKB API URL: "+oncokb_annotation_api_url] + ret = ['Files annotated on ' + date.today().strftime('%m/%d/%Y') + "\nOncoKB API URL: " + oncokb_annotation_api_url] try: info = requests.get(oncokb_annotation_api_url + "/info", timeout=REQUEST_TIMEOUT).json() - ret.append('\nOncoKB data version: ' + info['dataVersion']['version']+', released on ' + info['dataVersion']['date']) - except: + ret.append( + '\nOncoKB data version: ' + info['dataVersion']['version'] + ', released on ' + info['dataVersion']['date']) + except Exception: log.error("error when fetch OncoKB info") return ''.join(ret) + def validate_oncokb_token(): if oncokb_api_bearer_token is None or not oncokb_api_bearer_token: log.error("Please specify your OncoKB token") @@ -253,7 +264,7 @@ def validate_oncokb_token(): reason = response_json["title"] if response_json["detail"]: reason = response_json["detail"] - except: + except Exception: reason = response.reason log.error("Error when validating token, " + "reason: %s" % reason) @@ -265,6 +276,7 @@ def generateReadme(outfile): outf.write(getOncokbInfo()) outf.close() + def gethotspots(url, type): hotspots = {} response = requests.get(url, timeout=REQUEST_TIMEOUT) @@ -281,16 +293,16 @@ def gethotspots(url, type): for i in range(start, end + 1): hotspots[gene].add(i) else: - log.error("error when processing %s \n" % url + - "reason: %s" % response.reason) + log.error("error when processing %s \n" % url + "reason: %s" % response.reason) return hotspots + def requests_retry_session( - retries=3, - backoff_factor=0.3, - status_forcelist=API_REQUEST_RETRY_STATUS_FORCELIST, - allowed_methods=('GET', 'HEAD'), - session=None, + retries=3, + backoff_factor=0.3, + status_forcelist=API_REQUEST_RETRY_STATUS_FORCELIST, + allowed_methods=('GET', 'HEAD'), + session=None, ): session = session or requests.Session() retry = Retry( @@ -306,13 +318,15 @@ def requests_retry_session( session.mount('https://', adapter) return session + def makeoncokbpostrequest(url, body): headers = { 'Content-Type': 'application/json', 'Authorization': 'Bearer %s' % oncokb_api_bearer_token } - return requests_retry_session(allowed_methods=["POST"]).post(url, headers=headers, data=json.dumps(body, default=lambda o: o.__dict__), - timeout=REQUEST_TIMEOUT) + return requests_retry_session(allowed_methods=["POST"]).post(url, headers=headers, + data=json.dumps(body, default=lambda o: o.__dict__), + timeout=REQUEST_TIMEOUT) def makeoncokbgetrequest(url): @@ -325,9 +339,10 @@ def makeoncokbgetrequest(url): _3dhotspots = None + def init_3d_hotspots(): global _3dhotspots - _3dhotspots = gethotspots(_3dhotspotsbaseurl+"/api/hotspots/3d", None) + _3dhotspots = gethotspots(_3dhotspotsbaseurl + "/api/hotspots/3d", None) conversiondict = {'Ala': 'A', @@ -354,13 +369,16 @@ def init_3d_hotspots(): 'Glx': 'Z' } conversionlist = conversiondict.keys() + + def conversion(hgvs): - threecharactersearch = re.findall('[a-zA-Z]{3}\d+', hgvs, flags=re.IGNORECASE) + threecharactersearch = re.findall(r'[a-zA-Z]{3}\d+', hgvs, flags=re.IGNORECASE) if threecharactersearch: if any(letters.lower() in hgvs.lower() for letters in conversionlist): return replace_all(hgvs) return hgvs + def replace_all(hgvs): # Author: Thomas Glaessle pattern = re.compile('|'.join(conversionlist), re.IGNORECASE) @@ -391,10 +409,12 @@ def get_tumor_type_from_row(row, row_index, defaultCancerType, icancertype, canc if sample in cancerTypeMap: cancertype = cancerTypeMap[sample] if cancertype == "": - log.info("Cancer type for the sample should be defined for a more accurate result\nline %s: %s\n" % (row_index, row)) + log.info( + "Cancer type for the sample should be defined for a more accurate result\nline %s: %s\n" % (row_index, row)) # continue return cancertype + def has_desired_headers(desired_headers, file_headers): has_required_headers = True for header in desired_headers: @@ -417,7 +437,8 @@ def resolve_query_type(user_input_query_type, headers): if selected_query_type is None and HGVSG_HEADER in headers: selected_query_type = QueryType.HGVSG - if selected_query_type is None and has_desired_headers(REQUIRED_QUERY_TYPE_COLUMNS[QueryType.GENOMIC_CHANGE], headers): + if selected_query_type is None and has_desired_headers(REQUIRED_QUERY_TYPE_COLUMNS[QueryType.GENOMIC_CHANGE], + headers): selected_query_type = QueryType.GENOMIC_CHANGE # default to HGVSp_Short @@ -425,12 +446,15 @@ def resolve_query_type(user_input_query_type, headers): selected_query_type = QueryType.HGVSP_SHORT # check the file has required columns - if has_desired_headers(REQUIRED_QUERY_TYPE_COLUMNS[selected_query_type], headers) == False: + if has_desired_headers(REQUIRED_QUERY_TYPE_COLUMNS[selected_query_type], headers) is False: # when it is False, it will never be GENOMIC_CHANGE. For other types, we need to check whether ALTERATION column is available if ALTERATION_HEADER not in headers: - raise Exception("The file does not have required columns " - + ', '.join(REQUIRED_QUERY_TYPE_COLUMNS[user_input_query_type]) - + " for the query type: " + user_input_query_type.value) + raise Exception( + "The file does not have required columns " + + ', '.join(REQUIRED_QUERY_TYPE_COLUMNS[user_input_query_type]) + + " for the query type: " + + user_input_query_type.value + ) return selected_query_type @@ -469,7 +493,7 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy outf.write("\tIS-A-HOTSPOT") outf.write("\tIS-A-3D-HOTSPOT") newncols += 2 - + oncokb_annotation_headers = get_oncokb_annotation_column_headers() outf.write("\t") @@ -485,7 +509,8 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy cancerTypeMap, annotatehotspots, default_reference_genome) if (query_type == QueryType.HGVSP): - process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType, + process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols, + defaultCancerType, cancerTypeMap, annotatehotspots, default_reference_genome) if (query_type == QueryType.HGVSG): @@ -493,7 +518,8 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy cancerTypeMap, annotatehotspots, default_reference_genome) if (query_type == QueryType.GENOMIC_CHANGE): - process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap, annotatehotspots, default_reference_genome) + process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap, + annotatehotspots, default_reference_genome) outf.close() @@ -508,27 +534,30 @@ def get_cell_content(row, index, return_empty_string=False): def get_oncokb_annotation_column_headers(): - headers = [ANNOTATED_HEADER, GENE_IN_ONCOKB_HEADER, VARIANT_IN_ONCOKB_HEADER, "MUTATION_EFFECT", "MUTATION_EFFECT_CITATIONS", + headers = [ANNOTATED_HEADER, GENE_IN_ONCOKB_HEADER, VARIANT_IN_ONCOKB_HEADER, "MUTATION_EFFECT", + "MUTATION_EFFECT_CITATIONS", "ONCOGENIC"] - for l in sorted(levels): - headers.append(l) + for level in sorted(levels): + headers.append(level) headers.append("HIGHEST_LEVEL") headers.append("HIGHEST_SENSITIVE_LEVEL") headers.append("HIGHEST_RESISTANCE_LEVEL") headers.append("TX_CITATIONS") - for l in dxLevels: - headers.append(l) + for dx_level in dxLevels: + headers.append(dx_level) headers.append("HIGHEST_DX_LEVEL") headers.append("DX_CITATIONS") - for l in pxLevels: - headers.append(l) + for px_level in pxLevels: + headers.append(px_level) headers.append("HIGHEST_PX_LEVEL") headers.append("PX_CITATIONS") return headers -def process_alteration(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap, + +def process_alteration(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, + defaultCancerType, cancerTypeMap, annotatehotspots, default_reference_genome): ihugo = geIndexOfHeader(maf_headers, HUGO_HEADERS) iconsequence = geIndexOfHeader(maf_headers, CONSEQUENCE_HEADERS) @@ -538,7 +567,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names iend = geIndexOfHeader(maf_headers, PROTEIN_END_HEADERS) iproteinpos = geIndexOfHeader(maf_headers, PROTEIN_POSITION_HEADERS) icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS) - ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS) + ireferencegenome = geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS) posp = re.compile('[0-9]+') @@ -568,7 +597,8 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names hgvs = hgvs[2:] cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) - reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome) + reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), + default_reference_genome) hgvs = conversion(hgvs) @@ -599,15 +629,16 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD: - annotations = pull_protein_change_info(queries,annotatehotspots) + annotations = pull_protein_change_info(queries, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_protein_change_info(queries,annotatehotspots) + annotations = pull_protein_change_info(queries, annotatehotspots) append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) + # this method is from genome-nexus annotation-tools # https://github.com/genome-nexus/annotation-tools/blob/53ff7f7fe673e961282f871ebc78d2ecc0831919/standardize_mutation_data.py def get_var_allele(ref_allele, tumor_seq_allele1, tumor_seq_allele2): @@ -622,12 +653,14 @@ def get_var_allele(ref_allele, tumor_seq_allele1, tumor_seq_allele2): tumor_seq_allele = tumor_seq_allele2 elif ref_allele != tumor_seq_allele1: tumor_seq_allele = tumor_seq_allele1 - except: + except Exception: tumor_seq_allele = "" return tumor_seq_allele -def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap, annotatehotspots, default_reference_genome): + +def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap, + annotatehotspots, default_reference_genome): ichromosome = geIndexOfHeader(maf_headers, [GC_CHROMOSOME_HEADER]) istart = geIndexOfHeader(maf_headers, [GC_START_POSITION_HEADER]) iend = geIndexOfHeader(maf_headers, [GC_END_POSITION_HEADER]) @@ -637,9 +670,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS) icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS) - ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS) - - posp = re.compile('[0-9]+') + ireferencegenome = geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS) i = 0 queries = [] @@ -657,7 +688,8 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc continue cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) - reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome) + reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), + default_reference_genome) chromosome = get_cell_content(row, ichromosome, True) start = get_cell_content(row, istart, True) @@ -672,20 +704,22 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc rows.append(row) if len(queries) == POST_QUERIES_THRESHOLD_GC_HGVSG: - annotations = pull_genomic_change_info(queries,annotatehotspots) - append_annotation_to_file(outf, ncols+nannotationcols, rows, annotations) + annotations = pull_genomic_change_info(queries, annotatehotspots) + append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_genomic_change_info(queries,annotatehotspots) - append_annotation_to_file(outf, ncols+nannotationcols, rows, annotations) + annotations = pull_genomic_change_info(queries, annotatehotspots) + append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) -def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap, annotatehotspots, default_reference_genome): + +def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, + cancerTypeMap, annotatehotspots, default_reference_genome): ihgvsg = geIndexOfHeader(maf_headers, alteration_column_names) isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS) icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS) - ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS) + ireferencegenome = geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS) i = 0 queries = [] @@ -705,7 +739,8 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol hgvsg = get_cell_content(row, ihgvsg) cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) - reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome) + reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), + default_reference_genome) if hgvsg is None: if annotatehotspots: @@ -721,20 +756,20 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol if len(queries) == POST_QUERIES_THRESHOLD_GC_HGVSG: annotations = pull_hgvsg_info(queries, annotatehotspots) - append_annotation_to_file(outf, ncols+nannotationcols, rows, annotations) + append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) queries = [] rows = [] if len(queries) > 0: - annotations = pull_hgvsg_info(queries,annotatehotspots) - append_annotation_to_file(outf, ncols+nannotationcols, rows, annotations) + annotations = pull_hgvsg_info(queries, annotatehotspots) + append_annotation_to_file(outf, ncols + nannotationcols, rows, annotations) def getgenesfromfusion(fusion, nameregex=None): - GENES_REGEX = "([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex + GENES_REGEX = r"([A-Za-z\d]+-[A-Za-z\d]+)" if nameregex is None else nameregex searchresult = re.search(GENES_REGEX, fusion, flags=re.IGNORECASE) - geneA=None - geneB=None + geneA = None + geneB = None if searchresult: parts = searchresult.group(1).split("-") geneA = parts[0] @@ -742,9 +777,10 @@ def getgenesfromfusion(fusion, nameregex=None): if len(parts) > 1 and parts[1] != "intragenic": geneB = parts[1] else: - geneA=geneB=fusion + geneA = geneB = fusion return geneA, geneB + def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap, nameregex): if os.path.isfile(previousoutfile): cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) @@ -800,7 +836,6 @@ def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) - queries.append(StructuralVariantQuery(geneA, geneB, 'FUSION', cancertype)) rows.append(row) @@ -814,7 +849,8 @@ def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy annotations = pull_structural_variant_info(queries) append_annotation_to_file(outf, newcols, rows, annotations) outf.close() - + + def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMap): if os.path.isfile(previousoutfile): cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap) @@ -857,11 +893,11 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa if sampleidsfilter and sample not in sampleidsfilter: continue - + if igeneA < 0 or igeneB < 0: log.warning("Please specify two genes") continue - + svtype = None if isvtype >= 0: svtype = row[isvtype].upper() @@ -871,7 +907,7 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa svtype = UNKNOWN cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample) - + sv_query = StructuralVariantQuery(row[igeneA], row[igeneB], svtype, cancertype) queries.append(sv_query) rows.append(row) @@ -1017,7 +1053,7 @@ def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cance annotate_gain_loss) else: headers, rows, queries = process_individual_cna_file(outf, cnafile, defaultCancerType, cancerTypeMap, - annotate_gain_loss) + annotate_gain_loss) ncols = len(headers) @@ -1032,6 +1068,7 @@ def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cance outf.close() + def getfirstcolumnofsampleingisticdata(headers): header0 = headers[0].lower() if header0 != "hugo_symbol" and header0 != "gene symbol": @@ -1083,13 +1120,8 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): ifusion = geIndexOfHeader(headers, ['FUSION']) ihugo = geIndexOfHeader(headers, HUGO_HEADERS) - iconsequence = geIndexOfHeader(headers, CONSEQUENCE_HEADERS) ihgvs = geIndexOfHeader(headers, HGVS_HEADERS) isample = geIndexOfHeader(headers, SAMPLE_HEADERS) - istart = geIndexOfHeader(headers, PROTEIN_START_HEADERS) - iend = geIndexOfHeader(headers, PROTEIN_END_HEADERS) - icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS) - # imutationeffect = headers['MUTATION_EFFECT'] ioncogenic = headers['ONCOGENIC'] isfusion = (igeneA != -1 & igeneB != -1) or ifusion != -1 @@ -1151,33 +1183,33 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): if oncogenic == "resistance": sample_resistance[sample].append(variant) - for l in levels: - il = geIndexOfHeader(headers, [l]) + for level in levels: + il = geIndexOfHeader(headers, [level]) if 0 <= il < len(row) and row[il] != '': - if l not in samplelevels[sample]: - samplelevels[sample][l] = [] - sampleleveltreatments[sample][l] = [] - samplelevels[sample][l].append(row[il] + "(" + variant + ")") - sampleleveltreatments[sample][l].extend(row[il].split(",")) + if level not in samplelevels[sample]: + samplelevels[sample][level] = [] + sampleleveltreatments[sample][level] = [] + samplelevels[sample][level].append(row[il] + "(" + variant + ")") + sampleleveltreatments[sample][level].extend(row[il].split(",")) - if l.startswith('LEVEL_R'): + if level.startswith('LEVEL_R'): sample_tx_resistance_count[sample][variant] = True else: sample_tx_sensitive_count[sample][variant] = True - for l in dxLevels: - il = geIndexOfHeader(headers, [l]) + for dx_level in dxLevels: + il = geIndexOfHeader(headers, [dx_level]) if 0 <= il < len(row) and row[il] != '': - if l not in samplelevels[sample]: - samplelevels[sample][l] = [] - samplelevels[sample][l].append(row[il] + "(" + variant + ")") + if dx_level not in samplelevels[sample]: + samplelevels[sample][dx_level] = [] + samplelevels[sample][dx_level].append(row[il] + "(" + variant + ")") - for l in pxLevels: - il = geIndexOfHeader(headers, [l]) + for px_level in pxLevels: + il = geIndexOfHeader(headers, [px_level]) if 0 <= il < len(row) and row[il] != '': - if l not in samplelevels[sample]: - samplelevels[sample][l] = [] - samplelevels[sample][l].append(row[il] + "(" + variant + ")") + if px_level not in samplelevels[sample]: + samplelevels[sample][px_level] = [] + samplelevels[sample][px_level].append(row[il] + "(" + variant + ")") ihighestdxlevel = geIndexOfHeader(headers, ['HIGHEST_DX_LEVEL']) if ihighestdxlevel != -1: @@ -1198,18 +1230,19 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): reader = csv.reader(clinfile, delimiter='\t') headers = readheaders(reader) outf.write(headers['^-$']) - for l in sorted(levels): - outf.write('\t' + l) + for level in sorted(levels): + outf.write('\t' + level) outf.write('\tHIGHEST_LEVEL') outf.write('\tHIGHEST_SENSITIVE_LEVEL') outf.write('\tHIGHEST_RESISTANCE_LEVEL') - for l in dxLevels: - outf.write('\t' + l) + for dx_level in dxLevels: + outf.write('\t' + dx_level) outf.write('\tHIGHEST_DX_LEVEL') - for l in pxLevels: - outf.write('\t' + l) + for px_level in pxLevels: + outf.write('\t' + px_level) outf.write('\tHIGHEST_PX_LEVEL') - outf.write('\tONCOGENIC_MUTATIONS\t#ONCOGENIC_MUTATIONS\tRESISTANCE_MUTATIONS\t#RESISTANCE_MUTATIONS\t#MUTATIONS_WITH_SENSITIVE_THERAPEUTIC_IMPLICATIONS\t#MUTATIONS_WITH_RESISTANCE_THERAPEUTIC_IMPLICATIONS\t#MUTATIONS_WITH_DIAGNOSTIC_IMPLICATIONS\t#MUTATIONS_WITH_PROGNOSTIC_IMPLICATIONS\t#MUTATIONS\n') + outf.write( + '\tONCOGENIC_MUTATIONS\t#ONCOGENIC_MUTATIONS\tRESISTANCE_MUTATIONS\t#RESISTANCE_MUTATIONS\t#MUTATIONS_WITH_SENSITIVE_THERAPEUTIC_IMPLICATIONS\t#MUTATIONS_WITH_RESISTANCE_THERAPEUTIC_IMPLICATIONS\t#MUTATIONS_WITH_DIAGNOSTIC_IMPLICATIONS\t#MUTATIONS_WITH_PROGNOSTIC_IMPLICATIONS\t#MUTATIONS\n') isample = headers['SAMPLE_ID'] for row in reader: @@ -1220,10 +1253,10 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): outf.write('\t'.join(row)) - for l in sorted(levels): + for level in sorted(levels): outf.write('\t') - if sample in samplelevels and l in samplelevels[sample]: - outf.write(";".join(samplelevels[sample][l])) + if sample in samplelevels and level in samplelevels[sample]: + outf.write(";".join(samplelevels[sample][level])) highestlevel = '' highest_sensitive_level = '' @@ -1247,20 +1280,19 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): outf.write('\t' + highest_sensitive_level) outf.write('\t' + highest_resistance_level) - for l in dxLevels: + for dx_level in dxLevels: outf.write('\t') - if sample in samplelevels and l in samplelevels[sample]: - outf.write(";".join(samplelevels[sample][l])) + if sample in samplelevels and dx_level in samplelevels[sample]: + outf.write(";".join(samplelevels[sample][dx_level])) outf.write('\t' + highestdxlevel) - for l in pxLevels: + for px_level in pxLevels: outf.write('\t') - if sample in samplelevels and l in samplelevels[sample]: - outf.write(";".join(samplelevels[sample][l])) + if sample in samplelevels and px_level in samplelevels[sample]: + outf.write(";".join(samplelevels[sample][px_level])) outf.write('\t' + highestpxlevel) - tx_sensitive_count = 0 tx_resistance_count = 0 if sample in sample_tx_sensitive_count: @@ -1303,222 +1335,6 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile): outf.close() -def plotclinicalactionability(ax, annotatedclinicalfile, outfile, parameters): - if os.path.isfile(outfile): - os.remove(outfile) - - extlevels = levels + ["ONCOGENIC", "VUS"] - if "levels" in parameters: - extlevels = parameters["levels"] - - with open(annotatedclinicalfile, 'rU') as clinfile: - reader = csv.reader(clinfile, delimiter='\t') - headers = readheaders(reader) - isample = geIndexOfHeader(headers, SAMPLE_HEADERS) - ilevel = headers['HIGHEST_LEVEL'] - ioncogenic = headers['ONCOGENIC_MUTATIONS'] - icat = headers[parameters["catogerycolumn"].upper()] #e.g. "CANCER_TYPE" - - catsamplecount = {} - catactionablesamplecount = {} - oncogenicsamplecount = {} - levelcatsamplecount = {} - - for row in reader: - sample = row[isample] - if sampleidsfilter and sample not in sampleidsfilter: - continue - - cat = row[icat] - if cat not in catsamplecount: - catsamplecount[cat] = 0 - catsamplecount[cat] += 1 - - if cat not in catactionablesamplecount: - catactionablesamplecount[cat] = 0 - oncogenicsamplecount[cat] = 0 - - level = row[ilevel] - oncogenic = row[ioncogenic] - - exlevel = level - - if level in extlevels: - catactionablesamplecount[cat] += 1 - oncogenicsamplecount[cat] += 1 - elif len(oncogenic.strip()) > 0: - oncogenicsamplecount[cat] += 1 - exlevel = "ONCOGENIC" - else: - exlevel = "VUS" - - if exlevel not in levelcatsamplecount: - levelcatsamplecount[exlevel] = {} - if cat not in levelcatsamplecount[exlevel]: - levelcatsamplecount[exlevel][cat] = 0 - levelcatsamplecount[exlevel][cat] += 1 - - - # plot - catarray = [] # cancer types - catactionabilityarray = [] # actionabiligy percentages per cancer type - catoncogenicarray = [] # actionabiligy percentages per cancer type - for cat in catsamplecount: - if catsamplecount[cat] >= parameters["thresholdcat"]: - catarray.append(cat) - catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat]) - catoncogenicarray.append(oncogenicsamplecount[cat] * 100.0 / catsamplecount[cat]) - - ncat = len(catarray) - order = reversed(sorted(range(ncat),key=lambda x:(catactionabilityarray[x],catoncogenicarray[x]))) - drawplot(ax, 'OncoKB Actionability', extlevels, levelcatsamplecount, catarray, catsamplecount, order, parameters["thresholdcat"]) - -def plotimplications(ax, header, title, levels, annotatedclinicalfile, outfile, parameters): - if os.path.isfile(outfile): - os.remove(outfile) - - extlevels = levels - if "levels" in parameters: - extlevels = parameters["levels"] - - with open(annotatedclinicalfile, 'rU') as clinfile: - reader = csv.reader(clinfile, delimiter='\t') - headers = readheaders(reader) - isample = headers['SAMPLE_ID'] - ilevel = headers[header] - icat = headers[parameters["catogerycolumn"].upper()] - - catsamplecount = {} - catactionablesamplecount = {} - levelcatsamplecount = {} - - for row in reader: - sample = row[isample] - if sampleidsfilter and sample not in sampleidsfilter: - continue - - cat = row[icat] - if cat not in catsamplecount: - catsamplecount[cat] = 0 - catsamplecount[cat] += 1 - - if cat not in catactionablesamplecount: - catactionablesamplecount[cat] = 0 - - level = row[ilevel] - - exlevel = level - - if level in extlevels: - catactionablesamplecount[cat] += 1 - else: - exlevel = "Other" - - if exlevel not in levelcatsamplecount: - levelcatsamplecount[exlevel] = {} - if cat not in levelcatsamplecount[exlevel]: - levelcatsamplecount[exlevel][cat] = 0 - levelcatsamplecount[exlevel][cat] += 1 - - - # plot - catarray = [] # cancer types - catactionabilityarray = [] # actionabiligy percentages per cancer type - for cat in catsamplecount: - if catsamplecount[cat] >= parameters["thresholdcat"]: - catarray.append(cat) - catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat]) - - ncat = len(catarray) - order = reversed(sorted(range(ncat),key=lambda x:(catactionabilityarray[x]))) - drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, parameters["thresholdcat"]) - -def drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, thresholdcat): - - # level colors - levelcolors = { - 'LEVEL_1': '#33A02C', - 'LEVEL_2': '#1F78B4', - 'LEVEL_3A': '#984EA3', - 'LEVEL_3B': '#BE98CE', - 'LEVEL_4': '#a8a8a8', - 'LEVEL_R1': '#EE3424', - 'LEVEL_R2': '#F79A92', - - 'LEVEL_Dx1': '#33A02C', - 'LEVEL_Dx2': '#1F78B4', - 'LEVEL_Dx3': '#984EA3', - - 'LEVEL_Px1': '#33A02C', - 'LEVEL_Px2': '#1F78B4', - 'LEVEL_Px3': '#984EA3', - - 'ONCOGENIC': '#ffdab9', - 'VUS': '#d1d1d1', - 'Other': 'grey' - } - - # level legend - levellegend = { - 'LEVEL_1': 'Level 1', - 'LEVEL_2': 'Level 2', - 'LEVEL_3A': 'Level 3A', - 'LEVEL_3B': 'Level 3B', - 'LEVEL_4': 'Level 4', - 'LEVEL_R1': 'Level R1', - 'LEVEL_R2': 'Level R2', - - 'LEVEL_Dx1': 'Level Dx1', - 'LEVEL_Dx2': 'Level Dx2', - 'LEVEL_Dx3': 'Level Dx3', - - 'LEVEL_Px1': 'Level Px1', - 'LEVEL_Px2': 'Level Px2', - 'LEVEL_Px3': 'Level Px3', - - 'ONCOGENIC': 'Oncogenic, no level', - 'VUS': 'VUS', - 'Other': 'Other' - } - - ncat = len(catarray) - if ncat > 0: - catarray = [catarray[i] for i in order] - - ind = range(ncat) - - legends = [] - plts = [] - accumlevelcancerperc = [0] * ncat - for level in extlevels: - if level not in levelcatsamplecount: - continue - - levelcancerperc = [0] * ncat - for k in ind: - cat = catarray[k] - if catsamplecount[cat] < thresholdcat: - continue - if cat in levelcatsamplecount[level]: - levelcancerperc[k] = levelcatsamplecount[level][cat] * 100.0 / catsamplecount[cat] - - width = 0.75 - plts = [ax.bar(ind, levelcancerperc, width, color=levelcolors[level], bottom=accumlevelcancerperc)] + plts - legends = [levellegend[level]] + legends - accumlevelcancerperc = list(map(sum, zip(accumlevelcancerperc,levelcancerperc))) - - ax = plt.gca() - ax.set_axisbelow(True) - ax.set_aspect(0.1) - - ax.tick_params(axis='y', which='major', labelsize=6) - ax.set_ylabel('% of samples', fontsize=6) - ax.set_title(title, fontsize=8) - ax.set_xticks([i+0.5 for i in ind]) - ax.set_xticklabels(catarray, rotation=60, ha="right", fontsize=4) - # plt.yticks(np.arange(0, 81, 10)) - ax.legend(plts, legends, fontsize=6, bbox_to_anchor=(1.01, 1), loc="upper left") - oncokbcache = {} @@ -1530,11 +1346,8 @@ def cacheannotated(annotatedfile, defaultCancerType, cancerTypeMap): headers = readheaders(reader) ihugo = geIndexOfHeader(headers, HUGO_HEADERS) - iconsequence = geIndexOfHeader(headers, CONSEQUENCE_HEADERS) ihgvs = geIndexOfHeader(headers, HGVS_HEADERS) isample = geIndexOfHeader(headers, SAMPLE_HEADERS) - istart = geIndexOfHeader(headers, PROTEIN_START_HEADERS) - iend = geIndexOfHeader(headers, PROTEIN_END_HEADERS) icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS) imutationeffect = headers['MUTATION_EFFECT'] icitations = headers['CITATIONS'] @@ -1565,17 +1378,18 @@ def cacheannotated(annotatedfile, defaultCancerType, cancerTypeMap): oncokbcache[key]['mutation_effect'] = row[imutationeffect] oncokbcache[key]['citations'] = row[icitations] oncokbcache[key]['oncogenic'] = row[ioncogenic] - for l in levels: - il = headers[l] + for level in levels: + il = headers[level] if il < len(row): - oncokbcache[key][l] = row[il].split(',') + oncokbcache[key][level] = row[il].split(',') else: - oncokbcache[key][l] = [] + oncokbcache[key][level] = [] except Exception: pass except Exception: pass + def geIndexOfHeader(headers, keywords): for k in keywords: if k in headers: @@ -1593,6 +1407,7 @@ def pull3dhotspots(hugo, consequence, start, end): log.error("%s: %s-%s" % (hugo, str(start), str(end))) return "" + def appendoncokbcitations(citations, pmids, abstracts): if citations is None: citations = [] @@ -1614,6 +1429,7 @@ def appendoncokbcitations(citations, pmids, abstracts): class Gene: def __init__(self, hugo): self.hugoSymbol = hugo + def __str__(self): return self.hugoSymbol @@ -1674,6 +1490,7 @@ def __init__(self, chromosome, start, end, ref_allele, var_allele, cancertype, r if reference_genome is not None: self.referenceGenome = reference_genome.value + class CNAQuery: def __init__(self, hugo, cnatype, cancertype): self.gene = Gene(hugo) @@ -1683,9 +1500,9 @@ def __init__(self, hugo, cnatype, cancertype): def __str__(self): return "\t".join([self.gene.hugoSymbol, self.copyNameAlterationType, self.tumorType]) + class StructuralVariantQuery: def __init__(self, hugoA, hugoB, structural_variant_type, cancertype): - # Assume all structural variants in the file are functional fusions is_functional_fusion = True if hugoA == hugoB: @@ -1697,8 +1514,11 @@ def __init__(self, hugoA, hugoB, structural_variant_type, cancertype): self.functionalFusion = is_functional_fusion self.structuralVariantType = structural_variant_type.upper() self.tumorType = cancertype + def __str__(self): - return "\t".join([self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, self.tumorType]) + return "\t".join( + [self.geneA.hugoSymbol, self.geneB.hugoSymbol, str(self.functionalFusion), self.structuralVariantType, + self.tumorType]) def pull_protein_change_info(queries, annotate_hotspot): @@ -1717,9 +1537,11 @@ def pull_protein_change_info(queries, annotate_hotspot): geturl += '&tumorType=' + query.tumorType if hasattr(query, 'consequence') and query.consequence: geturl += '&consequence=' + query.consequence - if hasattr(query, 'proteinStart') and query.proteinStart and query.proteinStart != '\\N' and query.proteinStart != 'NULL' and query.proteinStart != '': + if hasattr(query, + 'proteinStart') and query.proteinStart and query.proteinStart != '\\N' and query.proteinStart != 'NULL' and query.proteinStart != '': geturl += '&proteinStart=' + str(query.proteinStart) - if hasattr(query, 'proteinEnd') and query.proteinEnd and query.proteinEnd != '\\N' and query.proteinEnd != 'NULL' and query.proteinEnd != '': + if hasattr(query, + 'proteinEnd') and query.proteinEnd and query.proteinEnd != '\\N' and query.proteinEnd != 'NULL' and query.proteinEnd != '': geturl += '&proteinEnd=' + str(query.proteinEnd) getresponse = makeoncokbgetrequest(geturl) if getresponse.status_code == 200: @@ -1762,6 +1584,7 @@ def pull_hgvsg_info(queries, annotate_hotspot): processed_annotation.append(process_oncokb_annotation(query_annotation, annotate_hotspot)) return processed_annotation + def pull_genomic_change_info(queries, annotate_hotspot): url = oncokb_annotation_api_url + '/annotate/mutations/byGenomicChange' response = makeoncokbpostrequest(url, queries) @@ -1820,7 +1643,6 @@ def pull_cna_info(queries): return processed_annotation - def pull_structural_variant_info(queries): url = oncokb_annotation_api_url + '/annotate/structuralVariants' @@ -1836,7 +1658,8 @@ def pull_structural_variant_info(queries): geturl += 'hugoSymbolA=' + query.geneA.hugoSymbol geturl += '&hugoSymbolB=' + query.geneB.hugoSymbol geturl += '&structuralVariantType=' + query.structuralVariantType - geturl += '&isFunctionalFusion=' + str(query.functionalFusion).upper() if type(query.functionalFusion) is bool else query.functionalFusion + geturl += '&isFunctionalFusion=' + str(query.functionalFusion).upper() if type( + query.functionalFusion) is bool else query.functionalFusion geturl += '&tumorType=' + query.tumorType getresponse = makeoncokbgetrequest(geturl) @@ -1854,18 +1677,17 @@ def pull_structural_variant_info(queries): return processed_annotation - def process_oncokb_annotation(annotation, annotate_hotspot): if annotation is None: return ['False'] oncokbdata = {} - for l in levels: - oncokbdata[l] = [] - for l in dxLevels: - oncokbdata[l] = [] - for l in pxLevels: - oncokbdata[l] = [] + for level in levels: + oncokbdata[level] = [] + for dx_level in dxLevels: + oncokbdata[dx_level] = [] + for px_level in pxLevels: + oncokbdata[px_level] = [] oncokbdata[GENE_IN_ONCOKB_HEADER] = GENE_IN_ONCOKB_DEFAULT oncokbdata[VARIANT_IN_ONCOKB_HEADER] = VARIANT_IN_ONCOKB_DEFAULT @@ -1879,8 +1701,10 @@ def process_oncokb_annotation(annotation, annotate_hotspot): try: # oncogenic - oncokbdata[GENE_IN_ONCOKB_HEADER] = GENE_IN_ONCOKB_DEFAULT if annotation['geneExist'] is None else str(annotation['geneExist']) - oncokbdata[VARIANT_IN_ONCOKB_HEADER] = VARIANT_IN_ONCOKB_DEFAULT if annotation['variantExist'] is None else str(annotation['variantExist']) + oncokbdata[GENE_IN_ONCOKB_HEADER] = GENE_IN_ONCOKB_DEFAULT if annotation['geneExist'] is None else str( + annotation['geneExist']) + oncokbdata[VARIANT_IN_ONCOKB_HEADER] = VARIANT_IN_ONCOKB_DEFAULT if annotation['variantExist'] is None else str( + annotation['variantExist']) # oncogenic oncokbdata['oncogenic'] = annotation['oncogenic'] @@ -1892,8 +1716,10 @@ def process_oncokb_annotation(annotation, annotate_hotspot): if (annotation['mutationEffect'] is not None): oncokbdata['mutation_effect'] = annotation['mutationEffect']['knownEffect'] oncokbdata['mutation_effect_citations'] = appendoncokbcitations(oncokbdata['mutation_effect_citations'], - annotation['mutationEffect']['citations']['pmids'], - annotation['mutationEffect']['citations']['abstracts']) + annotation['mutationEffect']['citations'][ + 'pmids'], + annotation['mutationEffect']['citations'][ + 'abstracts']) # oncogenic oncokbdata['oncogenic'] = annotation['oncogenic'] @@ -1908,7 +1734,7 @@ def process_oncokb_annotation(annotation, annotate_hotspot): drugs = treatment['drugs'] oncokbdata['tx_citations'] = appendoncokbcitations(oncokbdata['tx_citations'], treatment['pmids'], - treatment['abstracts']) + treatment['abstracts']) if len(drugs) == 0: oncokbdata[level].append('[NOT SPECIFIED]') @@ -1927,10 +1753,8 @@ def process_oncokb_annotation(annotation, annotate_hotspot): oncokbdata['highestDiagnosticImplicationLevel'] = annotation['highestDiagnosticImplicationLevel'] oncokbdata['highestPrognosticImplicationLevel'] = annotation['highestPrognosticImplicationLevel'] - except: + except Exception: log.error("error when processing %s " % annotation) - # sys.exit() - ret = [] if annotate_hotspot: @@ -1939,7 +1763,8 @@ def process_oncokb_annotation(annotation, annotate_hotspot): else: ret.append('') - _3dhotspot = pull3dhotspots(annotation['query']['hugoSymbol'], annotation['query']['consequence'], annotation['query']['proteinStart'], annotation['query']['proteinEnd']) + _3dhotspot = pull3dhotspots(annotation['query']['hugoSymbol'], annotation['query']['consequence'], + annotation['query']['proteinStart'], annotation['query']['proteinEnd']) ret.append(_3dhotspot) ret.append('True') @@ -1948,20 +1773,20 @@ def process_oncokb_annotation(annotation, annotate_hotspot): ret.append(oncokbdata['mutation_effect']) ret.append(';'.join(oncokbdata['mutation_effect_citations'])) ret.append(oncokbdata['oncogenic']) - for l in sorted(levels): - ret.append(','.join(oncokbdata[l])) + for level in sorted(levels): + ret.append(','.join(oncokbdata[level])) ret.append(get_highest_tx_level(oncokbdata)) ret.append(get_highest_tx_level(oncokbdata, TX_TYPE_SENSITIVE)) ret.append(get_highest_tx_level(oncokbdata, TX_TYPE_RESISTANCE)) ret.append(';'.join(oncokbdata['tx_citations'])) - for l in dxLevels: - ret.append(','.join(oncokbdata[l])) + for dx_level in dxLevels: + ret.append(','.join(oncokbdata[dx_level])) ret.append(get_highest_dxpx_level(dxLevels, [oncokbdata['highestDiagnosticImplicationLevel']])) ret.append(';'.join(oncokbdata['dx_citations'])) - for l in pxLevels: - ret.append(','.join(oncokbdata[l])) + for px_level in pxLevels: + ret.append(','.join(oncokbdata[px_level])) ret.append(get_highest_dxpx_level(pxLevels, [oncokbdata['highestPrognosticImplicationLevel']])) ret.append(';'.join(oncokbdata['px_citations'])) @@ -1975,18 +1800,20 @@ def get_highest_tx_level(oncokb_data, tx_type=None): target_levels = sensitive_levels elif tx_type.lower() == TX_TYPE_RESISTANCE: target_levels = resistance_levels - for l in target_levels: - if l in oncokb_data and oncokb_data[l] is not None and len(oncokb_data[l]) > 0: - return l + for level in target_levels: + if level in oncokb_data and oncokb_data[level] is not None and len(oncokb_data[level]) > 0: + return level return "" + def get_highest_dxpx_level(dxpx_levels, oncokbdata): - for l in dxpx_levels: - if l not in oncokbdata: + for level in dxpx_levels: + if level not in oncokbdata: continue - return l + return level return "" + def gettreatments(evidence): treatments = [] for t in evidence['treatments']: @@ -2020,13 +1847,14 @@ def readheaders(reader): headers["length"] = len(row) i = 0 for h in row: - h=h.strip() + h = h.strip() headers[h.upper()] = i headers[h] = i i = i + 1 break return headers + def padrow(row, n): nr = len(row) if nr == n: diff --git a/ClinicalDataAnnotator.py b/ClinicalDataAnnotator.py index 2db61aa..a6bfadd 100644 --- a/ClinicalDataAnnotator.py +++ b/ClinicalDataAnnotator.py @@ -1,17 +1,25 @@ #!/usr/bin/python +import sys +import re import argparse -from AnnotatorCore import * import logging + +from AnnotatorCore import setsampleidsfileterfile +from AnnotatorCore import process_clinical_data + logging.basicConfig(level=logging.INFO) log = logging.getLogger('ClinicalDataAnnotator') + def main(argv): if argv.help: - log.info('\n' - 'ClinicalDataAnnotator.py -i -o -a [-s sample list filter]\n' - ' Essential clinical columns:\n' - ' SAMPLE_ID: sample ID') + log.info( + '\n' + 'ClinicalDataAnnotator.py -i -o -a [-s sample list filter]\n' + ' Essential clinical columns:\n' + ' SAMPLE_ID: sample ID' + ) sys.exit() if argv.sample_ids_filter: setsampleidsfileterfile(argv.sample_ids_filter) diff --git a/CnaAnnotator.py b/CnaAnnotator.py index 1407946..e1b4bae 100644 --- a/CnaAnnotator.py +++ b/CnaAnnotator.py @@ -1,25 +1,36 @@ #!/usr/bin/python +import sys import argparse -from AnnotatorCore import * import logging + +from AnnotatorCore import setsampleidsfileterfile +from AnnotatorCore import setoncokbbaseurl +from AnnotatorCore import setoncokbapitoken +from AnnotatorCore import readCancerTypes +from AnnotatorCore import validate_oncokb_token +from AnnotatorCore import process_cna_data +from AnnotatorCore import CNA_FILE_FORMAT_GISTIC + logging.basicConfig(level=logging.INFO) log = logging.getLogger('CnaAnnotator') def main(argv): if argv.help: - log.info('\n' - 'CnaAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss] [-f CNA file formt, gistic or individual]\n' - ' Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n' - ' Essential clinical columns:\n' - ' SAMPLE_ID: sample ID\n' - ' Cancer type will be assigned based on the following priority:\n' - ' 1) ONCOTREE_CODE in clinical data file\n' - ' 2) ONCOTREE_CODE exist in MAF\n' - ' 3) default tumor type (-t)\n' - ' We do not annotate Gain and Loss by default, add -z to include the analysis. See https://github.com/oncokb/oncokb-annotator/issues/51 for more information.\n' - ' Default OncoKB base url is https://www.oncokb.org') + log.info( + '\n' + 'CnaAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss] [-f CNA file formt, gistic or individual]\n' + ' Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n' + ' Essential clinical columns:\n' + ' SAMPLE_ID: sample ID\n' + ' Cancer type will be assigned based on the following priority:\n' + ' 1) ONCOTREE_CODE in clinical data file\n' + ' 2) ONCOTREE_CODE exist in MAF\n' + ' 3) default tumor type (-t)\n' + ' We do not annotate Gain and Loss by default, add -z to include the analysis. See https://github.com/oncokb/oncokb-annotator/issues/51 for more information.\n' + ' Default OncoKB base url is https://www.oncokb.org' + ) sys.exit() if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': required_params = [] @@ -46,8 +57,7 @@ def main(argv): validate_oncokb_token() log.info('annotating %s ...' % argv.input_file) - process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, - cancertypemap, argv.annotate_gain_loss, argv.cna_file_format.lower()) + process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.annotate_gain_loss, argv.cna_file_format.lower()) log.info('done!') diff --git a/FusionAnnotator.py b/FusionAnnotator.py index 0ac1229..b1ef773 100644 --- a/FusionAnnotator.py +++ b/FusionAnnotator.py @@ -1,28 +1,40 @@ #!/usr/bin/python +import sys import argparse -from AnnotatorCore import * import logging + +from AnnotatorCore import setsampleidsfileterfile +from AnnotatorCore import setcancerhotspotsbaseurl +from AnnotatorCore import setoncokbbaseurl +from AnnotatorCore import setoncokbapitoken +from AnnotatorCore import readCancerTypes +from AnnotatorCore import validate_oncokb_token +from AnnotatorCore import process_fusion + logging.basicConfig(level=logging.INFO) log = logging.getLogger('FusionAnnotator') + def main(argv): if argv.help: - log.info('\n' - 'FusionAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ] [-r ]\n' - ' Essential Fusion columns (case insensitive):\n' - ' HUGO_SYMBOL: Hugo gene symbol\n' - ' VARIANT_CLASSIFICATION: Translational effect of variant allele\n' - ' TUMOR_SAMPLE_BARCODE: sample ID\n' - ' FUSION: amino acid change, e.g. "TMPRSS2-ERG"\n' - ' Essential clinical columns:\n' - ' SAMPLE_ID: sample ID\n' - ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' - ' Cancer type will be assigned based on the following priority:\n' - ' 1) ONCOTREE_CODE in clinical data file\n' - ' 2) ONCOTREE_CODE exist in Fusion\n' - ' 3) default tumor type (-t)\n' - ' Default OncoKB base url is https://www.oncokb.org') + log.info( + '\n' + "FusionAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ] [-r ]\n" + ' Essential Fusion columns (case insensitive):\n' + ' HUGO_SYMBOL: Hugo gene symbol\n' + ' VARIANT_CLASSIFICATION: Translational effect of variant allele\n' + ' TUMOR_SAMPLE_BARCODE: sample ID\n' + ' FUSION: amino acid change, e.g. "TMPRSS2-ERG"\n' + ' Essential clinical columns:\n' + ' SAMPLE_ID: sample ID\n' + ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' + ' Cancer type will be assigned based on the following priority:\n' + ' 1) ONCOTREE_CODE in clinical data file\n' + ' 2) ONCOTREE_CODE exist in Fusion\n' + ' 3) default tumor type (-t)\n' + ' Default OncoKB base url is https://www.oncokb.org' + ) sys.exit() if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': required_params = [] @@ -51,8 +63,7 @@ def main(argv): validate_oncokb_token() log.info('annotating %s ...' % argv.input_file) - process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, - cancertypemap, argv.structural_variant_name_format) + process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.structural_variant_name_format) log.info('done!') diff --git a/GenerateReadMe.py b/GenerateReadMe.py index 700ece4..7bb8e84 100644 --- a/GenerateReadMe.py +++ b/GenerateReadMe.py @@ -1,8 +1,12 @@ #!/usr/bin/python +import sys import argparse -from AnnotatorCore import * import logging + +from AnnotatorCore import setoncokbbaseurl +from AnnotatorCore import generateReadme + logging.basicConfig(level=logging.INFO) log = logging.getLogger('GenerateReadMe') diff --git a/MafAnnotator.py b/MafAnnotator.py index 1356fbe..97c451e 100644 --- a/MafAnnotator.py +++ b/MafAnnotator.py @@ -1,9 +1,19 @@ #!/usr/bin/python +import sys import argparse -from AnnotatorCore import * import logging +from AnnotatorCore import setsampleidsfileterfile +from AnnotatorCore import setcancerhotspotsbaseurl +from AnnotatorCore import setoncokbbaseurl +from AnnotatorCore import setoncokbapitoken +from AnnotatorCore import readCancerTypes +from AnnotatorCore import validate_oncokb_token +from AnnotatorCore import processalterationevents +from AnnotatorCore import QueryType +from AnnotatorCore import ReferenceGenome + logging.basicConfig(level=logging.INFO) log = logging.getLogger('MafAnnotator') diff --git a/OncoKBPlots.py b/OncoKBPlots.py index 0b3fc89..153fec6 100644 --- a/OncoKBPlots.py +++ b/OncoKBPlots.py @@ -1,11 +1,242 @@ #!/usr/bin/python +import sys +import re import argparse -from AnnotatorCore import * import logging +import os +import csv +import matplotlib.pyplot as plt + +from AnnotatorCore import setsampleidsfileterfile +from AnnotatorCore import readheaders +from AnnotatorCore import geIndexOfHeader +from AnnotatorCore import sampleidsfilter +from AnnotatorCore import levels +from AnnotatorCore import dxLevels +from AnnotatorCore import pxLevels +from AnnotatorCore import SAMPLE_HEADERS + logging.basicConfig(level=logging.INFO) log = logging.getLogger('OncoKBPlots') -import matplotlib.pyplot as plt + + +def plotclinicalactionability(ax, annotatedclinicalfile, outfile, parameters): + if os.path.isfile(outfile): + os.remove(outfile) + + extlevels = levels + ["ONCOGENIC", "VUS"] + if "levels" in parameters: + extlevels = parameters["levels"] + + with open(annotatedclinicalfile, 'rU') as clinfile: + reader = csv.reader(clinfile, delimiter='\t') + headers = readheaders(reader) + isample = geIndexOfHeader(headers, SAMPLE_HEADERS) + ilevel = headers['HIGHEST_LEVEL'] + ioncogenic = headers['ONCOGENIC_MUTATIONS'] + icat = headers[parameters["catogerycolumn"].upper()] # e.g. "CANCER_TYPE" + + catsamplecount = {} + catactionablesamplecount = {} + oncogenicsamplecount = {} + levelcatsamplecount = {} + + for row in reader: + sample = row[isample] + if sampleidsfilter and sample not in sampleidsfilter: + continue + + cat = row[icat] + if cat not in catsamplecount: + catsamplecount[cat] = 0 + catsamplecount[cat] += 1 + + if cat not in catactionablesamplecount: + catactionablesamplecount[cat] = 0 + oncogenicsamplecount[cat] = 0 + + level = row[ilevel] + oncogenic = row[ioncogenic] + + exlevel = level + + if level in extlevels: + catactionablesamplecount[cat] += 1 + oncogenicsamplecount[cat] += 1 + elif len(oncogenic.strip()) > 0: + oncogenicsamplecount[cat] += 1 + exlevel = "ONCOGENIC" + else: + exlevel = "VUS" + + if exlevel not in levelcatsamplecount: + levelcatsamplecount[exlevel] = {} + if cat not in levelcatsamplecount[exlevel]: + levelcatsamplecount[exlevel][cat] = 0 + levelcatsamplecount[exlevel][cat] += 1 + + # plot + catarray = [] # cancer types + catactionabilityarray = [] # actionabiligy percentages per cancer type + catoncogenicarray = [] # actionabiligy percentages per cancer type + for cat in catsamplecount: + if catsamplecount[cat] >= parameters["thresholdcat"]: + catarray.append(cat) + catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat]) + catoncogenicarray.append(oncogenicsamplecount[cat] * 100.0 / catsamplecount[cat]) + + ncat = len(catarray) + order = reversed(sorted(range(ncat), key=lambda x: (catactionabilityarray[x], catoncogenicarray[x]))) + drawplot(ax, 'OncoKB Actionability', extlevels, levelcatsamplecount, catarray, catsamplecount, order, + parameters["thresholdcat"]) + + +def plotimplications(ax, header, title, levels, annotatedclinicalfile, outfile, parameters): + if os.path.isfile(outfile): + os.remove(outfile) + + extlevels = levels + if "levels" in parameters: + extlevels = parameters["levels"] + + with open(annotatedclinicalfile, 'rU') as clinfile: + reader = csv.reader(clinfile, delimiter='\t') + headers = readheaders(reader) + isample = headers['SAMPLE_ID'] + ilevel = headers[header] + icat = headers[parameters["catogerycolumn"].upper()] + + catsamplecount = {} + catactionablesamplecount = {} + levelcatsamplecount = {} + + for row in reader: + sample = row[isample] + if sampleidsfilter and sample not in sampleidsfilter: + continue + + cat = row[icat] + if cat not in catsamplecount: + catsamplecount[cat] = 0 + catsamplecount[cat] += 1 + + if cat not in catactionablesamplecount: + catactionablesamplecount[cat] = 0 + + level = row[ilevel] + + exlevel = level + + if level in extlevels: + catactionablesamplecount[cat] += 1 + else: + exlevel = "Other" + + if exlevel not in levelcatsamplecount: + levelcatsamplecount[exlevel] = {} + if cat not in levelcatsamplecount[exlevel]: + levelcatsamplecount[exlevel][cat] = 0 + levelcatsamplecount[exlevel][cat] += 1 + + # plot + catarray = [] # cancer types + catactionabilityarray = [] # actionabiligy percentages per cancer type + for cat in catsamplecount: + if catsamplecount[cat] >= parameters["thresholdcat"]: + catarray.append(cat) + catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat]) + + ncat = len(catarray) + order = reversed(sorted(range(ncat), key=lambda x: (catactionabilityarray[x]))) + drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, parameters["thresholdcat"]) + + +def drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, thresholdcat): + # level colors + levelcolors = { + 'LEVEL_1': '#33A02C', + 'LEVEL_2': '#1F78B4', + 'LEVEL_3A': '#984EA3', + 'LEVEL_3B': '#BE98CE', + 'LEVEL_4': '#a8a8a8', + 'LEVEL_R1': '#EE3424', + 'LEVEL_R2': '#F79A92', + + 'LEVEL_Dx1': '#33A02C', + 'LEVEL_Dx2': '#1F78B4', + 'LEVEL_Dx3': '#984EA3', + + 'LEVEL_Px1': '#33A02C', + 'LEVEL_Px2': '#1F78B4', + 'LEVEL_Px3': '#984EA3', + + 'ONCOGENIC': '#ffdab9', + 'VUS': '#d1d1d1', + 'Other': 'grey' + } + + # level legend + levellegend = { + 'LEVEL_1': 'Level 1', + 'LEVEL_2': 'Level 2', + 'LEVEL_3A': 'Level 3A', + 'LEVEL_3B': 'Level 3B', + 'LEVEL_4': 'Level 4', + 'LEVEL_R1': 'Level R1', + 'LEVEL_R2': 'Level R2', + + 'LEVEL_Dx1': 'Level Dx1', + 'LEVEL_Dx2': 'Level Dx2', + 'LEVEL_Dx3': 'Level Dx3', + + 'LEVEL_Px1': 'Level Px1', + 'LEVEL_Px2': 'Level Px2', + 'LEVEL_Px3': 'Level Px3', + + 'ONCOGENIC': 'Oncogenic, no level', + 'VUS': 'VUS', + 'Other': 'Other' + } + + ncat = len(catarray) + if ncat > 0: + catarray = [catarray[i] for i in order] + + ind = range(ncat) + + legends = [] + plts = [] + accumlevelcancerperc = [0] * ncat + for level in extlevels: + if level not in levelcatsamplecount: + continue + + levelcancerperc = [0] * ncat + for k in ind: + cat = catarray[k] + if catsamplecount[cat] < thresholdcat: + continue + if cat in levelcatsamplecount[level]: + levelcancerperc[k] = levelcatsamplecount[level][cat] * 100.0 / catsamplecount[cat] + + width = 0.75 + plts = [ax.bar(ind, levelcancerperc, width, color=levelcolors[level], bottom=accumlevelcancerperc)] + plts + legends = [levellegend[level]] + legends + accumlevelcancerperc = list(map(sum, zip(accumlevelcancerperc, levelcancerperc))) + + ax = plt.gca() + ax.set_axisbelow(True) + ax.set_aspect(0.1) + + ax.tick_params(axis='y', which='major', labelsize=6) + ax.set_ylabel('% of samples', fontsize=6) + ax.set_title(title, fontsize=8) + ax.set_xticks([i + 0.5 for i in ind]) + ax.set_xticklabels(catarray, rotation=60, ha="right", fontsize=4) + # plt.yticks(np.arange(0, 81, 10)) + ax.legend(plts, legends, fontsize=6, bbox_to_anchor=(1.01, 1), loc="upper left") + def main(argv): params = { @@ -13,14 +244,16 @@ def main(argv): "thresholdcat": argv.threshold_cat, # -n } if argv.help: - log.info('\n' - 'OncoKBPlots.py -i -o [-c ] [-s sample list filter] [-n threshold of # samples in a category] [-l comma separated levels to include]\n' - ' Essential clinical columns:\n' - ' SAMPLE_ID: sample ID\n' - ' HIGHEST_LEVEL: Highest OncoKB levels\n' - ' Supported levels (-l): \n' - ' LEVEL_1,LEVEL_2,LEVEL_3A,LEVEL_3B,LEVEL_4,ONCOGENIC,VUS') + log.info( + '\n' + 'OncoKBPlots.py -i -o [-c ] [-s sample list filter] [-n threshold of # samples in a category] [-l comma separated levels to include]\n' + ' Essential clinical columns:\n' + ' SAMPLE_ID: sample ID\n' + ' HIGHEST_LEVEL: Highest OncoKB levels\n' + ' Supported levels (-l): \n' + ' LEVEL_1,LEVEL_2,LEVEL_3A,LEVEL_3B,LEVEL_4,ONCOGENIC,VUS' + ) sys.exit() if argv.input_file == '' or argv.output_file == '': required_params = [] @@ -46,8 +279,10 @@ def main(argv): # plt.margins(0.01) plotclinicalactionability(ax1, args.input_file, args.output_file, params) - plotimplications(ax2, 'HIGHEST_DX_LEVEL', 'OncoKB Diagnostic Implications', dxLevels, args.input_file, argv.output_file, params) - plotimplications(ax3, 'HIGHEST_PX_LEVEL', 'OncoKB Prognostic Implications', pxLevels, args.input_file, argv.output_file, params) + plotimplications(ax2, 'HIGHEST_DX_LEVEL', 'OncoKB Diagnostic Implications', dxLevels, args.input_file, + argv.output_file, params) + plotimplications(ax3, 'HIGHEST_PX_LEVEL', 'OncoKB Prognostic Implications', pxLevels, args.input_file, + argv.output_file, params) plt.subplots_adjust(left=0.2, bottom=0.3) plt.gcf().text(0.90, 0.1, "Generated by OncoKB\n[Chakravarty et al., JCO PO 2017]", fontsize=6, @@ -57,6 +292,7 @@ def main(argv): log.info('done!') + if __name__ == "__main__": parser = argparse.ArgumentParser(add_help=False) parser.add_argument('-h', dest='help', action="store_true", default=False) diff --git a/StructuralVariantAnnotator.py b/StructuralVariantAnnotator.py index 1944c17..7e4775a 100644 --- a/StructuralVariantAnnotator.py +++ b/StructuralVariantAnnotator.py @@ -1,28 +1,40 @@ #!/usr/bin/python +import sys import argparse -from AnnotatorCore import * import logging + +from AnnotatorCore import setsampleidsfileterfile +from AnnotatorCore import setcancerhotspotsbaseurl +from AnnotatorCore import setoncokbbaseurl +from AnnotatorCore import setoncokbapitoken +from AnnotatorCore import readCancerTypes +from AnnotatorCore import validate_oncokb_token +from AnnotatorCore import process_sv + logging.basicConfig(level=logging.INFO) log = logging.getLogger('StructuralVariantAnnotator') + def main(argv): if argv.help: - log.info('\n' - 'StructuralVariantAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ]\n' - ' Essential structural variant columns (case insensitive):\n' - ' GENEA: Hugo gene symbol for gene A\n' - ' GENEB: Hugo gene symbol for gene B\n' - ' SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n' - ' TUMOR_SAMPLE_BARCODE: sample ID\n' - ' Essential clinical columns:\n' - ' SAMPLE_ID: sample ID\n' - ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' - ' Cancer type will be assigned based on the following priority:\n' - ' 1) ONCOTREE_CODE in clinical data file\n' - ' 2) ONCOTREE_CODE exist in structural variant\n' - ' 3) default tumor type (-t)\n' - ' Default OncoKB base url is https://www.oncokb.org') + log.info( + '\n' + 'StructuralVariantAnnotator.py -i -o [-p previous results] [-c ] [-s sample list filter] [-t ] [-u ] [-b ]\n' + ' Essential structural variant columns (case insensitive):\n' + ' GENEA: Hugo gene symbol for gene A\n' + ' GENEB: Hugo gene symbol for gene B\n' + ' SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n' + ' TUMOR_SAMPLE_BARCODE: sample ID\n' + ' Essential clinical columns:\n' + ' SAMPLE_ID: sample ID\n' + ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' + ' Cancer type will be assigned based on the following priority:\n' + ' 1) ONCOTREE_CODE in clinical data file\n' + ' 2) ONCOTREE_CODE exist in structural variant\n' + ' 3) default tumor type (-t)\n' + ' Default OncoKB base url is https://www.oncokb.org' + ) sys.exit() if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': required_params = [] diff --git a/flake8.ini b/flake8.ini new file mode 100644 index 0000000..28b9283 --- /dev/null +++ b/flake8.ini @@ -0,0 +1,2 @@ +[flake8] +ignore = E501,W503 \ No newline at end of file diff --git a/test_Annotation.py b/test_Annotation.py index 6d1c333..292067d 100644 --- a/test_Annotation.py +++ b/test_Annotation.py @@ -1,12 +1,25 @@ #!/usr/bin/python import pytest - -from AnnotatorCore import * import os +import logging + +from AnnotatorCore import pull_hgvsg_info +from AnnotatorCore import pull_genomic_change_info +from AnnotatorCore import pull_protein_change_info +from AnnotatorCore import pull_structural_variant_info +from AnnotatorCore import pull_cna_info +from AnnotatorCore import setoncokbapitoken +from AnnotatorCore import ProteinChangeQuery +from AnnotatorCore import GenomicChangeQuery +from AnnotatorCore import StructuralVariantQuery +from AnnotatorCore import CNAQuery +from AnnotatorCore import HGVSgQuery +from AnnotatorCore import ReferenceGenome ONCOKB_API_TOKEN = os.environ["ONCOKB_API_TOKEN"] setoncokbapitoken(ONCOKB_API_TOKEN) +log = logging.getLogger('test_Annotation') log.info('test-----------', os.environ["ONCOKB_API_TOKEN"], '------') VARIANT_EXISTS_INDEX = 2 @@ -21,6 +34,7 @@ UNKNOWN = 'Unknown' NUMBER_OF_ANNOTATION_COLUMNS = 27 + def fake_gene_one_query_suite(annotations): assert len(annotations) == 1 @@ -160,6 +174,7 @@ def test_check_hgvsg(): assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' + @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") def test_check_genomic_change(): queries = [ @@ -192,6 +207,7 @@ def test_check_genomic_change(): assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' assert annotation[HIGHEST_LEVEL_INDEX] == '' + @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") def test_check_structural_variants(): queries = [ @@ -282,6 +298,7 @@ def test_fake_cna(): annotations = pull_cna_info(queries) fake_gene_one_query_suite(annotations) + def check_brca2_s1882_without_cancertype(annotation): assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Loss-of-function' @@ -290,11 +307,12 @@ def check_brca2_s1882_without_cancertype(annotation): assert annotation[LEVEL_1_INDEX] == 'Olaparib,Olaparib+Bevacizumab,Rucaparib,Niraparib' assert annotation[LEVEL_2_INDEX] == 'Olaparib,Rucaparib,Niraparib' assert annotation[LEVEL_3A_INDEX] == 'Olaparib,Talazoparib' - + + @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") def test_duplicated_treatments(): # there should not be any duplicated treatment listed when cancer type is not specified - + # test protein change query queries = [ ProteinChangeQuery('BRCA2', 'S1882*', ''), diff --git a/test_AnnotatorCore.py b/test_AnnotatorCore.py index 381d7ae..5ef28bf 100644 --- a/test_AnnotatorCore.py +++ b/test_AnnotatorCore.py @@ -1,7 +1,29 @@ #!/usr/bin/python import pytest -from AnnotatorCore import * +from AnnotatorCore import getgenesfromfusion +from AnnotatorCore import conversion +from AnnotatorCore import replace_all +from AnnotatorCore import resolve_query_type +from AnnotatorCore import get_highest_tx_level +from AnnotatorCore import get_cna +from AnnotatorCore import QueryType +from AnnotatorCore import ALTERATION_HEADER +from AnnotatorCore import HGVSP_HEADER +from AnnotatorCore import HGVSP_SHORT_HEADER +from AnnotatorCore import HGVSG_HEADER +from AnnotatorCore import GC_REF_ALLELE_HEADER +from AnnotatorCore import GC_CHROMOSOME_HEADER +from AnnotatorCore import GC_START_POSITION_HEADER +from AnnotatorCore import GC_END_POSITION_HEADER +from AnnotatorCore import GC_VAR_ALLELE_1_HEADER +from AnnotatorCore import GC_VAR_ALLELE_2_HEADER +from AnnotatorCore import TX_TYPE_SENSITIVE +from AnnotatorCore import TX_TYPE_RESISTANCE +from AnnotatorCore import CNA_AMPLIFICATION_TXT +from AnnotatorCore import CNA_DELETION_TXT +from AnnotatorCore import CNA_GAIN_TXT +from AnnotatorCore import CNA_LOSS_TXT def test_getgenesfromfusion():