Use individual CNA for testing

oncokb · Jun 2, 2022 · 5b131a0 · 5b131a0
1 parent 28dc61d
commit 5b131a0
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 19 deletions.
diff --git a/.github/workflows/compare-annotation.yml b/.github/workflows/compare-annotation.yml
@@ -43,7 +43,7 @@ jobs:
           MUTATION_DATA_NAME=data_mutations_mskcc.txt
           CLINICAL_DATA_NAME=data_clinical_sample.txt
           FUSION_DATA_NAME=data_fusions.txt
-          CNA_DATA_NAME=data_CNA.txt
+          INDIVIDUAL_CNA_DATA_NAME=data_individual_CNA.txt
 
           cd data || exit
           curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/data | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do
@@ -60,19 +60,16 @@ jobs:
           IMAF=data/"$MUTATION_DATA_NAME"
           OMAF=compare/"$PREFIX"_"$MUTATION_DATA_NAME"
 
-          IC=data/"$CLINICAL_DATA_NAME"
-          OC=compare/"$PREFIX"_"$CLINICAL_DATA_NAME"
-
           IF=data/"$FUSION_DATA_NAME"
           OF=compare/"$PREFIX"_"$FUSION_DATA_NAME"
 
-          ICNA=data/"$CNA_DATA_NAME"
-          OCNA=compare/"$PREFIX"_"$CNA_DATA_NAME"
+          IICNA=data/"$INDIVIDUAL_CNA_DATA_NAME"
+          OICNA=compare/"$PREFIX"_"$INDIVIDUAL_CNA_DATA_NAME"
 
           python MafAnnotator.py -i "$IMAF" -o "$OMAF" -c "$IC" -b "$ONCOKB_API_TOKEN"
           python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$ONCOKB_API_TOKEN"
-          python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$ONCOKB_API_TOKEN"
-          python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OCNA,$OF"
+          python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$ONCOKB_API_TOKEN" -f "individual"
+          python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OICNA,$OF"
 
           git config user.name oncokb-bot
           git config user.email dev.oncokb@gmail.com

diff --git a/AnnotatorCore.py b/AnnotatorCore.py
@@ -168,7 +168,9 @@ def setsampleidsfileterfile(f):
 HGVSP_SHORT_HEADER = 'HGVSP_SHORT'
 HGVSP_HEADER = 'HGVSP'
 HGVSG_HEADER = 'HGVSG'
-HGVS_HEADERS = [ALTERATION_HEADER, HGVSP_SHORT_HEADER, HGVSP_HEADER, HGVSG_HEADER, 'AMINO_ACID_CHANGE', 'FUSION']
+# columns for copy number alteration
+CNA_HEADERS = [ALTERATION_HEADER, 'COPY_NUMBER_ALTERATION', 'CNA', 'GISTIC']
+HGVS_HEADERS = [ALTERATION_HEADER, HGVSP_SHORT_HEADER, HGVSP_HEADER, HGVSG_HEADER, 'AMINO_ACID_CHANGE', 'FUSION'] + CNA_HEADERS
 SAMPLE_HEADERS = ['SAMPLE_ID', 'TUMOR_SAMPLE_BARCODE']
 PROTEIN_START_HEADERS = ['PROTEIN_START']
 PROTEIN_END_HEADERS = ['PROTEIN_END']
@@ -186,9 +188,6 @@ def setsampleidsfileterfile(f):
 GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
 GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]
 
-# columns for copy number alteration
-CNA_HEADER = ['COPY_NUMBER_ALTERATION', 'CNA', 'GISTIC']
-
 # columns for structural variant annotation
 SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
 SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
@@ -976,7 +975,7 @@ def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTy
             isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
             ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
             icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
-            icna = geIndexOfHeader(headers, CNA_HEADER)
+            icna = geIndexOfHeader(headers, CNA_HEADERS)
 
             hugo = row[ihugo] if ihugo >= 0 else None
             cna_type = get_cna(row[icna], annotate_gain_loss)
@@ -1097,7 +1096,7 @@ def process_clinical_data(annotatedmutfiles, clinicalfile, outfile):
             ismutorcna = ihugo != -1 & ihgvs != -1
 
             if not isfusion and not ismutorcna:
-                log.error("missing proper header")
+                log.error("file " + annotatedmutfile + " missing proper header")
                 exit()
 
             for row in reader:

diff --git a/README.md b/README.md
@@ -33,8 +33,10 @@ We recommend processing VCF files by [vcf2maf](https://github.com/mskcc/vcf2maf/
 You can still use MAF format to annotate atypical alterations, such as MSI-H, TMB-H, EGFR vIII. Please see more examples [HERE](data/example_atypical_alterations.txt).  
 
 ### Copy Number Alteration
-We use GISTIC 2.0 format by default. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data, please see examples [HERE](data/example_cna.txt).
-
+#### Use GISTIC data format
+We use GISTIC 2.0 format by default. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data. For examples, please see [HERE](data/example_cna.txt).   
+Columns `Locus ID` and `Cytoband` are not required.
+#### Individual CNA
 You can also list copy number alterations individually by specifying `-f individual`. Please see examples [HERE](data/example_individual_cna.txt).
 
 Get more details on the command line using `python CnaAnnotator.py -h`.  
@@ -103,11 +105,12 @@ python ${FILE_NAME.py} -i ${INPUT_FILE} -o ${OUTPUT_FILE} -b ${ONCOKB_API_TOKEN}
 ``` 
 
 
-## Columns added in the annotation files
+## Columns added in the annotation files using MafAnnotator/CnaAnnotator/StructuralVariantAnnotator/FusionAnnotator
 | Column                    | Possible Values                                                                                                                                                                     | Description                                                                                                                                                                                                                      |
 |---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GENE_IN_ONCOKB            | TRUE, FALSE                                                                                                                                                                         | Whether the gene has been curated by the OncoKB Team                                                                                                                                                                             |
-| VARIANT_IN_ONCOKB         | TRUE, FALSE                                                                                                                                                                         | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations.                                                                                                     |
+| ANNOTATED                 | True, False                                                                                                                                                                         | Whether the variant is annotated by OncoKB successfully                                                                                                                                                                          |
+| GENE_IN_ONCOKB            | True, False                                                                                                                                                                         | Whether the gene has been curated by the OncoKB Team                                                                                                                                                                             |
+| VARIANT_IN_ONCOKB         | True, False                                                                                                                                                                         | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations.                                                                                                     |
 | MUTATION_EFFECT           | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. |
 | MUTATION_EFFECT_CITATIONS | PMID, Abstract, Website Link                                                                                                                                                        | All citations related to the biological effect                                                                                                                                                                                   |
 | ONCOGENIC                 | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance                                                                                                      | In OncoKB, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014).                                                 |
@@ -123,5 +126,25 @@ python ${FILE_NAME.py} -i ${INPUT_FILE} -o ${OUTPUT_FILE} -b ${ONCOKB_API_TOKEN}
 | HIGHEST_PX_LEVEL          | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3                                                                                                                                                     | The highest level of evidence for prognostic implications                                                                                                                                                                        |
 | PX_CITATIONS              | PMID, Abstract, Website Link                                                                                                                                                        | All citations related to prognostic implications                                                                                                                                                                                 |
 
+## Columns added in the files using ClinicalDataAnnotator
+| Column                                              | Possible Values                                                   | Description                                                                                                                                |
+|-----------------------------------------------------|-------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|
+| LEVEL_*                                             | Therapeutic implications                                          | The leveled therapeutic implications                                                                                                       |
+| HIGHEST_LEVEL                                       | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2 | The highest level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 > LEVEL_R2 |
+| HIGHEST_SENSITIVE_LEVEL                             | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4                     | The highest sensitive level of evidence for therapeutic implications. Order: LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4             |
+| HIGHEST_RESISTANCE_LEVEL                            | LEVEL_R1, LEVEL_R2                                                | The highest resistance level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_R2                                          |
+| LEVEL_Dx*                                           | Tumor type the level of evidence is assigned to                   | The leveled diagnostic implications                                                                                                        |
+| HIGHEST_DX_LEVEL                                    | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3                                   | The highest level of evidence for diagnostic implications                                                                                  |
+| LEVEL_Px*                                           | Tumor type the level of evidence is assigned to                   | The leveled prognostic implications                                                                                                        |
+| HIGHEST_PX_LEVEL                                    | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3                                   | The highest level of evidence for prognostic implications                                                                                  |
+| ONCOGENIC_MUTATIONS                                 |                                                                   | The list of mutations that are Oncogenic or Likely Oncogenic                                                                               |
+| #ONCOGENIC_MUTATIONS                                |                                                                   | Number of oncogenic mutations                                                                                                              |
+| RESISTANCE_MUTATIONS                                |                                                                   | The list of resistance mutations                                                                                                           |
+| #RESISTANCE_MUTATIONS                               |                                                                   | Number of resistance mutations                                                                                                             |
+| #MUTATIONS_WITH_SENSITIVE_THERAPEUTIC_IMPLICATIONS  |                                                                   | Number of mutations in the sample with sensitive therapeutic implications                                                                  |
+| #MUTATIONS_WITH_RESISTANCE_THERAPEUTIC_IMPLICATIONS |                                                                   | Number of mutations in the sample with resistance therapeutic implications                                                                 |
+| #MUTATIONS_WITH_DIAGNOSTIC_IMPLICATIONS             |                                                                   | Number of mutations in the sample with diagnostic implications                                                                             |
+| #MUTATIONS_WITH_PROGNOSTIC_IMPLICATIONS             |                                                                   | Number of mutations in the sample with prognostic implications                                                                             |
+| #MUTATIONS                                          |                                                                   | Number of mutations in the sample                                                                                                          |
 ## Questions?
 The best way is to email contact@oncokb.org so all our team members can help.