AlexsLemonade · jaclyn-taroni · Jan 10, 2020 · Dec 31, 2019 · Dec 31, 2019 · Dec 31, 2019
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -111,6 +111,13 @@ jobs:
          ################################
          #### Add your analysis here ####
          ################################
+      - run:
+          name: Evaluation plots for TP53/NF1 classifier polya
+          command: ./scripts/run_in_ci.sh python3 analyses/tp53_nf1_score/02-evaluate-classifier.py -s analyses/tp53_nf1_score/results/TP53_NF1_snv_alteration.tsv -f analyses/tp53_nf1_score/results/pbta-gene-expression-rsem-fpkm-collapsed.polya_classifier_scores.tsv -c data/pbta-histologies.tsv -o polya
+
+      - run:
+          name: Evaluation plots for TP53/NF1 classifier stranded
+          command: ./scripts/run_in_ci.sh python3 analyses/tp53_nf1_score/02-evaluate-classifier.py -s analyses/tp53_nf1_score/results/TP53_NF1_snv_alteration.tsv -f analyses/tp53_nf1_score/results/pbta-gene-expression-rsem-fpkm-collapsed.stranded_classifier_scores.tsv -c data/pbta-histologies.tsv -o stranded 		
 
       - run:
           name: SNV Caller Analysis 

diff --git a/analyses/tp53_nf1_score/02-evaluate-classifier.py b/analyses/tp53_nf1_score/02-evaluate-classifier.py
@@ -0,0 +1,176 @@
+import os
+import random
+from decimal import Decimal
+from scipy.stats import ttest_ind
+import numpy as np
+import pandas as pd
+
+from sklearn.metrics import average_precision_score, roc_auc_score
+from sklearn.metrics import roc_curve, precision_recall_curve
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+from optparse import OptionParser
+
+# Command-line interface; all four options are required.
+parser = OptionParser(usage="usage: %prog [options] arguments")
+parser.add_option("-s", "--statusfile", dest="status_file", help="TP53 and NF1 status file")
+parser.add_option("-f", "--file", dest="filename", help="scores output file ")
+parser.add_option("-c", "--clinical", dest="clinical", help="pbta-histologies.tsv clinical file")
+parser.add_option(
+    "-o", "--output_basename", dest="outputfile", help="output plots basename for TP53 and NF1 ROC curves"
+)
+
+(options, args) = parser.parse_args()
+# Exit with a usage error here; an omitted option would otherwise be passed on as None.
+required = (("-s", "status_file"), ("-f", "filename"), ("-c", "clinical"), ("-o", "outputfile"))
+missing = [flag for flag, dest in required if getattr(options, dest) is None]
+if missing:
+    parser.error("missing required option(s): " + ", ".join(missing))
+status_file = options.status_file
+scores_file = options.filename
+clinical = options.clinical
+outputfilename = options.outputfile
+
+
+np.random.seed(123)
+
+# read in TP53 and NF1 status file
+status_df = pd.read_table(status_file, low_memory=False)
+
+# Value count of variant classification
+print(status_df.Variant_Classification.value_counts())
+
+# Obtain a binary status matrix: crosstab counts the status rows for each
+# (sample_id, Hugo_Symbol) pair
+full_status_df = pd.crosstab(
+    status_df['sample_id'], status_df.Hugo_Symbol, dropna=False
+)
+# clip counts above 1 so every cell is 0 or 1
+full_status_df[full_status_df > 1] = 1
+full_status_df = full_status_df.reset_index()
+
+# errors='ignore' keeps this from raising KeyError when no row of the status
+# file has Hugo_Symbol 'No_TP53_NF1_alt', i.e. when the crosstab has no such
+# column to drop
+full_status_df = full_status_df.drop(
+    ['No_TP53_NF1_alt'], axis=1, errors='ignore'
+)
+
+# read in clinical file
+clinical_df = pd.read_table(clinical)
+
+
+# add clinical info to TP53 and NF1 binary status df
+full_status_df = full_status_df.assign(
+    tp53_status=full_status_df.loc[:, 'TP53'],
+    nf1_status=full_status_df.loc[:, 'NF1'],
+)
+full_status_df = full_status_df.merge(
+    clinical_df,
+    how='left', left_on='sample_id', right_on='sample_id'
+)
+
+
+# read in scores from 01
+scores_df = pd.read_table(scores_file)
+# upper-case every column name; the merge below keys on SAMPLE_ID
+scores_df = scores_df.rename(str.upper, axis='columns')
+
+# Left join on the biospecimen ID: every scored sample is kept, and a sample
+# with no matching Kids_First_Biospecimen_ID gets NaN in the status columns
+scores_df = scores_df.merge(
+    full_status_df,
+    how='left', left_on='SAMPLE_ID', right_on='Kids_First_Biospecimen_ID'
+)
+
+# report the size of the merged table
+print("scores df shape")
+print(scores_df.shape)
+
+# Overwrite SAMPLE_ID with the sample_id column brought in by the merge
+scores_df = scores_df.assign(SAMPLE_ID=scores_df.loc[:, 'sample_id'])
+
+
+# Binarize the status columns: NaN (no status row matched) counts as 0 and
+# any non-zero value becomes 1
+gene_status = ['tp53_status', 'nf1_status']
+for status_col in gene_status:
+    filled_status = scores_df[status_col].fillna(0)
+    scores_df[status_col] = filled_status.ne(0).astype(int)
+
+# binary counts for tp53 and nf1 loss status
+print ("TP53 status")
+print(scores_df.tp53_status.value_counts())
+print ("NF1 status")
+print(scores_df.nf1_status.value_counts())
+
+def get_roc_plot(scores_df, gene, outputfilename,color):
+    """
+    Plot and save ROC curves of the classifier score and the shuffled score
+    for one gene.
+
+    Arguments:
+    scores_df - the dataframe of scores; after lower-casing its column names
+                it must contain <gene>_status, <gene>_score and
+                <gene>_shuffle columns
+    gene - the name of the gene to input (e.g. "TP53")
+    outputfilename - path prefix; the plot is saved to
+                     <outputfilename>_<gene>.png
+    color - matplotlib color used for both the real and shuffled curves
+
+    Returns nothing; the figure is written to disk and then closed.
+    """
+    lower_gene = gene.lower()
+    # rename() returns a new dataframe, so the caller's copy is not modified
+    scores_df = scores_df.rename(str.lower, axis='columns')
+    sample_status = scores_df.loc[:, '{}_status'.format(lower_gene)]
+    sample_score = scores_df.loc[:, '{}_score'.format(lower_gene)]
+    shuffle_score = scores_df.loc[:, '{}_shuffle'.format(lower_gene)]
+
+    # Obtain Metrics
+    fpr_pdx, tpr_pdx, _ = roc_curve(
+        sample_status, sample_score, drop_intermediate=False)
+    auroc_pdx = roc_auc_score(sample_status, sample_score)
+
+    # Obtain Shuffled Metrics
+    fpr_shuff, tpr_shuff, _ = roc_curve(
+        sample_status, shuffle_score, drop_intermediate=False)
+    auroc_shuff = roc_auc_score(sample_status, shuffle_score)
+
+    fig, _ = plt.subplots(figsize=(5, 5))
+    plt.axis('equal')
+    # Chance-level diagonal
+    plt.plot([0, 1], [0, 1], 'k--')
+    plt.xlim([0.0, 1.0])
+    plt.ylim([0.0, 1.0])
+    plt.plot(fpr_pdx, tpr_pdx,
+             label='{} (AUROC = {})'.format(gene, round(auroc_pdx, 2)),
+             linestyle='solid',
+             color=color)
+
+    # Shuffled Data
+    plt.plot(fpr_shuff, tpr_shuff,
+             label='{} Shuffle (AUROC = {})'.format(gene, round(auroc_shuff, 2)),
+             linestyle='dotted',
+             color=color)
+
+    plt.xlabel('False Positive Rate', fontsize=12)
+    plt.ylabel('True Positive Rate', fontsize=12)
+    plt.tick_params(labelsize=10)
+
+    plt.legend(bbox_to_anchor=(0.3, 0.15), loc=2, borderaxespad=0., fontsize=10)
+    plt.savefig(outputfilename +'_'+gene+'.png')
+    # Close the figure so repeated calls do not accumulate open figures
+    plt.close(fig)
+
+# Plots are saved under analyses/tp53_nf1_score/results, relative to the working directory
+outputfilename = os.path.join("analyses", "tp53_nf1_score", "results", outputfilename)
+
+for roc_gene, roc_color in (("TP53", '#7570b3'), ("NF1", '#d95f02')):
+    get_roc_plot(scores_df, gene=roc_gene, outputfilename=outputfilename, color=roc_color)
+
+
+
+
+
diff --git a/analyses/tp53_nf1_score/README.md b/analyses/tp53_nf1_score/README.md
@@ -8,6 +8,9 @@ Now published in [Rokita et al. _Cell Reports._ 2019.](https://doi.org/10.1016/j
 In brief, _TP53_ inactivation, _NF1_ inactivation, and Ras activation classifiers are applied to the stranded and polya OpenPBTA RNA-seq data.
 The classifiers were trained on TCGA PanCan data ([Way et al. _Cell Reports._ 2018](https://doi.org/10.1016/j.celrep.2018.03.046), [Knijnenburg et al. _Cell Reports._ 2018.](https://doi.org/10.1016/j.celrep.2018.03.076)).
 See [`01-apply-classifier.py`](01-apply-classifier.py) for more information about the procedure.
+To evaluate the classifier scores [`02-evaluate-classifier.py`](02-evaluate-classifier.py) uses SNV data to identify true TP53/NF1 loss samples and compares scores of shuffled data to true calls and plots ROC curves. 
-To evaluate the classifier scores [`02-evaluate-classifier.py`](02-evaluate-classifier.py) uses SNV data to identify true TP53/NF1 loss samples and compares scores of shuffled data to true calls and plots ROC curves. 
+To evaluate the classifier scores, we use [`02-evaluate-classifier.py`](02-evaluate-classifier.py), which takes SNV data to identify true TP53/NF1 loss samples, compares the classifier scores and the shuffled-data scores against those true calls, and plots ROC curves.
-To evaluate the classifier scores [`02-evaluate-classifier.py`](02-evaluate-classifier.py) uses SNV data to identify true TP53/NF1 loss samples and compares scores of shuffled data to true calls and plots ROC curves. 
+To evaluate the classifier scores, we use [`02-evaluate-classifier.py`](02-evaluate-classifier.py), which takes SNV data to identify true TP53/NF1 loss samples, compares the classifier scores and the shuffled-data scores against those true calls, and plots ROC curves.
+
+
 
 #### Running the analysis
 
@@ -22,3 +25,17 @@ bash analyses/tp53_nf1_score/run_classifier.sh
 It produces  `results/pbta-gene-expression-rsem-fpkm-collapsed.stranded_classifier_scores.tsv`  and `results/pbta-gene-expression-rsem-fpkm-collapsed.polya_classifier_scores.tsv`, which contains all 3 classifier scores for the stranded data and for shuffled stranded (e.g., random) data.
 
 Because some of the classifier genes are not present in the OpenPBTA dataset, the scores should be interpreted as continuous values representing relative gene alterations and not as probabilities.
+
+ROC curve for TP53 classifier scores for stranded RNA-seq data
+![stranded RNA-seq TP53 classifier ROC](results/stranded_TP53.png)
+
+ROC curve for TP53 classifier scores for polya RNA-seq data
+![polya RNA-seq TP53 classifier ROC](results/polya_TP53.png)
+
+ROC curve for NF1 classifier scores for stranded RNA-seq data
+![stranded RNA-seq NF1 classifier ROC](results/stranded_NF1.png)
+
+ROC curve for NF1 classifier scores for polya RNA-seq data
+![polya RNA-seq NF1 classifier ROC](results/polya_NF1.png)
+
+