Ependymoma subtyping missing WGS only samples (#860)

* adding only WGS EPN samples to subset file
* removing unintended files
* add index to zscore array
* removing checks
* editing comments
* adding NA instead of blank
* removing subset rna dna and update fill_df_with_fpkm_zscores()

Co-authored-by: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
AlexsLemonade · Jan 10, 2021 · 608e905 · 608e905
1 parent 6fefa87
commit 608e905
Show file tree

Hide file tree

Showing 6 changed files with 1,182 additions and 254 deletions.
diff --git a/analyses/molecular-subtyping-EPN/01-make_notebook_RNAandDNA.py b/analyses/molecular-subtyping-EPN/01-make_notebook_RNAandDNA.py
@@ -41,17 +41,10 @@ def group_disease(primary_site):
 
 # Filtering for ependymoma samples 
 EP = pbta_histologies[pbta_histologies["pathology_diagnosis"]=="Ependymoma"]
-EP_rnaseq_samples = EP[EP["experimental_strategy"] == "RNA-Seq"][["Kids_First_Biospecimen_ID", "primary_site", 
-	"Kids_First_Participant_ID", "sample_id", "experimental_strategy"]]
-EP_rnaseq_samples["disease_group"] = [group_disease(primary) for primary in EP_rnaseq_samples["primary_site"]]
-
-# List with only RNA samples
-EP_rnasamplenames_PTIDs = list(EP_rnaseq_samples["Kids_First_Participant_ID"]) 
+EP_rnaseq_samples = EP[EP["experimental_strategy"] == "RNA-Seq"][["Kids_First_Biospecimen_ID","Kids_First_Participant_ID", "sample_id","primary_site"]]
 
 # Filtering for DNA samples 
-all_WGS = EP[EP["experimental_strategy"]=="WGS"]
-WGSPT = all_WGS[all_WGS["Kids_First_Participant_ID"].isin(EP_rnasamplenames_PTIDs)]
-WGS_dnaseqsamples = WGSPT[["Kids_First_Biospecimen_ID", "Kids_First_Participant_ID", "sample_id"]]
+WGS_dnaseqsamples = EP[EP["experimental_strategy"] == "WGS"][["Kids_First_Biospecimen_ID", "Kids_First_Participant_ID", "sample_id","primary_site"]]
 
 
 # Renaming the column name so they don't conflict in merge step 
@@ -62,10 +55,13 @@ def group_disease(primary_site):
 # sample_id is common between both dataframes and also unique between RNA and DNA.
 # Some DNA BSIDs are missing for the corresponding RNA samples
 EP_rnaseq_WGS = EP_rnaseq_samples.merge(WGS_dnaseqsamples, 
-                                        on = ["sample_id", "Kids_First_Participant_ID"], 
+                                        on = ["sample_id", "Kids_First_Participant_ID","primary_site"], 
                                         how = "outer")
 EP_rnaseq_WGS.fillna('NA', inplace=True)
 
+# add disease group inferred from primary_site
+EP_rnaseq_WGS["disease_group"] = [group_disease(primary) for primary in EP_rnaseq_WGS["primary_site"]]
+
 # Sort for consistency
 EP_rnaseq_WGS = EP_rnaseq_WGS.sort_values(by = ["Kids_First_Participant_ID", "sample_id"])
 

diff --git a/analyses/molecular-subtyping-EPN/02_ependymoma_generate_all_data.py b/analyses/molecular-subtyping-EPN/02_ependymoma_generate_all_data.py
@@ -74,7 +74,7 @@
 focal_cn_gene_CDKN2A = focal_cn_gene_CDKN2A.set_index("biospecimen_id")
 
 # Reading the input into a dataframe
-EPN_notebook = pd.read_csv(args.notebook, sep="\t")
+EPN_notebook = pd.read_csv(args.notebook, sep="\t",index_col=False)
 
 
 
@@ -112,9 +112,13 @@ def broad_CNA_fill_df(row, CNA, arm, loss_gain):
 
 # Function to generate Z-scores column for every gene
 def fill_df_with_fpkm_zscores(df, fpkmdf, gene_name):
-    zscore_list = stats.zscore(np.array(df.apply(lambda x: fpkmdf.loc[gene_name, x["Kids_First_Biospecimen_ID_RNA"]], axis=1)))
+    zscore_list = stats.zscore(np.array(df.loc[df["Kids_First_Biospecimen_ID_RNA"].notna(),:].apply(lambda x: fpkmdf.loc[gene_name, x["Kids_First_Biospecimen_ID_RNA"]], axis=1)))
     column_name = gene_name + "_expr_zscore"
-    df[column_name] = pd.Series(zscore_list)
+    # add z-score array to df_rna column_name
+    df.loc[df["Kids_First_Biospecimen_ID_RNA"].notna(),column_name] = pd.Series(zscore_list,index=df[df["Kids_First_Biospecimen_ID_RNA"].notna()].index.array)
+    # add NA to expression zscore columns for dna only samples
+    df.loc[df["Kids_First_Biospecimen_ID_RNA"].isna(),column_name] = np.nan
+
     return(df)
 
 
@@ -176,10 +180,11 @@ def fill_df_with_fpkm_zscores(df, fpkmdf, gene_name):
 
 # Adding Z-scores to dataframe
 expression_cols = ["RELA", "L1CAM", "ARL4D", "CLDN1", "CXorf67", "TKTL1", "GPBP1", "IFT46"]
+
 for gene in expression_cols:
+    # rna matched dna samples
     EPN_notebook = fill_df_with_fpkm_zscores(EPN_notebook, fpkm_df, gene)
-
-
+
 # Replacing all NaN values with NA so they are not empty when writing to a file
 EPN_notebook = EPN_notebook.replace(np.nan, 'NA', regex=True)
 # Sort

diff --git a/analyses/molecular-subtyping-EPN/03-subgrouping_samples.html b/analyses/molecular-subtyping-EPN/03-subgrouping_samples.html
@@ -13591,7 +13591,7 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
   </thead>
   <tbody>
     <tr>
-      <th>79</th>
+      <th>83</th>
       <td>PT_W17NV5YG</td>
       <td>7316-2079</td>
       <td>BS_FVPMPMRJ</td>
@@ -13720,7 +13720,7 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
   </thead>
   <tbody>
     <tr>
-      <th>87</th>
+      <th>90</th>
       <td>PT_ZA95JQEB</td>
       <td>7316-384</td>
       <td>BS_ZZJF26C4</td>
@@ -13744,7 +13744,7 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
       <td>PT_EPN_A</td>
     </tr>
     <tr>
-      <th>91</th>
+      <th>94</th>
       <td>PT_ZZRBX5JT</td>
       <td>7316-3319</td>
       <td>BS_9N3B3HZB</td>
@@ -14220,8 +14220,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
       <td>-1.320692</td>
       <td>-0.849661</td>
       <td></td>
-      <td>0.022388</td>
-      <td>-0.307692</td>
+      <td>0.036298</td>
+      <td>-0.226415</td>
     </tr>
     <tr>
       <th>1</th>
@@ -14268,8 +14268,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
       <td>0.388599</td>
       <td>-0.282618</td>
       <td></td>
-      <td>0.283582</td>
-      <td>0.230769</td>
+      <td>0.290381</td>
+      <td>0.301887</td>
     </tr>
     <tr>
       <th>3</th>
@@ -14292,8 +14292,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
       <td>-1.125484</td>
       <td>-0.701807</td>
       <td></td>
-      <td>-0.216418</td>
-      <td>-0.307692</td>
+      <td>-0.196007</td>
+      <td>-0.226415</td>
     </tr>
     <tr>
       <th>4</th>
@@ -14316,8 +14316,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
       <td>-1.123385</td>
       <td>0.013901</td>
       <td></td>
-      <td>-0.589552</td>
-      <td>0.153846</td>
+      <td>-0.558984</td>
+      <td>0.226415</td>
     </tr>
   </tbody>
 </table>
@@ -14336,7 +14336,10 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
 <div class="prompt input_prompt">In&nbsp;[13]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">outfile</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">out</span><span class="p">:</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Replacing all Nan values with NA so they are not empty when writing to a file</span>
+<span class="n">EPN_final</span> <span class="o">=</span> <span class="n">EPN_final</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">,</span> <span class="s1">&#39;NA&#39;</span><span class="p">,</span> <span class="n">regex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+
+<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">outfile</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">out</span><span class="p">:</span>
     <span class="n">EPN_final</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">out</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s2">&quot;</span><span class="se">\t</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
 </pre></div>
 
@@ -14388,7 +14391,7 @@ <h2 id="Summary">Summary<a class="anchor-link" href="#Summary">&#182;</a></h2>
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>There are a total of 94 samples out of which 59 samples were not assigned any subgroup
+<pre>There are a total of 97 samples out of which 62 samples were not assigned any subgroup
 Number of samples under each subgroup
 ST_EPN_RELA : 29
 ST_EPN_YAP1 : 4