Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Commit

Permalink
Ependymoma subtyping missing WGS only samples (#860)
Browse files Browse the repository at this point in the history
* adding only WGS EPN samples to subset file

* removing unintended files

* add index to zscore array

* removing checks

* editing comments

* adding NA instead of blank

* removing subset rna dna and update fill_df_with_fpkm_zscores()

Co-authored-by: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
  • Loading branch information
kgaonkar6 and jaclyn-taroni authored Jan 10, 2021
1 parent 6fefa87 commit 608e905
Show file tree
Hide file tree
Showing 6 changed files with 1,182 additions and 254 deletions.
16 changes: 6 additions & 10 deletions analyses/molecular-subtyping-EPN/01-make_notebook_RNAandDNA.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,10 @@ def group_disease(primary_site):

# Filtering for ependymoma samples
EP = pbta_histologies[pbta_histologies["pathology_diagnosis"]=="Ependymoma"]
EP_rnaseq_samples = EP[EP["experimental_strategy"] == "RNA-Seq"][["Kids_First_Biospecimen_ID", "primary_site",
"Kids_First_Participant_ID", "sample_id", "experimental_strategy"]]
EP_rnaseq_samples["disease_group"] = [group_disease(primary) for primary in EP_rnaseq_samples["primary_site"]]

# List with only RNA samples
EP_rnasamplenames_PTIDs = list(EP_rnaseq_samples["Kids_First_Participant_ID"])
EP_rnaseq_samples = EP[EP["experimental_strategy"] == "RNA-Seq"][["Kids_First_Biospecimen_ID","Kids_First_Participant_ID", "sample_id","primary_site"]]

# Filtering for DNA samples
all_WGS = EP[EP["experimental_strategy"]=="WGS"]
WGSPT = all_WGS[all_WGS["Kids_First_Participant_ID"].isin(EP_rnasamplenames_PTIDs)]
WGS_dnaseqsamples = WGSPT[["Kids_First_Biospecimen_ID", "Kids_First_Participant_ID", "sample_id"]]
WGS_dnaseqsamples = EP[EP["experimental_strategy"] == "WGS"][["Kids_First_Biospecimen_ID", "Kids_First_Participant_ID", "sample_id","primary_site"]]


# Renaming the column name so they don't conflict in merge step
Expand All @@ -62,10 +55,13 @@ def group_disease(primary_site):
# sample_id is common between both dataframes and also unique between RNA and DNA.
# Some DNA BSIDs are missing for the corresponding RNA samples
EP_rnaseq_WGS = EP_rnaseq_samples.merge(WGS_dnaseqsamples,
on = ["sample_id", "Kids_First_Participant_ID"],
on = ["sample_id", "Kids_First_Participant_ID","primary_site"],
how = "outer")
EP_rnaseq_WGS.fillna('NA', inplace=True)

# add disease group inferred from primary_site
EP_rnaseq_WGS["disease_group"] = [group_disease(primary) for primary in EP_rnaseq_WGS["primary_site"]]

# Sort for consistency
EP_rnaseq_WGS = EP_rnaseq_WGS.sort_values(by = ["Kids_First_Participant_ID", "sample_id"])

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
focal_cn_gene_CDKN2A = focal_cn_gene_CDKN2A.set_index("biospecimen_id")

# Reading the input in a dataframe
EPN_notebook = pd.read_csv(args.notebook, sep="\t")
EPN_notebook = pd.read_csv(args.notebook, sep="\t",index_col=False)



Expand Down Expand Up @@ -112,9 +112,13 @@ def broad_CNA_fill_df(row, CNA, arm, loss_gain):

# Function to generate Z-scores column for every gene
def fill_df_with_fpkm_zscores(df, fpkmdf, gene_name):
zscore_list = stats.zscore(np.array(df.apply(lambda x: fpkmdf.loc[gene_name, x["Kids_First_Biospecimen_ID_RNA"]], axis=1)))
zscore_list = stats.zscore(np.array(df.loc[df["Kids_First_Biospecimen_ID_RNA"].notna(),:].apply(lambda x: fpkmdf.loc[gene_name, x["Kids_First_Biospecimen_ID_RNA"]], axis=1)))
column_name = gene_name + "_expr_zscore"
df[column_name] = pd.Series(zscore_list)
# add the z-scores to column_name for the rows that have an RNA biospecimen ID
df.loc[df["Kids_First_Biospecimen_ID_RNA"].notna(),column_name] = pd.Series(zscore_list,index=df[df["Kids_First_Biospecimen_ID_RNA"].notna()].index.array)
# set the expression z-score column to NaN for DNA-only samples (no RNA biospecimen ID)
df.loc[df["Kids_First_Biospecimen_ID_RNA"].isna(),column_name] = np.nan

return(df)


Expand Down Expand Up @@ -176,10 +180,11 @@ def fill_df_with_fpkm_zscores(df, fpkmdf, gene_name):

# Adding Z-scores to dataframe
expression_cols = ["RELA", "L1CAM", "ARL4D", "CLDN1", "CXorf67", "TKTL1", "GPBP1", "IFT46"]

for gene in expression_cols:
# rna matched dna samples
EPN_notebook = fill_df_with_fpkm_zscores(EPN_notebook, fpkm_df, gene)



# Replacing all NaN values with NA so they are not empty when writing to a file
EPN_notebook = EPN_notebook.replace(np.nan, 'NA', regex=True)
# Sort
Expand Down
29 changes: 16 additions & 13 deletions analyses/molecular-subtyping-EPN/03-subgrouping_samples.html
Original file line number Diff line number Diff line change
Expand Up @@ -13591,7 +13591,7 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
</thead>
<tbody>
<tr>
<th>79</th>
<th>83</th>
<td>PT_W17NV5YG</td>
<td>7316-2079</td>
<td>BS_FVPMPMRJ</td>
Expand Down Expand Up @@ -13720,7 +13720,7 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
</thead>
<tbody>
<tr>
<th>87</th>
<th>90</th>
<td>PT_ZA95JQEB</td>
<td>7316-384</td>
<td>BS_ZZJF26C4</td>
Expand All @@ -13744,7 +13744,7 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
<td>PT_EPN_A</td>
</tr>
<tr>
<th>91</th>
<th>94</th>
<td>PT_ZZRBX5JT</td>
<td>7316-3319</td>
<td>BS_9N3B3HZB</td>
Expand Down Expand Up @@ -14220,8 +14220,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
<td>-1.320692</td>
<td>-0.849661</td>
<td></td>
<td>0.022388</td>
<td>-0.307692</td>
<td>0.036298</td>
<td>-0.226415</td>
</tr>
<tr>
<th>1</th>
Expand Down Expand Up @@ -14268,8 +14268,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
<td>0.388599</td>
<td>-0.282618</td>
<td></td>
<td>0.283582</td>
<td>0.230769</td>
<td>0.290381</td>
<td>0.301887</td>
</tr>
<tr>
<th>3</th>
Expand All @@ -14292,8 +14292,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
<td>-1.125484</td>
<td>-0.701807</td>
<td></td>
<td>-0.216418</td>
<td>-0.307692</td>
<td>-0.196007</td>
<td>-0.226415</td>
</tr>
<tr>
<th>4</th>
Expand All @@ -14316,8 +14316,8 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
<td>-1.123385</td>
<td>0.013901</td>
<td></td>
<td>-0.589552</td>
<td>0.153846</td>
<td>-0.558984</td>
<td>0.226415</td>
</tr>
</tbody>
</table>
Expand All @@ -14336,7 +14336,10 @@ <h3 id="Processing-EPN-Samples">Processing EPN Samples<a class="anchor-link" hre
<div class="prompt input_prompt">In&nbsp;[13]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">outfile</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">out</span><span class="p">:</span>
<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Replacing all Nan values with NA so they are not empty when writing to a file</span>
<span class="n">EPN_final</span> <span class="o">=</span> <span class="n">EPN_final</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">,</span> <span class="s1">&#39;NA&#39;</span><span class="p">,</span> <span class="n">regex</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">outfile</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">out</span><span class="p">:</span>
<span class="n">EPN_final</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">out</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s2">&quot;</span><span class="se">\t</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
</pre></div>

Expand Down Expand Up @@ -14388,7 +14391,7 @@ <h2 id="Summary">Summary<a class="anchor-link" href="#Summary">&#182;</a></h2>


<div class="output_subarea output_stream output_stdout output_text">
<pre>There are a total of 94 samples out of which 59 samples were not assigned any subgroup
<pre>There are a total of 97 samples out of which 62 samples were not assigned any subgroup
Number of samples under each subgroup
ST_EPN_RELA : 29
ST_EPN_YAP1 : 4
Expand Down
Loading

0 comments on commit 608e905

Please sign in to comment.