-
Notifications
You must be signed in to change notification settings - Fork 67
Add chr22q
loss variable to ATRT molecular subtyping
#414
Changes from all commits
d3d27d5
728b6fa
5750c00
6425ddb
c3da786
8bef332
76c5c1a
f4e9a5c
fedd788
8411796
57768d3
1ac4e91
846d4f2
3a46c7e
dc4c62e
7f01263
b7efffa
c3a5447
40902a8
504e89e
5c65f37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,14 +52,18 @@ if (!dir.exists(results_dir)) { | |
dir.create(results_dir) | ||
} | ||
|
||
# Read in non-subsetted metadata | ||
metadata <- readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv")) | ||
|
||
# Read in ATRT subset metadata | ||
metadata <- | ||
readr::read_tsv(file.path(input_dir, "atrt_histologies.tsv")) | ||
subset_metadata <- metadata %>% | ||
dplyr::filter(short_histology == "ATRT", | ||
sample_type == "Tumor", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Getting rid of the normal and cell line data here. |
||
composition == "Solid Tissue") | ||
|
||
# Select wanted columns in metadata for merging and assign to a new object | ||
select_metadata <- metadata %>% | ||
dplyr::select(sample_id, | ||
Kids_First_Participant_ID, | ||
Kids_First_Biospecimen_ID) | ||
|
||
# Read in ATRT subset GSVA pathway scores | ||
|
@@ -87,6 +91,11 @@ cn_df <- readr::read_tsv( | |
tmb_df <- | ||
data.table::fread(file.path(input_dir, | ||
"atrt_tmb.tsv")) | ||
|
||
# Read in ATRT subset GISTIC data | ||
gistic_df <- | ||
readr::read_tsv(file.path(input_dir, "atrt_gistic_broad_values.tsv")) | ||
|
||
``` | ||
|
||
## Custom Function | ||
|
@@ -151,15 +160,14 @@ infratentorial <- | |
"Other locations NOS;Spinal Cord- Lumbar/Thecal Sac;Spinal Cord- Thoracic;Ventricles" | ||
) | ||
|
||
metadata <- metadata %>% | ||
collapsed_metadata <- subset_metadata %>% | ||
dplyr::mutate( | ||
location_summary = dplyr::case_when( | ||
primary_site %in% infratentorial ~ "infratentorial", | ||
primary_site %in% supratentorial ~ "supratentorial", | ||
TRUE ~ "NA" | ||
) | ||
) %>% | ||
dplyr::group_by(sample_id) %>% | ||
dplyr::select( | ||
sample_id, | ||
Kids_First_Biospecimen_ID, | ||
|
@@ -168,14 +176,19 @@ metadata <- metadata %>% | |
age_at_diagnosis_days, | ||
germline_sex_estimate, | ||
primary_site | ||
) | ||
) %>% | ||
dplyr::group_by( | ||
sample_id, | ||
Kids_First_Participant_ID | ||
) %>% | ||
dplyr::summarize_all(function(x) paste(sort(unique(x)), collapse = ", ")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cbethell if we do this collapse biospecimen IDs step up front, we can drop the biospecimen ID columns as we go since we join on |
||
|
||
# Display metadata subsetted for ATRT samples | ||
metadata %>% | ||
collapsed_metadata %>% | ||
head(n = 15) | ||
``` | ||
|
||
## Filter and join RNA expression, CN, TMB, and ssGSEA data | ||
## Filter and join RNA expression, CN, TMB, ssGSVA and GISTIC data | ||
|
||
### RNA expression data | ||
|
||
|
@@ -241,50 +254,60 @@ long_stranded_expression <- scale(t(filtered_expression), | |
# Merge metadata with expression data | ||
expression_metadata <- long_stranded_expression %>% | ||
as.data.frame() %>% | ||
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>% | ||
dplyr::left_join(select_metadata, by = "Kids_First_Biospecimen_ID") | ||
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>% | ||
dplyr::left_join(select_metadata, by = "Kids_First_Biospecimen_ID") %>% | ||
dplyr::select(-Kids_First_Biospecimen_ID) | ||
|
||
# Display `expression_metadata` | ||
expression_metadata %>% | ||
head(n = 15) | ||
|
||
# Join expression data with metadata filtered for `ATRT` | ||
atrt_expression_df <- metadata %>% | ||
atrt_expression_df <- collapsed_metadata %>% | ||
dplyr::left_join(expression_metadata, | ||
by = "sample_id") | ||
|
||
# Remove data we no longer need | ||
rm(filtered_expression, long_stranded_expression, expression_metadata) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing from the workspace as we go makes it such that we don't have |
||
``` | ||
|
||
### CN data | ||
|
||
```{r} | ||
# Filter focal CN data for SMARCB1 and SMARCA4 status | ||
cn_df <- cn_df %>% | ||
# Filter focal CN data for SMARCB1 and SMARCA4 status | ||
genes_cn_df <- cn_df %>% | ||
dplyr::filter(gene_symbol %in% c("SMARCB1", "SMARCA4")) %>% | ||
dplyr::mutate( | ||
SMARCB1_focal_status = dplyr::case_when(gene_symbol == "SMARCB1" ~ status, | ||
TRUE ~ "neutral"), | ||
SMARCA4_focal_status = dplyr::case_when(gene_symbol == "SMARCA4" ~ status, | ||
TRUE ~ "neutral") | ||
) %>% | ||
dplyr::select(-c("status", "gene_symbol")) %>% | ||
dplyr::distinct() %>% | ||
dplyr::group_by(sample_id) %>% | ||
dplyr::summarise( | ||
SMARCB1_focal_status = paste(sort(unique( | ||
SMARCB1_focal_status | ||
)), collapse = ", "), | ||
SMARCA4_focal_status = paste(sort(unique( | ||
SMARCA4_focal_status | ||
)), collapse = ", ") | ||
) | ||
|
||
# Display `cn_metadata` | ||
cn_df %>% | ||
tidyr::spread(gene_symbol, status, fill = "neutral") %>% | ||
dplyr::rename(SMARCB1_focal_status = SMARCB1, | ||
SMARCA4_focal_status = SMARCA4) | ||
|
||
# add in the samples that did not have any copy number changes for either | ||
jharenza marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# of these genes | ||
missing_cn_samples <- setdiff(unique(cn_df$sample_id), genes_cn_df$sample_id) | ||
genes_cn_df <- subset_metadata %>% | ||
dplyr::filter(sample_id %in% missing_cn_samples) %>% | ||
dplyr::select(sample_id, | ||
Kids_First_Participant_ID, | ||
Kids_First_Biospecimen_ID) %>% | ||
dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>% | ||
# bind rows fills with NAs -- if something is NA, there is no evidence for | ||
# a copy number alteration in that gene and we will call it neutral | ||
dplyr::bind_rows(genes_cn_df) %>% | ||
dplyr::mutate_all(function(x) tidyr::replace_na(x, "neutral")) | ||
|
||
# Display `genes_cn_df` | ||
genes_cn_df %>% | ||
head(n = 15) | ||
|
||
# Join ATRT expression data with focal CN data | ||
atrt_expression_cn_df <- atrt_expression_df %>% | ||
dplyr::left_join(cn_df, by = "sample_id") | ||
dplyr::left_join(dplyr::select(genes_cn_df, | ||
-biospecimen_id, | ||
-Kids_First_Participant_ID), | ||
by = "sample_id") | ||
|
||
# Remove data we don't need | ||
rm(missing_cn_samples, cn_df, atrt_expression_df, genes_cn_df) | ||
``` | ||
|
||
### GSVA data | ||
|
@@ -317,6 +340,9 @@ scaled_gsva %>% | |
atrt_expression_cn_df <- atrt_expression_cn_df %>% | ||
dplyr::left_join(scaled_gsva, | ||
by = "sample_id") | ||
|
||
# Remove data we no longer need | ||
rm(gsva_filtered, gsva_mat, scaled_gsva) | ||
``` | ||
|
||
### Tumor mutation burden data | ||
|
@@ -329,44 +355,82 @@ tmb_df %>% | |
# Join ATRT expression, focal CN data and transposed ssGSEA data with tumor | ||
# mutation burden data | ||
atrt_expression_cn_tmb_df <- atrt_expression_cn_df %>% | ||
dplyr::left_join(tmb_df, by = "sample_id") | ||
dplyr::left_join(dplyr::select(tmb_df, | ||
-Tumor_Sample_Barcode, | ||
-Kids_First_Participant_ID), | ||
by = "sample_id") | ||
|
||
## TODO: Add a column to this data.frame denoting `chr22q` loss using the SV | ||
# data. | ||
# Remove data we no longer need | ||
rm(tmb_df, atrt_expression_cn_df) | ||
``` | ||
|
||
### GISTIC data | ||
|
||
```{r} | ||
gistic_df <- gistic_df %>% | ||
dplyr::select(-Kids_First_Biospecimen_ID) %>% | ||
dplyr::group_by(sample_id) %>% | ||
dplyr::mutate(chr_22q_loss = dplyr::case_when(`22q` == -1 ~ "Yes", | ||
jharenza marked this conversation as resolved.
Show resolved
Hide resolved
|
||
TRUE ~ "No")) %>% | ||
dplyr::select(-`22q`) | ||
|
||
# Display `gistic_df` | ||
gistic_df %>% | ||
head(n = 15) | ||
|
||
# Join GISTIC data with the running final data.frame | ||
final_df <- atrt_expression_cn_tmb_df %>% | ||
dplyr::left_join(gistic_df, by = "sample_id") %>% | ||
dplyr::distinct() | ||
|
||
# Remove data we no longer need | ||
rm(gistic_df, atrt_expression_cn_tmb_df) | ||
``` | ||
|
||
# Save final table of results | ||
|
||
```{r} | ||
# Save final data.frame | ||
final_df <- atrt_expression_cn_tmb_df %>% | ||
dplyr::group_by(sample_id) %>% | ||
dplyr::mutate( | ||
Kids_First_Biospecimen_ID = paste(sort( | ||
unique(c(Kids_First_Biospecimen_ID.x, Kids_First_Biospecimen_ID.y, | ||
Tumor_Sample_Barcode)) | ||
), collapse = ", "), | ||
Kids_First_Participant_ID = paste(sort( | ||
unique(c(Kids_First_Participant_ID.x, Kids_First_Participant_ID.y)) | ||
), collapse = ", ") | ||
) %>% | ||
dplyr::select( | ||
-c( | ||
"Tumor_Sample_Barcode", | ||
"Kids_First_Biospecimen_ID.x", | ||
"Kids_First_Biospecimen_ID.y", | ||
"Kids_First_Participant_ID.x", | ||
"Kids_First_Participant_ID.y" | ||
) | ||
) %>% | ||
final_df <- final_df %>% | ||
dplyr::select( | ||
sample_id, | ||
Kids_First_Biospecimen_ID, | ||
Kids_First_Participant_ID, | ||
age_at_diagnosis_days, | ||
germline_sex_estimate, | ||
primary_site, | ||
location_summary, | ||
chr_22q_loss, | ||
SMARCB1_focal_status, | ||
TYR, | ||
MITF, | ||
DCT, | ||
VEGFA, | ||
DNAH11, | ||
SPEF1, | ||
POU3F4, | ||
POU3F2, | ||
PBX1, | ||
SMARCA4_focal_status, | ||
HALLMARK_NOTCH_SIGNALING, | ||
MYCN, | ||
GLI2, | ||
CDK6, | ||
ASCL1, | ||
ZBTB7A, | ||
MYBL2, | ||
MXI1, | ||
MEIS3, | ||
MEIS2, | ||
MAX, | ||
INSM1, | ||
FOXK1, | ||
HALLMARK_MYC_TARGETS_V1, | ||
HALLMARK_MYC_TARGETS_V2, | ||
dplyr::everything() | ||
) %>% | ||
dplyr::ungroup() | ||
) | ||
|
||
# Write final table to file | ||
readr::write_tsv(final_df, | ||
file.path(results_dir, "ATRT_molecular_subtypes.tsv")) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Filtering by
experimental_strategy == "RNA-Seq"
also effectively drops normal samples, but it also made it such that any ATRT samples missing RNA-seq data got dropped. We don't need to write the ATRT subset histologies file as a TSV because the CI histologies file is always the same file as a data release:OpenPBTA-analysis/analyses/create-subset-files/create_subset_files.sh
Line 41 in 2cbf526
The reason why we can include the full
pbta-histologies.tsv
for testing is two-fold: 1) it's pretty small and 2) having "extra" biospecimens/samples/participants in the histologies file helps test for brittle code.