Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Add chr22q loss variable to ATRT molecular subtyping #414

Merged
merged 21 commits into from
Jan 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
d3d27d5
Add `chr22q` loss variable to ATRT subtypes data.frame
cbethell Jan 8, 2020
728b6fa
Merge branch 'master' into add-chr-22-atrt
cbethell Jan 8, 2020
5750c00
@jaclyn-taroni suggestion to rearrange columns
cbethell Jan 8, 2020
6425ddb
Update `README` to reflect this PR's changes
cbethell Jan 8, 2020
c3da786
Change `NA` values to `neutral`
cbethell Jan 8, 2020
8bef332
Attempt to correctly assign `neutral` to `WGS` samples in focal status
cbethell Jan 8, 2020
76c5c1a
Merge branch 'master' into add-chr-22-atrt
cbethell Jan 9, 2020
f4e9a5c
Don't subset to the RNA-seq samples only
jaclyn-taroni Jan 9, 2020
fedd788
Remove duplicated columns in final table
cbethell Jan 9, 2020
8411796
Tumor samples only just in case
jaclyn-taroni Jan 9, 2020
57768d3
Handle 'missing' copy number samples
jaclyn-taroni Jan 9, 2020
1ac4e91
Clean up a few things
jaclyn-taroni Jan 9, 2020
846d4f2
Use full histologies file
jaclyn-taroni Jan 9, 2020
3a46c7e
We recovered samples w/o RNA-seq data
jaclyn-taroni Jan 9, 2020
dc4c62e
Rerun module
jaclyn-taroni Jan 9, 2020
7f01263
Use different data.frame, save a few lines
jaclyn-taroni Jan 9, 2020
b7efffa
Merge branch 'master' into add-chr-22-atrt
jaclyn-taroni Jan 9, 2020
c3a5447
Add `data.table = FALSE` argument and rerun subset script
cbethell Jan 9, 2020
40902a8
Update README.md
jaclyn-taroni Jan 9, 2020
504e89e
Merge branch 'master' into add-chr-22-atrt
jaclyn-taroni Jan 9, 2020
5c65f37
Merge branch 'master' into add-chr-22-atrt
jaclyn-taroni Jan 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 33 additions & 11 deletions analyses/molecular-subtyping-ATRT/00-subset-files-for-ATRT.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ cn_df <- readr::read_tsv(
"analyses",
"focal-cn-file-preparation",
"results",
"controlfreec_annotated_cn_autosomes.tsv.gz"
"cnvkit_annotated_cn_autosomes.tsv.gz"
)
)

Expand All @@ -81,26 +81,31 @@ tmb_df <-
"data",
"pbta-snv-consensus-mutation-tmb-all.tsv"))

## TODO: Read in the SV data/GISTIC output to evaluate the chr22q loss
# associated with SMARB1 deletions
# Read in GISTIC `broad_values_by_arm.txt` file
gistic_df <-
data.table::fread(unzip(
file.path(root_dir, "data", "pbta-cnv-cnvkit-gistic.zip"),
files = file.path(
"2019-12-10-gistic-results-cnvkit",
"broad_values_by_arm.txt"
),
exdir = file.path(root_dir, "scratch")
), data.table = FALSE)

#### Filter metadata -----------------------------------------------------------

# Filter metadata for `ATRT` and define `location_summary` based on values in
# `primary_site`
atrt_df <- metadata %>%
dplyr::filter(short_histology == "ATRT",
experimental_strategy == "RNA-Seq")

# Write to file
readr::write_tsv(atrt_df, file.path(results_dir, "atrt_histologies.tsv"))
sample_type == "Tumor",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filtering by experimental_strategy == "RNA-Seq" also effectively drops normal samples, but it also made it such that any ATRT samples missing RNA-seq data got dropped. We don't need to write the ATRT subset histologies file as a TSV because the CI histologies file is always the same file as a data release:

cp $FULL_DIRECTORY/pbta-histologies.tsv $SUBSET_DIRECTORY

The reason why we can include the full pbta-histologies.tsv for testing is two-fold: 1) it's pretty small and 2) having "extra" biospecimens/samples/participants in the histologies file helps test for brittle code.

composition == "Solid Tissue")

#### Filter expression data ----------------------------------------------------

# Filter to ATRT samples only -- we can use atrt_df because it is subset to
# RNA-seq samples
stranded_expression <- stranded_expression %>%
dplyr::select(atrt_df$Kids_First_Biospecimen_ID)
dplyr::select(intersect(atrt_df$Kids_First_Biospecimen_ID,
colnames(stranded_expression)))

# Log2 transformation
norm_expression <- log2(stranded_expression + 1)
Expand Down Expand Up @@ -142,8 +147,25 @@ readr::write_tsv(filtered_gsva_scores, file.path(results_dir, "atrt_gsva.tsv"))
tmb_df <- tmb_df %>%
dplyr::select(Tumor_Sample_Barcode, tmb) %>%
dplyr::left_join(select_metadata,
by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
dplyr::filter(sample_id %in% atrt_df$sample_id)

# Write to file
readr::write_tsv(tmb_df, file.path(results_dir, "atrt_tmb.tsv"))

#### Filter GISTIC data --------------------------------------------------------

gistic_df <- gistic_df %>%
tibble::column_to_rownames("Chromosome Arm") %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
dplyr::filter(Kids_First_Biospecimen_ID %in% atrt_df$Kids_First_Biospecimen_ID) %>%
dplyr::left_join(select_metadata, by = "Kids_First_Biospecimen_ID") %>%
dplyr::select(sample_id,
Kids_First_Biospecimen_ID,
`22q`) # Select only the chromosome arm we are interested in

# Write to file
readr::write_tsv(gistic_df,
file.path(results_dir, "atrt_gistic_broad_values.tsv"))
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,18 @@ if (!dir.exists(results_dir)) {
dir.create(results_dir)
}

# Read in non-subsetted metadata
metadata <- readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv"))

# Read in ATRT subset metadata
metadata <-
readr::read_tsv(file.path(input_dir, "atrt_histologies.tsv"))
subset_metadata <- metadata %>%
dplyr::filter(short_histology == "ATRT",
sample_type == "Tumor",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Getting rid of the normal and cell line data here.

composition == "Solid Tissue")

# Select wanted columns in metadata for merging and assign to a new object
select_metadata <- metadata %>%
dplyr::select(sample_id,
Kids_First_Participant_ID,
Kids_First_Biospecimen_ID)

# Read in ATRT subset GSVA pathway scores
Expand Down Expand Up @@ -87,6 +91,11 @@ cn_df <- readr::read_tsv(
tmb_df <-
data.table::fread(file.path(input_dir,
"atrt_tmb.tsv"))

# Read in ATRT subset GISTIC data
gistic_df <-
readr::read_tsv(file.path(input_dir, "atrt_gistic_broad_values.tsv"))

```

## Custom Function
Expand Down Expand Up @@ -151,15 +160,14 @@ infratentorial <-
"Other locations NOS;Spinal Cord- Lumbar/Thecal Sac;Spinal Cord- Thoracic;Ventricles"
)

metadata <- metadata %>%
collapsed_metadata <- subset_metadata %>%
dplyr::mutate(
location_summary = dplyr::case_when(
primary_site %in% infratentorial ~ "infratentorial",
primary_site %in% supratentorial ~ "supratentorial",
TRUE ~ "NA"
)
) %>%
dplyr::group_by(sample_id) %>%
dplyr::select(
sample_id,
Kids_First_Biospecimen_ID,
Expand All @@ -168,14 +176,19 @@ metadata <- metadata %>%
age_at_diagnosis_days,
germline_sex_estimate,
primary_site
)
) %>%
dplyr::group_by(
sample_id,
Kids_First_Participant_ID
) %>%
dplyr::summarize_all(function(x) paste(sort(unique(x)), collapse = ", "))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cbethell if we do this collapse biospecimen IDs step up front, we can drop the biospecimen ID columns as we go since we join on sample_id. This means we don't have to collapse at the end and run the risk of having a bunch of duplicated columns.


# Display metadata subsetted for ATRT samples
metadata %>%
collapsed_metadata %>%
head(n = 15)
```

## Filter and join RNA expression, CN, TMB, and ssGSEA data
## Filter and join RNA expression, CN, TMB, ssGSVA and GISTIC data

### RNA expression data

Expand Down Expand Up @@ -241,50 +254,60 @@ long_stranded_expression <- scale(t(filtered_expression),
# Merge metadata with expression data
expression_metadata <- long_stranded_expression %>%
as.data.frame() %>%
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
dplyr::left_join(select_metadata, by = "Kids_First_Biospecimen_ID")
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
dplyr::left_join(select_metadata, by = "Kids_First_Biospecimen_ID") %>%
dplyr::select(-Kids_First_Biospecimen_ID)

# Display `expression_metadata`
expression_metadata %>%
head(n = 15)

# Join expression data with metadata filtered for `ATRT`
atrt_expression_df <- metadata %>%
atrt_expression_df <- collapsed_metadata %>%
dplyr::left_join(expression_metadata,
by = "sample_id")

# Remove data we no longer need
rm(filtered_expression, long_stranded_expression, expression_metadata)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing from the workspace as we go makes it such that we don't have data.frame we no longer need in memory.

```

### CN data

```{r}
# Filter focal CN data for SMARCB1 and SMARCA4 status
cn_df <- cn_df %>%
# Filter focal CN data for SMARCB1 and SMARCA4 status
genes_cn_df <- cn_df %>%
dplyr::filter(gene_symbol %in% c("SMARCB1", "SMARCA4")) %>%
dplyr::mutate(
SMARCB1_focal_status = dplyr::case_when(gene_symbol == "SMARCB1" ~ status,
TRUE ~ "neutral"),
SMARCA4_focal_status = dplyr::case_when(gene_symbol == "SMARCA4" ~ status,
TRUE ~ "neutral")
) %>%
dplyr::select(-c("status", "gene_symbol")) %>%
dplyr::distinct() %>%
dplyr::group_by(sample_id) %>%
dplyr::summarise(
SMARCB1_focal_status = paste(sort(unique(
SMARCB1_focal_status
)), collapse = ", "),
SMARCA4_focal_status = paste(sort(unique(
SMARCA4_focal_status
)), collapse = ", ")
)

# Display `cn_metadata`
cn_df %>%
tidyr::spread(gene_symbol, status, fill = "neutral") %>%
dplyr::rename(SMARCB1_focal_status = SMARCB1,
SMARCA4_focal_status = SMARCA4)

# add in the samples that did not have any copy number changes for either
jharenza marked this conversation as resolved.
Show resolved Hide resolved
# of these genes
missing_cn_samples <- setdiff(unique(cn_df$sample_id), genes_cn_df$sample_id)
genes_cn_df <- subset_metadata %>%
dplyr::filter(sample_id %in% missing_cn_samples) %>%
dplyr::select(sample_id,
Kids_First_Participant_ID,
Kids_First_Biospecimen_ID) %>%
dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
# bind rows fills with NAs -- if something is NA, there is no evidence for
# a copy number alteration in that gene and we will call it neutral
dplyr::bind_rows(genes_cn_df) %>%
dplyr::mutate_all(function(x) tidyr::replace_na(x, "neutral"))

# Display `genes_cn_df`
genes_cn_df %>%
head(n = 15)

# Join ATRT expression data with focal CN data
atrt_expression_cn_df <- atrt_expression_df %>%
dplyr::left_join(cn_df, by = "sample_id")
dplyr::left_join(dplyr::select(genes_cn_df,
-biospecimen_id,
-Kids_First_Participant_ID),
by = "sample_id")

# Remove data we don't need
rm(missing_cn_samples, cn_df, atrt_expression_df, genes_cn_df)
```

### GSVA data
Expand Down Expand Up @@ -317,6 +340,9 @@ scaled_gsva %>%
atrt_expression_cn_df <- atrt_expression_cn_df %>%
dplyr::left_join(scaled_gsva,
by = "sample_id")

# Remove data we no longer need
rm(gsva_filtered, gsva_mat, scaled_gsva)
```

### Tumor mutation burden data
Expand All @@ -329,44 +355,82 @@ tmb_df %>%
# Join ATRT expression, focal CN data and transposed ssGSEA data with tumor
# mutation burden data
atrt_expression_cn_tmb_df <- atrt_expression_cn_df %>%
dplyr::left_join(tmb_df, by = "sample_id")
dplyr::left_join(dplyr::select(tmb_df,
-Tumor_Sample_Barcode,
-Kids_First_Participant_ID),
by = "sample_id")

## TODO: Add a column to this data.frame denoting `chr22q` loss using the SV
# data.
# Remove data we no longer need
rm(tmb_df, atrt_expression_cn_df)
```

### GISTIC data

```{r}
gistic_df <- gistic_df %>%
dplyr::select(-Kids_First_Biospecimen_ID) %>%
dplyr::group_by(sample_id) %>%
dplyr::mutate(chr_22q_loss = dplyr::case_when(`22q` == -1 ~ "Yes",
jharenza marked this conversation as resolved.
Show resolved Hide resolved
TRUE ~ "No")) %>%
dplyr::select(-`22q`)

# Display `gistic_df`
gistic_df %>%
head(n = 15)

# Join GISTIC data with the running final data.frame
final_df <- atrt_expression_cn_tmb_df %>%
dplyr::left_join(gistic_df, by = "sample_id") %>%
dplyr::distinct()

# Remove data we no longer need
rm(gistic_df, atrt_expression_cn_tmb_df)
```

# Save final table of results

```{r}
# Save final data.frame
final_df <- atrt_expression_cn_tmb_df %>%
dplyr::group_by(sample_id) %>%
dplyr::mutate(
Kids_First_Biospecimen_ID = paste(sort(
unique(c(Kids_First_Biospecimen_ID.x, Kids_First_Biospecimen_ID.y,
Tumor_Sample_Barcode))
), collapse = ", "),
Kids_First_Participant_ID = paste(sort(
unique(c(Kids_First_Participant_ID.x, Kids_First_Participant_ID.y))
), collapse = ", ")
) %>%
dplyr::select(
-c(
"Tumor_Sample_Barcode",
"Kids_First_Biospecimen_ID.x",
"Kids_First_Biospecimen_ID.y",
"Kids_First_Participant_ID.x",
"Kids_First_Participant_ID.y"
)
) %>%
final_df <- final_df %>%
dplyr::select(
sample_id,
Kids_First_Biospecimen_ID,
Kids_First_Participant_ID,
age_at_diagnosis_days,
germline_sex_estimate,
primary_site,
location_summary,
chr_22q_loss,
SMARCB1_focal_status,
TYR,
MITF,
DCT,
VEGFA,
DNAH11,
SPEF1,
POU3F4,
POU3F2,
PBX1,
SMARCA4_focal_status,
HALLMARK_NOTCH_SIGNALING,
MYCN,
GLI2,
CDK6,
ASCL1,
ZBTB7A,
MYBL2,
MXI1,
MEIS3,
MEIS2,
MAX,
INSM1,
FOXK1,
HALLMARK_MYC_TARGETS_V1,
HALLMARK_MYC_TARGETS_V2,
dplyr::everything()
) %>%
dplyr::ungroup()
)

# Write final table to file
readr::write_tsv(final_df,
file.path(results_dir, "ATRT_molecular_subtypes.tsv"))

Expand Down
Loading