AlexsLemonade · jaclyn-taroni · Jun 9, 2021 · May 25, 2021 · May 26, 2021 · May 26, 2021
diff --git a/analyses/README.md b/analyses/README.md
@@ -43,7 +43,7 @@ Note that _nearly all_ modules use the harmonized clinical data file (`pbta-hist
 | [`molecular-subtyping-pathology`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/molecular-subtyping-pathology) | `analyses/molecular-subtyping-CRANIO/results/CRANIO_molecular_subtype.tsv` <br> `analyses/molecular-subtyping-EPN/results/CRANIO_molecular_subtype.tsv` <br> `analyses/molecular-subtyping-MB/results/MB_molecular_subtype.tsv` <br> `analyses/molecular-subtyping-neurocytoma/results/neurocytoma_subtyping.tsv` <br> `analyses/molecular-subtyping-EWS/results/EWS_samples.tsv` <br> `analyses/molecular-subtyping-HGG/results/HGG_molecular_subtype.tsv` <br> `analyses/molecular-subtyping-LGAT/results/lgat_subtyping.tsv` <br> `analyses/molecular-subtyping-embryonal/results/embryonal_tumor_molecular_subtypes.tsv` <br> `analyses/molecular-subtyping-chordoma/results/chordoma_smarcb1_status.tsv` | Compile output from other molecular subtyping modules and incorporate pathology feedback [#645](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/645) | `results/compiled_molecular_subtyping_with_clinical_feedback.tsv` <br> `results/compiled_molecular_subtypes_with_clinical_pathology_feedback.tsv` <br> `results/compiled_molecular_subtypes_with_clinical_pathology_feedback_and_report_info.tsv`
 | [`mutational-signatures`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/mutational-signatures) | `pbta-snv-consensus-mutation.maf.tsv.gz` | Performs COSMIC and Alexandrov et al. mutational signature analysis using the consensus SNV data | N/A
 | [`mutect2-vs-strelka2`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/mutect2-vs-strelka2) | `pbta-snv-mutect2.vep.maf.gz` <br> `pbta-snv-strelka2.vep.maf.gz` | *Deprecated*; comparison of only two SNV callers, subsumed by `snv-callers` | N/A
-| [`oncoprint-landscape`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/oncoprint-landscape) | `pbta-snv-consensus-mutation.maf.tsv.gz` <br> `pbta-fusion-putative-oncogenic.tsv` <br> `analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz` <br> `independent-specimens.*` | Combines mutation, copy number, and fusion data into an OncoPrint plot ([#6](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/6)); will need to be updated as all data types are refined | N/A
+| [`oncoprint-landscape`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/oncoprint-landscape) | `pbta-snv-consensus-mutation.maf.tsv.gz` <br> `pbta-fusion-putative-oncogenic.tsv` <br> `analyses/focal-cn-file-preparation/results/consensus_seg_annotated_cn_autosomes.tsv.gz` <br> `analyses/focal-cn-file-preparation/results/consensus_seg_annotated_cn_x_and_y.tsv.gz` <br> `independent-specimens.*` | Combines mutation, copy number, and fusion data into an OncoPrint plot ([#6](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/6)); will need to be updated as all data types are refined | N/A
 | [`rna-seq-composition`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/rna-seq-composition) | `pbta-gene-expression-rsem-tpm.stranded.rds` <br> `pbta-histologies.tsv` <br> `pbta-mend-qc-results.tar.gz` <br> `pbta-mend-qc-manifest.tsv` <br> `pbta-star-log-manifest.tsv` <br> `pbta-star-log-final.tar.gz` | Analyzes the fraction of read types that comprise each RNA-Seq sample; flags samples with unusual composition| N/A
 | [`run-gistic`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/run-gistic) | `pbta-histologies.tsv` <br> `pbta-cnv-consensus.seg.gz` | Runs GISTIC 2.0 on SEG files | `pbta-cnv-consensus-gistic.zip` (included in data download)
 | [`sample-distribution-analysis`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/sample-distribution-analysis) | `pbta-histologies.tsv` | Produces plots and tables that illustrate the distribution of different histologies in the PBTA data | N/A

diff --git a/analyses/oncoprint-landscape/01-map-to-sample_id.R b/analyses/oncoprint-landscape/01-map-to-sample_id.R
@@ -34,10 +34,16 @@ option_list <- list(
     help = "file path to MAF file that contains SNV information",
   ),
   optparse::make_option(
-    c("--cnv_file"),
+    c("--cnv_autosomes_file"),
     type = "character",
     default = NULL,
-    help = "file path to file that contains CNV information"
+    help = "file path to file that contains autosome chromosome CNV information"
+  ),
+  optparse::make_option(
+    c("--cnv_xy_file"),
+    type = "character",
+    default = NULL,
+    help = "file path to file that contains X and Y chromosome CNV information"
   ),
   optparse::make_option(
     c("--fusion_file"),
@@ -93,7 +99,13 @@ cnv_output <- file.path(output_dir, paste0(opt$filename_lead, "_cnv.tsv"))
 histologies_df <- readr::read_tsv(opt$metadata_file, guess_max = 10000)
 
 maf_df <- readr::read_tsv(opt$maf_file)
-cnv_df <- readr::read_tsv(opt$cnv_file)
+opt$cnv_autosomes_file
+cnv_autosomes_df <- readr::read_tsv(opt$cnv_autosomes_file) %>%
+  left_join(select(histologies_df,c("Kids_First_Biospecimen_ID","germline_sex_estimate")),
+                   by=c("biospecimen_id"="Kids_First_Biospecimen_ID")
+            )
+cnv_xy_df <- readr::read_tsv(opt$cnv_xy_file) 
+cnv_df <- rbind(cnv_autosomes_df,cnv_xy_df)
 fusion_df <- readr::read_tsv(opt$fusion_file)
 
 #### Get rid of ambiguous and non-tumor samples --------------------------------
@@ -127,7 +139,7 @@ biospecimens_to_remove <- unique(c(ambiguous_biospecimens,
 maf_df <- maf_df %>%
   dplyr::filter(!(Tumor_Sample_Barcode %in% biospecimens_to_remove))
 cnv_df <- cnv_df %>%
-  dplyr::filter(!(Kids_First_Biospecimen_ID %in% biospecimens_to_remove))
+  dplyr::filter(!(biospecimen_id %in% biospecimens_to_remove))
 fusion_df <- fusion_df %>%
   dplyr::filter(!(Sample %in% biospecimens_to_remove))
 
@@ -143,7 +155,7 @@ if (!is.null(opt$independent_specimens)) {
   maf_df <- maf_df %>%
     filter(Tumor_Sample_Barcode %in% ind_biospecimen)
   cnv_df <- cnv_df %>%
-    filter(Kids_First_Biospecimen_ID %in% ind_biospecimen)
+    filter(biospecimen_id %in% ind_biospecimen)
 
   # for the RNA-seq samples, we need to map from the sample identifier
   # associated with the independent specimen and back to a biospecimen ID
@@ -283,12 +295,17 @@ cnv_df <- cnv_df %>%
   inner_join(select(histologies_df,
                     Kids_First_Biospecimen_ID,
                     sample_id),
-             by = "Kids_First_Biospecimen_ID") %>%
-  filter(status != "uncallable") %>%
+             by = c("biospecimen_id"="Kids_First_Biospecimen_ID")) %>%
   mutate(Tumor_Sample_Barcode =  sample_id) %>%
   rename(Variant_Classification = status,
-         Hugo_Symbol = region) %>%
-  select(Hugo_Symbol, Tumor_Sample_Barcode, Variant_Classification)
+         Hugo_Symbol = gene_symbol) %>%
+  select(Hugo_Symbol, Tumor_Sample_Barcode, Variant_Classification) %>%
+  # mutate loss and amplification to Del and Amp to fit Maftools format
+  dplyr::mutate(Variant_Classification = dplyr::case_when(Variant_Classification == "loss" ~ "Del",
+                                                          Variant_Classification == "amplification" ~ "Amp",
+                                                          TRUE ~ as.character(Variant_Classification))) %>%
+  # only keep Del and Amp calls
+  filter(Variant_Classification %in% c("Del", "Amp"))
 
 # Write to file
 readr::write_tsv(cnv_df, cnv_output)
diff --git a/analyses/oncoprint-landscape/02-plot-oncoprint.R b/analyses/oncoprint-landscape/02-plot-oncoprint.R
@@ -145,10 +145,7 @@ if (!opt$include_introns) {
 
 # Read in cnv file
 if (!is.null(opt$cnv_file)) {
-  cnv_df <- readr::read_tsv(opt$cnv_file) %>%
-    dplyr::mutate(Variant_Classification = dplyr::case_when(Variant_Classification == "loss" ~ "Del",
-                                                            Variant_Classification %in% c("gain", "amplification") ~ "Amp",
-                                                            TRUE ~ as.character(Variant_Classification)))
+  cnv_df <- readr::read_tsv(opt$cnv_file) 
 }
 
 # Read in fusion file and join
@@ -245,6 +242,7 @@ maf_object <- prepare_maf_object(
   fusion_df = fusion_df
 )
 
+
 #### Subset MAF Object (Optional)----------------------------------------------
 
 # Code here is specifically adapted from:

diff --git a/analyses/oncoprint-landscape/README.md b/analyses/oncoprint-landscape/README.md
@@ -16,28 +16,28 @@ bash run-oncoprint.sh
 
 ### Folder content
 
-* `00-map-to-sample_id.R` prepares MAF, focal CN (from the "most focal" output of the `focal-cn-file-preparation` module), and standardized fusion files for use with `01-plot-oncoprint.R`. 
+* `01-map-to-sample_id.R` prepares MAF, focal CN (from the annotated autosome and X/Y consensus SEG outputs of the `focal-cn-file-preparation` module), and standardized fusion files for use with `02-plot-oncoprint.R`.
   * The `Tumor_Sample_Barcode` column in the output corresponds to the `sample_id` column in the histologies file
   * We remove ambiguous `sample_id` -- i.e., where there are more than two tumor biospecimens that map to the same sample identifier.
   * Filtering via an [independent specimen file](https://alexslemonade.github.io/OpenPBTA-manuscript/#selection-of-independent-samples) is optional, but highly recommended.
-* `01-plot-oncoprint.R` takes the files from above and optionally a file or set of files (to be concatenated) that will restrict the set of genes that are being plotted in an OncoPrint.
+* `02-plot-oncoprint.R` takes the files from above and optionally a file or set of files (to be concatenated) that will restrict the set of genes that are being plotted in an OncoPrint.
 	* Running this via `run-oncoprint.sh` will restrict plotting to a list of top mutated genes (generated in `analyses/interaction-plots/scripts/01-disease-specimen-lists.R`) and top genes with recurrent CNVs (generated in `analyses/focal-cn-file-preparation/06-find-recurrent-calls.Rmd`)
 
 
 ### Folder Structure
 
 ```
-├── 00-map-to-sample_id.R
-├── 01-plot-oncoprint.R
+├── 01-map-to-sample_id.R
+├── 02-plot-oncoprint.R
 ├── README.md
 ├── driver-lists
 │   ├── brain-goi-list-long.txt
 │   └── brain-goi-list-short.txt
 ├── plots
-│   ├── all_participants_primary-plus_goi_oncoprint.png
-│   ├── all_participants_primary-plus_oncoprint.png
-│   ├── all_participants_primary_only_goi_oncoprint.png
-│   └── all_participants_primary_only_oncoprint.png
+│   ├── primary-plus_goi_oncoprint.png
+│   ├── primary-plus_oncoprint.png
+│   ├── primary_only_goi_oncoprint.png
+│   └── primary_only_oncoprint.png
 ├── run-oncoprint.sh
 └── util
     └── oncoplot-functions.R

diff --git a/analyses/oncoprint-landscape/plots/primary-plus_embryonal_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_embryonal_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_embryonal_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_embryonal_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_ependymal_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_ependymal_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_ependymal_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_ependymal_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_hgat_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_hgat_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_hgat_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_hgat_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_lgat_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_lgat_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_lgat_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_lgat_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_other_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_other_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary-plus_other_oncoprint.png b/analyses/oncoprint-landscape/plots/primary-plus_other_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_embryonal_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_embryonal_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_embryonal_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_embryonal_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_ependymal_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_ependymal_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_ependymal_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_ependymal_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_hgat_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_hgat_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_hgat_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_hgat_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_lgat_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_lgat_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_lgat_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_lgat_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_other_goi_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_other_goi_oncoprint.png
diff --git a/analyses/oncoprint-landscape/plots/primary_only_other_oncoprint.png b/analyses/oncoprint-landscape/plots/primary_only_other_oncoprint.png
diff --git a/analyses/oncoprint-landscape/run-oncoprint.sh b/analyses/oncoprint-landscape/run-oncoprint.sh
@@ -22,7 +22,8 @@ intermediate_directory=../../scratch/oncoprint_files
 primary_filename="primary_only"
 primaryplus_filename="primary-plus"
 focal_directory=../focal-cn-file-preparation/results
-focal_cnv_file=${focal_directory}/consensus_seg_most_focal_cn_status.tsv.gz
+consensus_seg_autosomes_cnv_file=${focal_directory}/consensus_seg_annotated_cn_autosomes.tsv.gz
+consensus_seg_cnv_xy_cnv_file=${focal_directory}/consensus_seg_annotated_cn_x_and_y.tsv.gz
 oncoprint_data_directory=data
 
 #### Prep genes of interest lists ----------------------------------------------
@@ -37,7 +38,8 @@ Rscript --vanilla 00-prepare-goi-lists.R
 
 Rscript --vanilla 01-map-to-sample_id.R \
   --maf_file ${maf_consensus} \
-  --cnv_file ${focal_cnv_file} \
+  --cnv_autosomes_file ${consensus_seg_autosomes_cnv_file} \
+  --cnv_xy_file ${consensus_seg_cnv_xy_cnv_file} \
   --fusion_file ${fusion_file} \
   --metadata_file ${histologies_file} \
   --output_directory ${intermediate_directory} \
@@ -48,7 +50,8 @@ Rscript --vanilla 01-map-to-sample_id.R \
 
 Rscript --vanilla 01-map-to-sample_id.R \
   --maf_file ${maf_consensus} \
-  --cnv_file ${focal_cnv_file} \
+  --cnv_autosomes_file ${consensus_seg_autosomes_cnv_file} \
+  --cnv_xy_file ${consensus_seg_cnv_xy_cnv_file} \
   --fusion_file ${fusion_file} \
   --metadata_file ${histologies_file} \
   --output_directory ${intermediate_directory} \