AlexsLemonade · jaclyn-taroni · Jan 9, 2020 · Jan 7, 2020 · Jan 8, 2020 · Jan 8, 2020
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -141,6 +141,10 @@ jobs:
        # - run:
           # name: SNV Caller VAF Cutoff Experiment
           # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/snv-callers/vaf_cutoff_experiment.Rmd', clean = TRUE)"
+
+      - run:
+          name: Fusion Summary
+          command: ./scripts/run_in_ci.sh bash "analyses/fusion-summary/run-new-analysis.sh"
 
 
 

diff --git a/analyses/README.md b/analyses/README.md
@@ -19,6 +19,7 @@ Note that _nearly all_ modules use the harmonized clinical data file (`pbta-hist
 | [`create-subset-files`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/create-subset-files) | All files | This module contains the code to create the subset files used in continuous integration | All subset files for continuous integration
 | [`focal-cn-file-preparation`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/focal-cn-file-preparation) | `pbta-cnv-cnvkit.seg.gz`, `pbta-cnv-controlfreec.tsv.gz`, `pbta-gene-expression-rsem-fpkm.polya.rds`, `pbta-gene-expression-rsem-fpkm.stranded.rds` | Maps from copy number variant caller segments to gene identifiers; will eventually be updated to use consensus copy number calls ([#186](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/186))| `cnvkit_annotated_cn_autosomes.tsv.gz`, `cnvkit_annotated_cn_x_and_y.tsv.gz`, `controlfreec_annotated_cn_autosomes.tsv.gz`, `controlfreec_annotated_cn_x_and_y.tsv.gz`
 | [`fusion_filtering`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/fusion_filtering) | `pbta-fusion-arriba.tsv.gz`, `pbta-fusion-starfusion.tsv.gz` | Standardizes, filters, and prioritizes fusion calls | `pbta-fusion-putative-oncogenic.tsv`, `pbta-fusion-recurrent-fusion-byhistology.tsv`, `pbta-fusion-recurrent-fusion-bysample.tsv` (included in data download) 
+| [`fusion-summary`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/fusion-summary) | `pbta-histologies.tsv`, `pbta-fusion-putative-oncogenic.tsv` | Generates summary tables from fusion files ([#398](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/398)) | N/A
 | [`gene-set-enrichment-analysis`](https://github.com/sjspielman/OpenPBTA-analysis/tree/gene_set_analysis/analyses/gene-set-enrichment-analysis) | `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` and `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds`  | *In progress*. Updated gene set enrichment analysis with appropriate RNA-seq expression data | `results/gsva_scores_stranded.tsv` and `results/gsva_scores_polya.tsv` for stranded, polya expression data respectively  
 | [`independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | `pbta-histologies.tsv` | Generates independent specimen lists for WGS/WXS samples | `independent-specimens.wgs.primary.tsv`, `independent-specimens.wgs.primary-plus.tsv`, `independent-specimens.wgswxs.primary.tsv`, `independent-specimens.wgswxs.primary-plus.tsv` (included in data download)
 | [`interaction-plots`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/interaction-plots) | `independent-specimens.wgs.primary-plus.tsv`, `pbta-snv-consensus-mutation.maf.tsv.gz` | Creates interaction plots for mutation mutual exclusivity/co-occurrence [#13](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/13); may be updated to include other data types (e.g., fusions) | N/A

diff --git a/analyses/fusion-summary/01-fusion-summary.R b/analyses/fusion-summary/01-fusion-summary.R
@@ -0,0 +1,141 @@
+#' @description  Generate fusion files specifically for consumption by molecular subtyping analyses
+#' @author Daniel Miller <millerd15@@email.chop.edu> (D3b)
+#' @note Date: January 2020
+
+suppressPackageStartupMessages(library("dplyr"))
+suppressPackageStartupMessages(library("data.table"))
+suppressPackageStartupMessages(library("optparse"))
+
+#' **Filters**
+#'
+#' *Fusion filters*
+#' Every tumor group is described by a pair of character vectors that are
+#' handed to filterFusion() further down in this script:
+#'   - `*Fuses`: complete fusion names, compared against `FusionName`
+#'   - `*Genes`: single gene names, compared against the four gene columns
+#'
+#' 1: Ependymoma tumors -- fusions of interest, plus any fusion that
+#' involves one of the listed genes
+ependFuses <- c(
+  "C11orf95--MAML2", "C11orf95--RELA", "C11orf95--YAP1",
+  "LTBP3--RELA",
+  "PTEN--TAS2R1",
+  "YAP1--FAM118B", "YAP1--MAMLD1", "YAP1--MAMLD2"
+)
+ependGenes <- c("RELA")
+
+#' 2: Embryonal tumors -- fusions of interest, plus any fusion that
+#' involves one of the listed genes
+embryFuses <- c(
+  "CIC--NUTM1",
+  "MN1--BEND2", "MN1--CXXC5"
+)
+embryGenes <- c("FOXR2", "MN1", "TTYH1")
+
+
+#' Generate filtered fusion frame
+#'
+#' Optionally restricts the table to a set of biospecimens, then keeps the
+#' rows that match the requested fusion names and/or genes. A filter whose
+#' argument is not supplied is skipped entirely, so calling the function with
+#' `df` alone returns `df` unchanged.
+#'
+#' @param df Unfiltered fusion data frame. It needs a `Sample` column when
+#'   `bioid` is supplied, a `FusionName` column when `fuses` is supplied, and
+#'   `Gene1A`, `Gene2A`, `Gene1B`, `Gene2B` columns when `genes` is supplied.
+#' @param bioid List of biospecimen IDs; rows whose `Sample` is not in the
+#'   list are dropped
+#' @param fuses List of explicit fusion names, matched exactly against
+#'   `FusionName`
+#' @param genes List of gene names, matched exactly against each of the four
+#'   gene columns
+#' @return the filtered fusion data frame
+filterFusion <- function(df, bioid, fuses, genes) {
+  # dplyr::filter is spelled out so the call cannot be captured by another
+  # attached package (or stats::filter) if the library() order ever changes.
+  if (!missing(bioid)) {
+    df <- dplyr::filter(df, Sample %in% bioid)
+  }
+  # `&&` rather than `&`: this is a scalar condition for `if`, not a mask.
+  if (!missing(fuses) && !missing(genes)) {
+    # Both supplied: a row is kept when EITHER the fusion name OR any one of
+    # its gene columns matches (union, not intersection).
+    df <- dplyr::filter(df, FusionName %in% fuses |
+                          Gene1A %in% genes |
+                          Gene2A %in% genes |
+                          Gene1B %in% genes |
+                          Gene2B %in% genes)
+  } else if (!missing(fuses)) {
+    df <- dplyr::filter(df, FusionName %in% fuses)
+  } else if (!missing(genes)) {
+    df <- dplyr::filter(df,
+                        Gene1A %in% genes |
+                          Gene2A %in% genes |
+                          Gene1B %in% genes |
+                          Gene2B %in% genes)
+  }
+  return(df)
+}
+
+
+#' Write a sample-by-fusion count table as a TSV
+#'
+#' Cross-tabulates `Sample` against `FusionName` and writes the counts to
+#' disk: one row per sample, one column per fusion name, preceded by a
+#' `Kids_First_Biospecimen_ID` column holding the sample identifier.
+#' @param df The filtered fusion data frame
+#' @param fuses List of explicit fusion names; each one gets a column (placed
+#'   first, in the order given) even when it has no rows in `df`
+#' @param outputPath Path of the output file
+#' @return Writes a TSV to the output path
+generateOutput <- function(df, fuses, outputPath) {
+  # Column order: the explicit fusions first, followed by every other fusion
+  # name present in df (sorted); unique() removes names listed twice.
+  observedNames <- sort(as.character(unique(df$FusionName)))
+  fusionLevels <- unique(c(fuses, observedNames))
+  df$FusionName <- factor(df$FusionName, levels = fusionLevels)
+  # Count the rows for each Sample x FusionName pair, then turn the
+  # contingency table into a data frame (samples as row names).
+  countFrame <- as.data.frame.matrix(table(df$Sample, df$FusionName))
+  # Move the row names into a regular leading column; setDT names it "rn"
+  countFrame <- setDT(countFrame, keep.rownames = TRUE)
+  # Give that column its final header
+  isIdColumn <- colnames(countFrame) == "rn"
+  colnames(countFrame)[isIdColumn] <- "Kids_First_Biospecimen_ID"
+  write.table(countFrame, file = outputPath, sep = "\t",
+              quote = FALSE, row.names = FALSE)
+}
+
+
+#' Command-line options: two input paths and one output directory.
+#' The two input paths have no default value.
+optionList <- list(
+  make_option(c("-d", "--demographic_file"),
+              type = "character", default = NULL,
+              help = "Path to the demographic file."),
+  make_option(c("-f", "--fusions_file"),
+              type = "character", default = NULL,
+              help = "Path to the fusions file."),
+  make_option(c("-o", "--output_dir"),
+              type = "character", default = "pbta-consolidated-fusions",
+              help = "Output directory for output files.")
+)
+
+#' Build the parser and read the command line into `opt`
+optionParser <- OptionParser(option_list = optionList)
+opt <- parse_args(optionParser)
+
+#' Check the output directory
+#' recursive = TRUE lets a nested output path be created in a single call.
+if (!dir.exists(opt$output_dir)) {
+  dir.create(opt$output_dir, recursive = TRUE)
+}
+
+#' Check that both input files were supplied and exist
+#' @param path Value of the command-line option (NULL when it was omitted)
+#' @param flag Name of the option, used in the message for an omitted option
+#' @return A one-element character vector describing the problem, or NULL
+#'   when the path is usable
+checkInput <- function(path, flag) {
+  # An omitted option is NULL; file.exists(NULL) is logical(0), which `if`
+  # rejects with an unhelpful error, so it is handled explicitly.
+  if (is.null(path)) {
+    return(paste("Error:", flag, "was not provided"))
+  }
+  if (!file.exists(path)) {
+    return(paste("Error:", path, "does not exist"))
+  }
+  NULL
+}
+
+#' Gather every problem first so that a single run reports all of them;
+#' c() silently drops the NULLs returned for inputs that are fine.
+inputProblems <- c(
+  checkInput(opt$demographic_file, "--demographic_file"),
+  checkInput(opt$fusions_file, "--fusions_file")
+)
+if (length(inputProblems) > 0) {
+  stop(paste(inputProblems, collapse = " \n"))
+}
+#' Load the files (both are parsed as tab-delimited text with a header row)
+#' NOTE(review): `demo` is not referenced again in this script -- confirm
+#' whether the demographic data was meant to feed into the filtering.
+#' NOTE(review): whether text columns are read as factors depends on the R
+#' version's default for stringsAsFactors, which presumably changes which
+#' samples show up as rows in the output tables -- confirm the intended set.
+demo <- read.delim(opt$demographic_file)
+fuse <- read.delim(opt$fusions_file)
+
+#' Filter the fusion table once for each of the two populations
+allFuseEpend <- filterFusion(fuse, fuses = ependFuses, genes = ependGenes)
+allFuseEmbry <- filterFusion(fuse, fuses = embryFuses, genes = embryGenes)
+
+#' Write one summary table per population into the output directory
+ependOutputPath <- file.path(opt$output_dir,
+                             "fusion_summary_ependymoma_foi.tsv")
+embryOutputPath <- file.path(opt$output_dir,
+                             "fusion_summary_embryonal_foi.tsv")
+generateOutput(allFuseEpend, fuses = ependFuses, outputPath = ependOutputPath)
+generateOutput(allFuseEmbry, fuses = embryFuses, outputPath = embryOutputPath)
diff --git a/analyses/fusion-summary/README.md b/analyses/fusion-summary/README.md
@@ -0,0 +1,19 @@
+# Fusion Summary
+
+This module generates summary files for fusions of interest present in biospecimens taken from:
+1. Ependymoma tumors
+2. Embryonal tumors not from ATRT or MB
+
+To generate the tables, simply run:
+```
+./run-new-analysis.sh
+```
+
+## General Use
+
+The program generates files that contain information about the presence or absence of specific fusions or genes participating in fusions.
+These can potentially be used for further downstream molecular subtyping analyses.
+
+### Reference
+
+https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/398