AlexsLemonade · jaclyn-taroni · Mar 23, 2020 · Mar 9, 2020 · Mar 9, 2020 · Mar 11, 2020
diff --git a/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.Rmd b/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.Rmd
@@ -146,3 +146,31 @@ add_status_df %>%
 output_file <- file.path("..", "..", "scratch", "consensus_seg_with_status.tsv")
 write_tsv(add_status_df, output_file)
 ```
+
+### Prepare separate bed files for losses/gains for bedtools coverage function
+
+```{r}
+bed_status_df <- add_status_df %>%
+  select(chrom, loc.start, loc.end, everything())
+
+losses_bed_status_df <- add_status_df %>%
+  select(chrom, loc.start, loc.end, everything()) %>%
+  filter(status == "loss")
+
+gains_bed_status_df <- add_status_df %>%
+  select(chrom, loc.start, loc.end, everything()) %>%
+  filter(status == "gain")
-bed_status_df <- add_status_df %>%
-  select(chrom, loc.start, loc.end, everything())
-
-losses_bed_status_df <- add_status_df %>%
-  select(chrom, loc.start, loc.end, everything()) %>%
-  filter(status == "loss")
-
-gains_bed_status_df <- add_status_df %>%
-  select(chrom, loc.start, loc.end, everything()) %>%
-  filter(status == "gain")
+bed_status_df <- add_status_df %>%
+  select(chrom, loc.start, loc.end, everything()) %>%
+  arrange(chrom, loc.start, loc.end)
+
+losses_bed_status_df <- bed_status_df %>%
+  filter(status == "loss")
+
+gains_bed_status_df <- bed_status_df %>%
+  filter(status == "gain")
-bed_status_df <- add_status_df %>%
-  select(chrom, loc.start, loc.end, everything())
-
-losses_bed_status_df <- add_status_df %>%
-  select(chrom, loc.start, loc.end, everything()) %>%
-  filter(status == "loss")
-
-gains_bed_status_df <- add_status_df %>%
-  select(chrom, loc.start, loc.end, everything()) %>%
-  filter(status == "gain")
+bed_status_df <- add_status_df %>%
+  select(chrom, loc.start, loc.end, everything()) %>%
+  arrange(chrom, loc.start, loc.end)
+
+losses_bed_status_df <- bed_status_df %>%
+  filter(status == "loss")
+
+gains_bed_status_df <- bed_status_df %>%
+  filter(status == "gain")
+```
+
+### Write to file
+
+```{r}
+bed_output_file <- file.path("..", "..", "scratch", "consensus_seg_with_status.bed")
+loss_bed_output_file <- file.path("..", "..", "scratch", "consensus_seg_with_status_losses.bed")
+gain_bed_output_file <- file.path("..", "..", "scratch", "consensus_seg_with_status_gains.bed")
+# BED format has no column-name row, so write the data rows only for bedtools
+write_tsv(bed_status_df, bed_output_file, col_names = FALSE)
+write_tsv(losses_bed_status_df, loss_bed_output_file, col_names = FALSE)
+write_tsv(gains_bed_status_df, gain_bed_output_file, col_names = FALSE)
+```
+
diff --git a/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.nb.html b/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.nb.html
diff --git a/analyses/focal-cn-file-preparation/03-prepare-cn-file.R b/analyses/focal-cn-file-preparation/03-prepare-cn-file.R
@@ -11,7 +11,7 @@
 # This script is intended to be run via the command line.
 # This example assumes it is being run from the root of the repository.
 #
-# Rscript --vanilla analyses/oncoprint-landscape/03-prepare-cn-file.R \
+# Rscript --vanilla analyses/focal-cn-file-preparation/03-prepare-cn-file.R \
 #   --cnv_file data/pbta-cnv-controlfreec.tsv.gz \
 #   --gtf_file data/gencode.v27.primary_assembly.annotation.gtf.gz \
 #   --metadata data/pbta-histologies.tsv \

diff --git a/analyses/focal-cn-file-preparation/run-prepare-cn.sh b/analyses/focal-cn-file-preparation/run-prepare-cn.sh
@@ -18,74 +18,105 @@ cd "$script_directory" || exit
 
 scratch_dir=../../scratch
 data_dir=../../data
+results_dir=../../analyses/focal-cn-file-preparation/results
 histologies_file=${data_dir}/pbta-histologies.tsv
 gtf_file=${data_dir}/gencode.v27.primary_assembly.annotation.gtf.gz
 goi_file=../../analyses/oncoprint-landscape/driver-lists/brain-goi-list-long.txt
 independent_specimens_file=${data_dir}/independent-specimens.wgswxs.primary.tsv
+ucsc_bed_file=${results_dir}/ucsc_cytoband.bed
+consensus_bed_file=${scratch_dir}/consensus_seg_with_status.tsv
+loss_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_losses.tsv
+gain_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_gains.tsv
+callable_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_callable.tsv
-consensus_bed_file=${scratch_dir}/consensus_seg_with_status.tsv
-loss_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_losses.tsv
-gain_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_gains.tsv
-callable_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_callable.tsv
+consensus_bed_file=${scratch_dir}/consensus_seg_with_status.bed
+loss_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_losses.bed
+gain_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_gains.bed
+callable_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_callable.bed
-consensus_bed_file=${scratch_dir}/consensus_seg_with_status.tsv
-loss_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_losses.tsv
-gain_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_gains.tsv
-callable_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_callable.tsv
+consensus_bed_file=${scratch_dir}/consensus_seg_with_status.bed
+loss_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_losses.bed
+gain_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_gains.bed
+callable_intersect_with_cytoband_file=${scratch_dir}/intersect_with_cytoband_callable.bed
 
 # Prep the consensus SEG file data
 Rscript --vanilla -e "rmarkdown::render('02-add-ploidy-consensus.Rmd', clean = TRUE)"
 
-# Run annotation step for consensus file
-Rscript --vanilla 03-prepare-cn-file.R \
-  --cnv_file ${scratch_dir}/consensus_seg_with_status.tsv \
-  --gtf_file $gtf_file \
-  --metadata $histologies_file \
-  --filename_lead "consensus_seg_annotated_cn" \
-  --seg
+# Download and save UCSC cytoband file as bed file
+wget -O ${scratch_dir}/ucsc_cytoband.bed http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz
+
+# Use bedtools coverage to compute the coverage of each UCSC cytoband by the
+# loss, gain, and all-segment `scratch/consensus_seg_with_status*.bed` files
+# prepared in `02-add-ploidy-consensus.Rmd`
 
-libraryStrategies=("polya" "stranded")
-chromosomesType=("autosomes" "x_and_y")
-for strategy in ${libraryStrategies[@]}; do
+bedtools coverage \
+    -a ${scratch_dir}/ucsc_cytoband.bed \
+    -b ${scratch_dir}/consensus_seg_with_status_losses.bed \
+    -f 0.75 \
-    -f 0.75 \
+    -sorted \
-    -f 0.75 \
+    -sorted \
+    > $loss_intersect_with_cytoband_file
 
-  for chromosome_type in ${chromosomesType[@]}; do
+bedtools coverage \
+    -a ${scratch_dir}/ucsc_cytoband.bed \
+    -b ${scratch_dir}/consensus_seg_with_status_gains.bed \
+    -f 0.75 \
+    > $gain_intersect_with_cytoband_file
 
-    Rscript --vanilla rna-expression-validation.R \
-      --annotated_cnv_file results/consensus_seg_annotated_cn_${chromosome_type}.tsv.gz \
-      --expression_file ${data_dir}/pbta-gene-expression-rsem-fpkm-collapsed.${strategy}.rds \
-      --independent_specimens_file $independent_specimens_file \
-      --metadata $histologies_file \
-      --goi_list $goi_file \
-      --filename_lead "consensus_seg_annotated_cn"_${chromosome_type}_${strategy}
-  done
-done
+bedtools coverage \
+    -a ${scratch_dir}/ucsc_cytoband.bed \
+    -b ${scratch_dir}/consensus_seg_with_status.bed \
+    -f 0.75 \
+    > $callable_intersect_with_cytoband_file
 
-# if we want to process the CNV data from the original callers
-# (e.g., CNVkit, ControlFreeC)
-if [ "$RUN_ORIGINAL" -gt "0" ]; then
-
-  # Prep the CNVkit data
-  Rscript --vanilla -e "rmarkdown::render('01-add-ploidy-cnvkit.Rmd', clean = TRUE)"
-
-  # Run annotation step for CNVkit
-  Rscript --vanilla 03-prepare-cn-file.R \
-    --cnv_file ${scratch_dir}/cnvkit_with_status.tsv \
-    --gtf_file $gtf_file \
-    --metadata $histologies_file \
-    --filename_lead "cnvkit_annotated_cn" \
-    --seg
-
-  # Run annotation step for ControlFreeC
-  Rscript --vanilla 03-prepare-cn-file.R \
-    --cnv_file ${data_dir}/pbta-cnv-controlfreec.tsv.gz \
-    --gtf_file $gtf_file \
-    --metadata $histologies_file \
-    --filename_lead "controlfreec_annotated_cn" \
-    --controlfreec
-
-  filenameLead=("cnvkit_annotated_cn" "controlfreec_annotated_cn")
-  for filename in ${filenameLead[@]}; do
-    for strategy in ${libraryStrategies[@]}; do
-      for chromosome_type in ${chromosomesType[@]}; do
-        Rscript --vanilla rna-expression-validation.R \
-          --annotated_cnv_file results/${filename}_${chromosome_type}.tsv.gz \
-          --expression_file ${data_dir}/pbta-gene-expression-rsem-fpkm-collapsed.${strategy}.rds \
-          --independent_specimens_file $independent_specimens_file \
-          --metadata $histologies_file \
-          --goi_list $goi_file \
-          --filename_lead ${filename}_${chromosome_type}_${strategy}
-      done
-    done
-  done
-
-fi
+# # Run annotation step for consensus file
+# Rscript --vanilla 03-prepare-cn-file.R \
+#   --cnv_file ${scratch_dir}/consensus_seg_with_status.tsv \
+#   --gtf_file $gtf_file \
+#   --metadata $histologies_file \
+#   --filename_lead "consensus_seg_annotated_cn" \
+#   --seg
+# 
+# libraryStrategies=("polya" "stranded")
+# chromosomesType=("autosomes" "x_and_y")
+# for strategy in ${libraryStrategies[@]}; do
+# 
+#   for chromosome_type in ${chromosomesType[@]}; do
+# 
+#     Rscript --vanilla rna-expression-validation.R \
+#       --annotated_cnv_file results/consensus_seg_annotated_cn_${chromosome_type}.tsv.gz \
+#       --expression_file ${data_dir}/pbta-gene-expression-rsem-fpkm-collapsed.${strategy}.rds \
+#       --independent_specimens_file $independent_specimens_file \
+#       --metadata $histologies_file \
+#       --goi_list $goi_file \
+#       --filename_lead "consensus_seg_annotated_cn"_${chromosome_type}_${strategy}
+#   done
+# done
+# 
+# # if we want to process the CNV data from the original callers
+# # (e.g., CNVkit, ControlFreeC)
+# if [ "$RUN_ORIGINAL" -gt "0" ]; then
+# 
+#   # Prep the CNVkit data
+#   Rscript --vanilla -e "rmarkdown::render('01-add-ploidy-cnvkit.Rmd', clean = TRUE)"
+# 
+#   # Run annotation step for CNVkit
+#   Rscript --vanilla 03-prepare-cn-file.R \
+#     --cnv_file ${scratch_dir}/cnvkit_with_status.tsv \
+#     --gtf_file $gtf_file \
+#     --metadata $histologies_file \
+#     --filename_lead "cnvkit_annotated_cn" \
+#     --seg
+# 
+#   # Run annotation step for ControlFreeC
+#   Rscript --vanilla 03-prepare-cn-file.R \
+#     --cnv_file ${data_dir}/pbta-cnv-controlfreec.tsv.gz \
+#     --gtf_file $gtf_file \
+#     --metadata $histologies_file \
+#     --filename_lead "controlfreec_annotated_cn" \
+#     --controlfreec
+# 
+#   filenameLead=("cnvkit_annotated_cn" "controlfreec_annotated_cn")
+#   for filename in ${filenameLead[@]}; do
+#     for strategy in ${libraryStrategies[@]}; do
+#       for chromosome_type in ${chromosomesType[@]}; do
+#         Rscript --vanilla rna-expression-validation.R \
+#           --annotated_cnv_file results/${filename}_${chromosome_type}.tsv.gz \
+#           --expression_file ${data_dir}/pbta-gene-expression-rsem-fpkm-collapsed.${strategy}.rds \
+#           --independent_specimens_file $independent_specimens_file \
+#           --metadata $histologies_file \
+#           --goi_list $goi_file \
+#           --filename_lead ${filename}_${chromosome_type}_${strategy}
+#       done
+#     done
+#   done
+# 
+# fi