-
Notifications
You must be signed in to change notification settings - Fork 67
Addition of R script to compare CNV caller output #142
Changes from 6 commits
94fd9d1
486d6aa
2e132c8
6493375
d00858d
7f5761f
80fe382
c2b2721
7955075
e3c00a0
d913351
8816983
ea25caa
6222c95
f9f0d76
3cf5643
67869be
cf889d5
ce84bd2
a5624f8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
# Plot and compare detected CNV aberrations given CNVkit and Control-FREEC | ||
# output | ||
# | ||
# Chante Bethell for CCDL 2019 | ||
# | ||
# Usage: | ||
# This script is intended to be run via the command line from the top directory | ||
# of the repository as follows: | ||
# | ||
# Rscript analyses/cnv-comparison/01-cnv-comparison-plotting.R | ||
# | ||
|
||
#### Install packages ---------------------------------------------------------- | ||
if (!("GenVisR" %in% installed.packages())) { | ||
install.packages("BiocManager") | ||
BiocManager::install("GenVisR") | ||
} | ||
|
||
if (!("cowplot" %in% installed.packages())) { | ||
install.packages("BiocManager") | ||
BiocManager::install("cowplot") | ||
} | ||
|
||
##### Set up functions --------------------------------------------------------- | ||
|
||
# Magrittr pipe | ||
`%>%` <- dplyr::`%>%` | ||
|
||
# Detect the ".git" folder -- this will in the project root directory. | ||
# Use this as the root directory to ensure proper sourcing of functions no | ||
# matter where this is called from | ||
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) | ||
|
||
# Source custom functions script | ||
source(file.path(root_dir, "analyses", "cnv-comparison", "util", | ||
"cnv-comparison-functions.R")) | ||
|
||
#### Set up file paths --------------------------------------------------------- | ||
|
||
input_directory <- file.path(root_dir, "data") | ||
|
||
# Create the output directory if it does not exist | ||
output_directory <- file.path(root_dir, "analyses", "cnv-comparison", "plots") | ||
|
||
if (!dir.exists(output_directory)) { | ||
dir.create(output_directory, recursive = TRUE) | ||
} | ||
|
||
#### Read in data --------------------- ---------------------------------------- | ||
|
||
# Read in metadata | ||
metadata <- | ||
readr::read_tsv(file.path(input_directory, "pbta-histologies.tsv")) | ||
|
||
# Read in cnvkit data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since, you've made your functions, you could put all your datasets in a list and then call the function from there so you only have to say it once.
Now for the rest of your functions, you would only have to run it in one line:
Let me know if this makes sense, if not I can expound on this idea. |
||
cnvkit <- read_in_cnv(input_directory, "pbta-cnv-cnvkit.seg.gz") | ||
|
||
# Read in cnvkit subset data | ||
cnvkit_subset <- | ||
read_in_cnv(input_directory, "testing/pbta-cnv-cnvkit.seg.gz") | ||
|
||
# Read in controlfreec data | ||
controlfreec <- | ||
read_in_cnv(input_directory, "pbta-cnv-controlfreec.seg.gz") | ||
|
||
# Read in controlfreec subset data | ||
controlfreec_subset <- | ||
read_in_cnv(input_directory, "testing/pbta-cnv-controlfreec.seg.gz") | ||
|
||
#### Filter data --------------------------------------------------------------- | ||
|
||
# Filter the cnvkit data by cutoff segmean in preparation for plotting | ||
cnvkit_format <- filter_segmean(cnvkit) | ||
|
||
# Filter the cnvkit subset data by cutoff segmean in preparation for plotting | ||
cnvkit_subset <- filter_segmean(cnvkit_subset) | ||
|
||
# Filter the controlfreec data by cutoff segmean in preparation for plotting | ||
controlfreec_format <- filter_segmean(controlfreec) | ||
|
||
# Filter the controlfreec subset data for cutoff in preparation for plotting | ||
controlfreec_subset <- filter_segmean(controlfreec_subset) | ||
|
||
#### Plot frequency and proportion using GenVisR ------------------------------- | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could do the same lapply strategy here. But then twice, once for |
||
|
||
# Run `plot_cnFreq` for cnvkit (has to be run with subset because | ||
# original dataset is too large for function) | ||
cnvkit_prop_plot <- | ||
plot_cnFreq(cnvkit_format, "proportion", "CNVkit proportion") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you are always planning on running both proportion and frequency, you could make an argument in your function that runs both automatically so you don't have to repeat yourself. |
||
cnvkit_freq_plot <- | ||
plot_cnFreq(cnvkit_format, "frequency", "CNVkit frequency") | ||
|
||
# Run `plot_cnFreq` for controlfreec | ||
controlfreec_prop_plot <- | ||
plot_cnFreq(controlfreec_format, | ||
"proportion", | ||
"Control-FREEC proportion") | ||
controlfreec_freq_plot <- | ||
plot_cnFreq(controlfreec_format, | ||
"frequency", | ||
"Control-FREEC frequency") | ||
|
||
# Plot cowplot of frequency plots and save | ||
plot_cowplot( | ||
cnvkit_freq_plot, | ||
controlfreec_freq_plot, | ||
output_directory, | ||
"compare_cnv_output_frequency.pdf" | ||
) | ||
|
||
# Plot cowplot of proportion plots and save | ||
plot_cowplot( | ||
cnvkit_prop_plot, | ||
controlfreec_prop_plot, | ||
output_directory, | ||
"compare_cnv_output_proportion.pdf" | ||
) | ||
|
||
#### Plot boxplots using ggplot2 ----------------------------------------------- | ||
|
||
# Run `plot_boxplot` on cnvkit | ||
cnvkit_boxplot <- plot_boxplot(cnvkit_format, "CNVkit boxplot") | ||
|
||
# Run `plot_boxplot` on controlfreec | ||
controlfreec_boxplot <- | ||
plot_boxplot(controlfreec_format, "Control-FREEC boxplot") | ||
|
||
# Save the plot combining the cnvkit and controlfreec boxplots | ||
plot_cowplot( | ||
cnvkit_boxplot, | ||
controlfreec_boxplot, | ||
output_directory, | ||
"compare_cnv_output_boxplot.pdf" | ||
) | ||
|
||
##### Plot barplots using ggplot2 ---------------------------------------------- | ||
|
||
# Run `plot_histology_barplot` on cnvkit | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This section could also be |
||
cnvkit_annotated_barplot_histology <- | ||
plot_histology_barplot(cnvkit_format, metadata, "CNVkit") | ||
|
||
# Run `plot_histology_barplot` on controlfreec | ||
controlfreec_annotated_barplot_histology <- | ||
plot_histology_barplot(controlfreec_format, | ||
metadata, | ||
"Control-FREEC") | ||
|
||
# Run `plot_aberration_barplot` on cnvkit | ||
cnvkit_annotated_barplot_aberration <- | ||
plot_aberration_barplot(cnvkit_format, "CNVkit") | ||
|
||
# Run `plot_abberration_barplot` on controlfreec | ||
controlfreec_annotated_barplot_aberration <- | ||
plot_aberration_barplot(controlfreec_format, | ||
"Control-FREEC") | ||
|
||
# Save the plot combining the cnvkit and controlfreec barplots | ||
plot_cowplot( | ||
cnvkit_annotated_barplot_histology, | ||
controlfreec_annotated_barplot_histology, | ||
output_directory, | ||
"compare_cnv_output_barplot_histology.pdf" | ||
) | ||
plot_cowplot( | ||
cnvkit_annotated_barplot_aberration, | ||
controlfreec_annotated_barplot_aberration, | ||
output_directory, | ||
"compare_cnv_output_barplot_aberration.pdf" | ||
) |
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,200 @@ | ||||||||
# This script defines filtering and plotting functions to be sourced in | ||||||||
# `01-cnv-comparison-plotting.R` | ||||||||
# | ||||||||
# Chante Bethell for CCDL 2019 | ||||||||
# | ||||||||
# Usage: | ||||||||
# This script is intended to be run via the command line from the top directory | ||||||||
# of the repository as follows: | ||||||||
# | ||||||||
# Rscript analyses/cnv-comparison/util/cnv-comparison-functions.R | ||||||||
|
||||||||
read_in_cnv <- function(input_directory, file_path){ | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the motivation for this being its own function rather than just using these steps as is? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not saying it shouldn't be a function necessarily, just wanna know the motivation. Perhaps as I read the rest of this PR it will make more sense to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I made this its own function due to the repetitive nature of the original code I had reading in the original and subset CNV files. In other words, this was my attempt to DRY it up a bit (which I believe can now be achieved when I implement your suggestion of putting the datasets in a list and using lapply to execute this function and subsequent functions). |
||||||||
# Given the file path of the CNV data, read in the data. | ||||||||
# | ||||||||
# Arg: | ||||||||
# input_directory: file path of input directory | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason to designate this separately than the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, I do see how this can be confusing. I am sure this was better suited for my original, more complicated script, but making it one |
||||||||
# file_path: file path of input data | ||||||||
# | ||||||||
# Return: | ||||||||
# dataframe: the data frame containing the input data | ||||||||
|
||||||||
# Read in cnv data | ||||||||
dataframe <- | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is for sure CNV data, can you name this |
||||||||
read.table(gzfile(file.path(input_directory, file_path)), header = TRUE) | ||||||||
|
||||||||
# Rename the columns | ||||||||
dataframe <- | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
IDK if this is really all that better but at least you don't have to say There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is better from a memory standpoint. No need to copy the data frame. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I now realize that my comment above this line was misleading as this was meant to rearrange the columns, rather than simply rename them. That being said, the function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, do |
||||||||
dataframe[c("chrom", "loc.start", "loc.end", "ID", "num.mark", "seg.mean")] | ||||||||
|
||||||||
return(dataframe) | ||||||||
|
||||||||
} | ||||||||
|
||||||||
filter_segmean <- function(dataframe){ | ||||||||
# Given the data.frame containing CNV caller output, return a data.frame with | ||||||||
# a column with the values 0 and 1 to represent loss and gain, respectively. | ||||||||
# | ||||||||
# Args: | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding the segmean cutoff as its own argument option would be an easy way to make this function a little more versatile. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this idea. |
||||||||
# dataframe: data.frame containing CNV caller output | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm going to have the same comment here as above. |
||||||||
# | ||||||||
# Return: | ||||||||
# cnv_matrix: the data.frame given, returned with a column containing an | ||||||||
# aberration label for loss or gain | ||||||||
|
||||||||
# Rename columns for GenVisR function downstream | ||||||||
colnames(dataframe) <- | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh here you used |
||||||||
c("chromosome", | ||||||||
"start", | ||||||||
"end", | ||||||||
"sample", | ||||||||
"probes", | ||||||||
"segmean") | ||||||||
|
||||||||
cnv_matrix <- dataframe %>% | ||||||||
dplyr::mutate(aberration = "NA") %>% | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you need this step? See below: |
||||||||
dplyr::mutate(aberration = dplyr::case_when(segmean < -0.5 ~ 0, | ||||||||
segmean > 0.5 ~ 1)) %>% | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I believe that case_when takes what ever is not covered by the categories you give it and if you give it something straight as the last thing it will assign that to whatever is left. Test this out though. Also may be that you need it to be |
||||||||
dplyr::filter(!is.na(aberration)) | ||||||||
|
||||||||
return(cnv_matrix) | ||||||||
|
||||||||
} | ||||||||
|
||||||||
plot_cnFreq <- | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
To stick with code style. |
||||||||
function(filtered_dataframe, | ||||||||
plot_type, | ||||||||
plot_title) { | ||||||||
# Given the data.frame filtered for size of aberrations, plot the proportion | ||||||||
# or frequency (denoted by the plot_type argument) of aberrations across | ||||||||
# chromosomes with `GenVisR::cnFreq` | ||||||||
# | ||||||||
# Args: | ||||||||
# filtered_dataframe: data.frame filtered for cutoff size of aberrations | ||||||||
# plot_type: the type of plot (proportion or frequency) | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
# plot_title: a title string for the plot produced | ||||||||
# | ||||||||
# Return: | ||||||||
# aberration_plot: plot depicting the aberrations detected across | ||||||||
# chromosomes | ||||||||
|
||||||||
aberration_plot <- GenVisR::cnFreq( | ||||||||
filtered_dataframe, | ||||||||
genome = "hg38", | ||||||||
CN_low_cutoff = 0, | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would probably be nice to have these cutoffs as arguments so this function has a little more flexibility. |
||||||||
CN_high_cutoff = .2, | ||||||||
plot_title = toupper(plot_title), | ||||||||
plotType = plot_type | ||||||||
) | ||||||||
|
||||||||
return(aberration_plot) | ||||||||
|
||||||||
} | ||||||||
|
||||||||
plot_boxplot <- function(dataframe, plot_title) { | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there particular restrictions for this being a boxplot or could this data be better illustrated by a jitter or violin plot? I have not seen the data yet so there may be a reason why it needs to be a boxplot but I'd say generally violin or jitter plots are more informative. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment here, as above. |
||||||||
# Given the data.frame filtered for size of aberrations, plot the proportion | ||||||||
# or frequency (denoted by the plot_type argument) of aberrations across | ||||||||
# chromosomes with `geom_boxplot` | ||||||||
# | ||||||||
# Args: | ||||||||
# dataframe: data.frame filtered for cutoff size of aberrations | ||||||||
# plot_title: a title string for the plot produced | ||||||||
# | ||||||||
# Return: | ||||||||
# boxplot: boxplot depicting the log2 transformed sizes of aberrations | ||||||||
# detected across chromosomes | ||||||||
|
||||||||
# Create boxplot where the y-axis represents the log2 transformed segmean | ||||||||
boxplot <- ggplot2::ggplot(dataframe, | ||||||||
ggplot2::aes(x = chromosome, | ||||||||
y = (log2(segmean) + 1))) + | ||||||||
ggplot2::geom_boxplot() + | ||||||||
ggplot2::theme_bw() + | ||||||||
ggplot2::ylab("Log transformed segmean values") + | ||||||||
ggplot2::ggtitle(toupper(plot_title)) | ||||||||
|
||||||||
return(boxplot) | ||||||||
|
||||||||
} | ||||||||
|
||||||||
plot_histology_barplot <- function(dataframe, metadata, plot_title) { | ||||||||
# Given the data.frame filtered by cutoff segmean value, plot the proportion | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
# of aberrations across chromosomes with `geom_barplot` | ||||||||
# | ||||||||
# Args: | ||||||||
# dataframe: data.frame filtered for cutoff size of aberrations and | ||||||||
# joined with the metadata | ||||||||
# metadata: the relevant metadata data.frame | ||||||||
# plot_title: a title string for the plot produced | ||||||||
# | ||||||||
# Return: | ||||||||
# barplot: barplot depicting the proportion of aberrations detected across | ||||||||
# chromosomes, annoated by the broad_histology variable in the | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
# metadata | ||||||||
|
||||||||
# Create a data.frame with the filtered dataframe joined with the metadata | ||||||||
meta_joined <- dataframe %>% | ||||||||
dplyr::inner_join(metadata, by = c("sample" = "Kids_First_Biospecimen_ID")) | ||||||||
|
||||||||
# Create barplot where the y-axis represents the size of aberration | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On second look, I am pretty confused by these plots. What is actually being plotted here? I had thought it was the number of each type in each chromosome, but the On some investigation, it looks like these ultimately produce the same plots as the ones that I was expecting, but if you leave off the |
||||||||
barplot <- ggplot2::ggplot(meta_joined, | ||||||||
ggplot2::aes(x = chromosome, | ||||||||
y = broad_histology, | ||||||||
fill = broad_histology)) + | ||||||||
ggplot2::geom_bar(stat = "identity") + | ||||||||
ggplot2::theme_bw() + | ||||||||
ggplot2::theme(axis.text.y = ggplot2::element_blank()) + | ||||||||
ggplot2::ggtitle(toupper(plot_title)) | ||||||||
|
||||||||
return(barplot) | ||||||||
|
||||||||
} | ||||||||
|
||||||||
plot_aberration_barplot <- function(dataframe, plot_title) { | ||||||||
# Given the data.frame filtered by cutoff segmean value, plot the proportion | ||||||||
# of aberrations across chromosomes with `geom_barplot` | ||||||||
# | ||||||||
# Args: | ||||||||
# dataframe: data.frame filtered by cutoff segmean value | ||||||||
# plot_title: a title string for the plot produced | ||||||||
# | ||||||||
# Return: | ||||||||
# barplot: barplot depicting the proportion of aberrations detected across | ||||||||
# chromosomes | ||||||||
|
||||||||
|
||||||||
# Create barplot where the y-axis represents the size of aberration | ||||||||
barplot <- ggplot2::ggplot(dataframe, | ||||||||
ggplot2::aes(x = chromosome, | ||||||||
y = as.factor(aberration), | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See earlier comment about the y axis/stat |
||||||||
fill = as.factor(aberration))) + | ||||||||
ggplot2::geom_bar(stat = "identity") + | ||||||||
ggplot2::theme_bw() + | ||||||||
ggplot2::ylab("Proportion of Aberrations") + | ||||||||
ggplot2::labs(fill = "Loss(0)/Gain(1)") + | ||||||||
ggplot2::ggtitle(toupper(plot_title)) | ||||||||
|
||||||||
return(barplot) | ||||||||
|
||||||||
} | ||||||||
|
||||||||
plot_cowplot <- function(plot_a, plot_b, output_path, plot_name){ | ||||||||
# Given two plots, create a combined cowplot and save as a PDF in the | ||||||||
# specified directory | ||||||||
# Args: | ||||||||
# plot_a: the first plot, which will be positioned on top | ||||||||
# plot_b: the second plot, which will be positioned below the first | ||||||||
# output_path: the file.path to the output directory | ||||||||
# plot_name: the name the plot should be saved as | ||||||||
|
||||||||
# Save a combined cowplot plot of the cnvkit and controlfreec plots | ||||||||
grid <- cowplot::plot_grid(plot_a, plot_b, ncol = 1) | ||||||||
|
||||||||
cowplot::save_plot( | ||||||||
file.path(output_path, plot_name), | ||||||||
grid, | ||||||||
base_height = 12, | ||||||||
base_width = 30 | ||||||||
) | ||||||||
|
||||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason not to install this from cran?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good question, I cannot pinpoint my initial motivation here so I can change this to install from cran.