Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Update CNV segment to gene mapping: support both formats, use GTF, etc. #253

Merged
merged 14 commits into from
Nov 9, 2019
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
---
title: "Add ploidy column, status to CNVkit output"
output: html_notebook
author: J. Taroni for ALSF CCDL
date: 2019
---

The `pbta-histologies.tsv` file contains a `tumor_ploidy` column, which is tumor ploidy as inferred by ControlFreeC.
The copy number information should be interpreted in the light of this information (see: [current version of Data Formats section of README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/390f1e08e481da5ec0b2c62d886d5fd298bbf017#data-formats)).

This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.

```{r}
library(dplyr)
```

### Read in data

```{r}
cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz")
cnvkit_df <- readr::read_tsv(cnvkit_file)
```

```{r}
histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv")
histologies_df <- readr::read_tsv(histologies_file)
```

### Add inferred ploidy information to CNVkit results

```{r}
add_ploidy_df <- histologies_df %>%
select(Kids_First_Biospecimen_ID, tumor_ploidy) %>%
inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>%
select(-tumor_ploidy, everything())
```

### Add status column

This is intended to mirror the information contained in the ControlFreeC output.

```{r}
add_ploidy_df <- add_ploidy_df %>%
mutate(status = case_when(
# when the copy number is less than inferred ploidy, mark this as a loss
copy.num < tumor_ploidy ~ "loss",
# if copy number is higher than ploidy, mark as a gain
copy.num > tumor_ploidy ~ "gain",
copy.num == tumor_ploidy ~ "neutral"
))

head(add_ploidy_df, 10)
```

### Write to `scratch`

```{r}
output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv")
readr::write_tsv(add_ploidy_df, output_file)
```
1,932 changes: 1,932 additions & 0 deletions analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html

Large diffs are not rendered by default.

170 changes: 121 additions & 49 deletions analyses/focal-cn-file-preparation/01-prepare-cn-file.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if (!("IRanges" %in% installed.packages())) {
BiocManager::install("IRanges", update = FALSE)
}

# Install annotatr
# Install annotatr
if (!("annotatr" %in% installed.packages())) {
BiocManager::install("annotatr", update = FALSE)
}
Expand All @@ -55,87 +55,159 @@ if (!("AnnotationDbi" %in% installed.packages())) {
# Declare command line options
option_list <- list(
optparse::make_option(
c("-f", "--seg_file"),
c("--cnv_file"),
type = "character",
default = NULL,
help = "file path to SEG file that contains cnv information"
help = "file path to file that contains CNV information"
),
optparse::make_option(
c("--gtf_file"),
type = "character",
default = NULL,
help = "file path to human genome GTF file"
),
optparse::make_option(
c("--filename_lead"),
type = "character",
default = "annotated_cn",
help = "used in file names"
),
optparse::make_option(
c("--controlfreec"),
type = "logical",
action = "store_true",
default = FALSE,
help = "flag used to indicate if the CNV file is the output of ControlFreeC"
),
optparse::make_option(
c("--cnvkit"),
type = "logical",
action = "store_true",
default = FALSE,
help = "flag used to indicate if the CNV file is the output of CNVkit"
)
)

# Read the arguments passed
opt_parser <- optparse::OptionParser(option_list = option_list)
opt <- optparse::parse_args(opt_parser)

# error handling related to specifying the CNV method
if (all(opt$controlfreec, opt$cnvkit)) {
stop("--controlfreec and --cnvkit are mutually exclusive")
}

if (!any(opt$controlfreec, opt$cnvkit)) {
stop("You must specify the CNV file format by using --controlfreec or
--cnvkit")
}

#### Directories and Files -----------------------------------------------------

# Detect the ".git" folder -- this will in the project root directory.
# Use this as the root directory to ensure proper sourcing of functions no
# matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set path to results directory
# Set path to results directory
results_dir <-
file.path(root_dir, "analyses", "focal-cn-file-preparation", "results")

if (!dir.exists(results_dir)) {
dir.create(results_dir)
}

seg_df <-
data.table::fread(
opt$seg_file,
data.table = FALSE,
stringsAsFactors = FALSE
)
#### Format CNV file and overlap with hg38 genome annotations ------------------

#### Format seg file and overlap with hg38 genome annotations ------------------
# we want to standardize the formats between the two methods here and drop
# columns we won't need to
if (opt$cnvkit) {
cnv_df <- readr::read_tsv(opt$cnv_file) %>%
dplyr::rename(chr = chrom, start = loc.start, end = loc.end,
copy_number = copy.num) %>%
dplyr::select(-num.mark, -seg.mean) %>%
dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
}

# Exclude the X and Y chromosomes, exclude `copy.num` == 2, and rearrange
# ID column to be the last column
seg_no_xy <- seg_df %>%
dplyr::filter(!(chrom %in% c("chrX", "chrY")), (copy.num != 2)) %>%
dplyr::select(-ID, dplyr::everything())
if (opt$controlfreec) {
# TODO: filter based on the p-values rather than just dropping them?
cnv_df <- readr::read_tsv(opt$cnv_file) %>%
dplyr::rename(copy_number = copy.number) %>%
dplyr::mutate(chr = paste0("chr", chr)) %>%
dplyr::select(-segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue,
-KolmogorovSmirnovPvalue) %>%
dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
}

# Make seg data.frame a GRanges object
seg_gr <- seg_no_xy %>%
# Remove neutral copy number calls and mark possible amplifcation
cnv_df <- cnv_df %>%
dplyr::filter(status != "neutral") %>%
dplyr::mutate(status = dplyr::case_when(
copy_number > (2 * tumor_ploidy) ~ "amplification",
TRUE ~ status
))

# Addressing autosomes first
# Exclude the X and Y chromosomes
cnv_no_xy <- cnv_df %>%
dplyr::filter(!(chr %in% c("chrX", "chrY")))

# Make seg data.frame a GRanges object
cnv_no_xy_gr <- cnv_no_xy %>%
GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE,
starts.in.df.are.0based = FALSE)

# Define the annotations for the hg38 genome
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
genes <- GenomicFeatures::genes(txdb)

# Create a data.frame with the overlaps between the seg file and hg38 genome
# annotations
overlaps <- IRanges::mergeByOverlaps(seg_gr, genes)

overlapSymbols <-
AnnotationDbi::mapIds(
org.Hs.eg.db::org.Hs.eg.db,
keys = overlaps$gene_id,
column = "SYMBOL",
keytype = "ENTREZID"
)
txdb <- GenomicFeatures::makeTxDbFromGFF(
file = opt$gtf_file,
format = "gtf"
)

# we'll only look at genes on chromosomes 1:22 -- this will get rid of things
# like alternates
chroms <- paste0("chr", 1:22)
chrom_filter <- list(tx_chrom = chroms)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With GenomicFeatures::exons(), this should not be necessary, as no exon is mapped to multiple/alternate chromosomes. I don't know if any of the calls fall on non-canonical chromosomes, but we might not want to exclude them at this point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My thought was it might be more efficient if we drop anything outside this filter, but I have no evidence whatsoever to suggest that I am right about that. I will take it out.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay - this step in CI takes much longer (granted I made other changes), but I'm thinking of implementing a filter for CI only per https://github.com/AlexsLemonade/OpenPBTA-analysis#passing-variables-only-in-ci.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To close the loop - that wasn't the issue and I was looking at the wrong branch 🙃 I will leave in the filtering changes though


# extract the genes using the chromosome filter
chr_exons <- GenomicFeatures::exons(txdb,
filter = chrom_filter,
columns = "gene_id")

# Create a data.frame with the overlaps between the seg file and hg38 genome
# annotations
overlaps <- IRanges::mergeByOverlaps(cnv_no_xy_gr, chr_exons)

overlaps$gene_id <- overlapSymbols
# remove what we no longer need from the environment
rm(chr_exons, chroms, chrom_filter, cnv_no_xy_gr, cnv_no_xy)

# Create a data.frame with selected columns from the `overlaps` object
cn_short <- data.frame(
gene_symbol = overlaps$gene_id,
biospecimen_id = overlaps$ID,
copy_number = overlaps$copy.num) %>%
dplyr::filter(!(is.na(gene_symbol)))

# Create a `label` column with values specifying the type of CN aberration and
# save as a data.frame with `gene_symbol`, `biospecimen_id`, and `copy_number`
annotated_cn <- cn_short %>%
annotated_cn <- data.frame(ensembl = unlist(overlaps$gene_id),
biospecimen_id = overlaps$Kids_First_Biospecimen_ID,
status = overlaps$status,
copy_number = overlaps$copy_number,
ploidy = overlaps$tumor_ploidy) %>%
dplyr::distinct() %>%
dplyr::mutate(label = dplyr::case_when(
copy_number == 3 | copy_number == 4 ~ "Gain",
copy_number == 0 ~ "Hom_Deletion",
copy_number == 1 ~ "Hem_Deletion",
copy_number >= 5 ~ "Amplification")) %>%
dplyr::select(gene_symbol, biospecimen_id, label, copy_number)
# Discard the gene version information in order to get gene symbols and
# cytoband mappings
dplyr::mutate(ensembl = gsub("\\..*", "", ensembl))

annotated_cn <- annotated_cn %>%
dplyr::mutate(
gene_symbol = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
keys = ensembl,
column = "SYMBOL",
keytype = "ENSEMBL"),
cytoband = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
keys = ensembl,
column = "MAP",
keytype = "ENSEMBL")
) %>%
dplyr::filter(!is.na(gene_symbol)) %>%
# put the ensembl identifiers last
dplyr::select(-ensembl, dplyr::everything())

# Output file name
output_file <- paste0(opt$filename_lead, "_autosomes.tsv")

# Save final data.frame to a tsv file
readr::write_tsv(annotated_cn, file.path(results_dir, "annotated_cn.tsv"))
readr::write_tsv(annotated_cn, file.path(results_dir, output_file))
Binary file not shown.
Binary file not shown.
21 changes: 20 additions & 1 deletion analyses/focal-cn-file-preparation/run-prepare-cn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,24 @@ script_directory="$(perl -e 'use File::Basename;
print dirname(abs_path(@ARGV[0]));' -- "$0")"
cd "$script_directory" || exit

# Prep the CNVkit data
Rscript --vanilla -e "rmarkdown::render('00-add-ploidy-cnvkit.Rmd', clean = TRUE)"

# Run annotation step for CNVkit
Rscript --vanilla 01-prepare-cn-file.R \
--cnv_file ../../scratch/cnvkit_with_status.tsv \
# TODO: update once this is included with download
--gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
--filename_lead "cnvkit_annotated_cn" \
--cnvkit

# Run annotation step for ControlFreeC
Rscript --vanilla 01-prepare-cn-file.R \
--seg_file ../../data/pbta-cnv-cnvkit.seg.gz
--cnv_file ../../data/pbta-cnv-controlfreec.tsv.gz \
# TODO: update once this is included with download
--gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
--filename_lead "controlfreec_annotated_cn" \
--controlfreec

# gzip everything in the results folder
gzip -r results