From cbd7fca18a6fdb812d6b5133c7aeb786b848a46a Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni Date: Tue, 28 Jan 2020 08:38:26 -0500 Subject: [PATCH] Back to exons; before GISTIC Pre: #452 --- .../01-prepare-cn-file.R | 256 +++++------------- 1 file changed, 71 insertions(+), 185 deletions(-) diff --git a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R index 5923067d9e..58619e3e4c 100644 --- a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R +++ b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R @@ -53,17 +53,17 @@ if (!("AnnotationDbi" %in% installed.packages())) { #### Function ------------------------------------------------------------------ process_annotate_overlaps <- function(cnv_df, - cds_gr, + txdb_exons, filt_na_symbol = TRUE) { # This function takes a standardized data.frame that contains genomic range - # information and finds the overlaps with a CDS GRanges object. It will - # discard Ensembl gene IDs with no gene symbol by default. + # information and finds the overlaps with a TxDb object. It will discard + # Ensembl gene IDs with no gene symbol by default. # # Args: # cnv_df: standardized data.frame that contains the segments used in the # CNV caller - # cds_gr: `GenomicRanges` object containing coding sequences to be merged - # with the CNV data.frame; output of `run-prepare-cn.sh` + # txdb_exons: exons to be merged with the CNV data.frame; output o + # GenomicFeatures::exons # filt_na_symbol: logical, if TRUE, rows without gene symbols will be # removed; default is TRUE # @@ -74,35 +74,24 @@ process_annotate_overlaps <- function(cnv_df, # Make CNV data.frame a GRanges object cnv_gr <- cnv_df %>% - GenomicRanges::makeGRangesFromDataFrame( - keep.extra.columns = TRUE, - starts.in.df.are.0based = FALSE - ) - - # Find the overlaps between the CNV file and GENCODE v27 filtered hg38 - # annotations file (converted to bed file in `run-prepare-cn.sh`). - overlaps <- GenomicRanges::findOverlaps(cnv_gr, cds_gr) - - # Carry over gene ids to `cnv_gr` object - cnv_gr$gene_id <- rep(NA, length(cnv_gr)) - - cnv_gr$gene_id[overlaps@from] <- cds_gr$gene_id[overlaps@to] + GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE, + starts.in.df.are.0based = FALSE) - # Find the overlaps between the CNV file and UCSC cytoband data - overlaps_ucsc <- GenomicRanges::findOverlaps(cnv_gr, ucsc_cytoband_gr) + # Create a data.frame with the overlaps between the CNV file and hg38 genome + # annotations + overlaps <- IRanges::mergeByOverlaps(cnv_gr, tx_exons) - # Create a data.frame with selected columns from the `overlaps` + # Create a data.frame with selected columns from the `autosome_overlaps` # object annotated_cn <- data.frame( - biospecimen_id = cnv_gr$Kids_First_Biospecimen_ID[overlaps_ucsc@from], - status = cnv_gr$status[overlaps_ucsc@from], - copy_number = cnv_gr$copy_number[overlaps_ucsc@from], - ploidy = cnv_gr$tumor_ploidy[overlaps_ucsc@from], - ensembl = cnv_gr$gene_id[overlaps_ucsc@from], - cytoband = ucsc_cytoband_gr@elementMetadata@listData$cytoband[overlaps_ucsc@to], + biospecimen_id = overlaps$Kids_First_Biospecimen_ID, + status = overlaps$status, + copy_number = overlaps$copy_number, + ploidy = overlaps$tumor_ploidy, + ensembl = unlist(overlaps$gene_id), stringsAsFactors = FALSE ) %>% - dplyr::filter(!(is.na(ensembl))) %>% + dplyr::distinct() %>% # Discard the gene version information in order to get gene symbols and # cytoband mappings dplyr::mutate(ensembl = gsub("\\..*", "", ensembl)) @@ -110,13 +99,14 @@ process_annotate_overlaps <- function(cnv_df, annotated_cn <- annotated_cn %>% dplyr::mutate( gene_symbol = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, - keys = ensembl, - column = "SYMBOL", - keytype = "ENSEMBL" - ) - ) %>% - dplyr::select(-cytoband) %>% - dplyr::distinct() + keys = ensembl, + column = "SYMBOL", + keytype = "ENSEMBL"), + cytoband = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, + keys = ensembl, + column = "MAP", + keytype = "ENSEMBL") + ) # drop rows that have NA values for the gene symbol if (filt_na_symbol) { @@ -138,10 +128,10 @@ option_list <- list( help = "file path to file that contains CNV information" ), optparse::make_option( - c("--cds_file"), + c("--gtf_file"), type = "character", default = NULL, - help = "file path to human genome cds filtered GTF file" + help = "file path to human genome GTF file" ), optparse::make_option( c("--metadata"), @@ -169,13 +159,6 @@ option_list <- list( default = FALSE, help = "flag used to indicate if the CNV file is the output of CNVkit" ), - optparse::make_option( - c("--gistic"), - type = "logical", - action = "store_true", - default = FALSE, - help = "flag used to indicate if the GISTIC file should be incorporated" - ), optparse::make_option( c("--xy"), type = "integer", @@ -217,72 +200,14 @@ if (!dir.exists(results_dir)) { dir.create(results_dir) } -if (opt$gistic) { - # Path to gistic zip file results - gistic_zip <- - file.path(root_dir, "data", "pbta-cnv-cnvkit-gistic.zip") - - # Get the name of the folder first since this changes with updates to the date - folder_name <- unzip(zipfile = gistic_zip, list = TRUE) %>% - dplyr::filter(Length == 0) %>% - dplyr::pull("Name") - - # Path to gistic results directory - gistic_results <- - file.path(root_dir, - "analyses", - "focal-cn-file-preparation", - "gistic-results") - - # Extract files if it hasn't been done yet - if (!dir.exists(gistic_results)) { - # Unzip but only extract the files not the folder - unzip(zipfile = gistic_zip, - exdir = gistic_results) - } - # Designate GISTIC results folder to extract to - gistic_results <- file.path(gistic_results, folder_name) - - # Read in broad values by arm GISTIC output file - gistic_df <- - data.table::fread( - paste0( - gistic_results, - "broad_values_by_arm.txt" - ), - data.table = FALSE - ) -} - -# Read in UCSC cytoband data. The decision to implement the UCSC hg38 cytoband -# file was made based on a comparison done between the cytoband calls in the -# `org.Hs.eg.db` package and the calls in the UCSC file. We found that they -# disagreed in ~11,800 calls out of ~800,000 and the `UCSC file` contains more -# cytoband calls. -ucsc_cytoband <- - data.table::fread( - "http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz" - ) - -# Make the UCSC cytoband data.frame a GRanges object -ucsc_cytoband_gr <- ucsc_cytoband %>% - dplyr::select(chr = V1, start = V2, end = V3, cytoband = V4) %>% - dplyr::mutate(cytoband = paste0(gsub("chr", "", chr), cytoband)) %>% - GenomicRanges::makeGRangesFromDataFrame( - keep.extra.columns = TRUE, - starts.in.df.are.0based = FALSE - ) - #### Format CNV file and overlap with hg38 genome annotations ------------------ -# We want to standardize the formats between the two methods here and drop +# we want to standardize the formats between the two methods here and drop # columns we won't need to if (opt$cnvkit) { cnv_df <- readr::read_tsv(opt$cnv_file) %>% - dplyr::rename( - chr = chrom, start = loc.start, end = loc.end, - copy_number = copy.num - ) %>% + dplyr::rename(chr = chrom, start = loc.start, end = loc.end, + copy_number = copy.num) %>% dplyr::select(-num.mark, -seg.mean) %>% dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything()) } @@ -292,10 +217,8 @@ if (opt$controlfreec) { cnv_df <- readr::read_tsv(opt$cnv_file) %>% dplyr::rename(copy_number = copy.number) %>% dplyr::mutate(chr = paste0("chr", chr)) %>% - dplyr::select( - -segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue, - -KolmogorovSmirnovPvalue - ) %>% + dplyr::select(-segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue, + -KolmogorovSmirnovPvalue) %>% dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything()) } @@ -305,21 +228,32 @@ histologies_df <- readr::read_tsv(opt$metadata) #### Annotation file ----------------------------------------------------------- -# Read in prepared cds file -cds_df <- data.table::fread(opt$cds_file, data.table = FALSE) - -# Create the GRanges object -cds_gr <- cds_df %>% - dplyr::select( - seqnames = V1, - start = V2, - end = V3, - gene_id = V4 - ) %>% - GenomicRanges::makeGRangesFromDataFrame( - keep.extra.columns = TRUE, - starts.in.df.are.0based = FALSE +# this is the output of GenomicFeatures::makeTxDbFromGFF +# TODO: possibly update this when the GTF file gets included in the data +# download; may also remove the --gtf_file option and hardcode it? +annotation_directory <- file.path(root_dir, + "analyses", + "focal-cn-file-preparation", + "annotation_files") +annotation_file <- file.path(annotation_directory, + "txdb_from_gencode.v27.gtf.db") + +if (!file.exists(annotation_file)) { + # Define the annotations for the hg38 genome + txdb <- GenomicFeatures::makeTxDbFromGFF( + file = opt$gtf_file, + format = "gtf" ) + # can do this even if the directory exists + dir.create(annotation_directory, showWarnings = FALSE) + # write this to file to save time next time + AnnotationDbi::saveDb(txdb, annotation_file) +} else { + txdb <- AnnotationDbi::loadDb(annotation_file) +} + +# extract the exons but include ensembl gene identifiers +tx_exons <- GenomicFeatures::exons(txdb, columns = "gene_id") #### Addressing autosomes first ------------------------------------------------ @@ -328,52 +262,20 @@ cnv_no_xy <- cnv_df %>% dplyr::filter(!(chr %in% c("chrX", "chrY")), status != "neutral") # Merge and annotated no X&Y -autosome_annotated_cn <- process_annotate_overlaps( - cnv_df = cnv_no_xy, - cds_gr = cds_gr -) %>% +autosome_annotated_cn <- process_annotate_overlaps(cnv_df = cnv_no_xy, + txdb_exons = tx_exons) %>% # mark possible amplifications in autosomes dplyr::mutate(status = dplyr::case_when( copy_number > (2 * ploidy) ~ "amplification", TRUE ~ status )) -#### Wrangle GISTIC data for autosome data.frame------------------------------- - -if (opt$gistic) { - # Transpose the GISTIC data.frame - transposed_gistic_df <- gistic_df %>% - tibble::column_to_rownames("Chromosome Arm") %>% - t() %>% - as.data.frame() %>% - tibble::rownames_to_column("biospecimen_id") %>% - # Gather the chromosome arm data into one column and the GISTIC call in another - tidyr::gather(key = "arm", value = "gistic_call", -biospecimen_id) %>% - # Filter the GISTIC data to only the overlapping samples - dplyr::filter(biospecimen_id %in% autosome_annotated_cn$biospecimen_id) %>% - # Recode the gistic - dplyr::mutate(broad_status = dplyr::case_when( - gistic_call == -1 ~ "loss", - gistic_call == 0 ~ "neutral", - gistic_call == 1 ~ "gain", - TRUE ~ "amplification" - )) %>% - dplyr::select(-gistic_call) - - # Add GISTIC data to final data.frame and remove sex chromosomes - autosome_annotated_cn <- transposed_gistic_df %>% - dplyr::filter(!(arm %in% c("Xp", "Xq", "Yp", "Yq"))) %>% - dplyr::inner_join(autosome_annotated_cn, by = "biospecimen_id") -} - # Output file name -autosome_output_file <- paste0(opt$filename_lead, "_autosomes.tsv.bz2") +autosome_output_file <- paste0(opt$filename_lead, "_autosomes.tsv.gz") # Save final data.frame to a tsv file -readr::write_tsv( - autosome_annotated_cn, - file.path(results_dir, autosome_output_file) -) +readr::write_tsv(autosome_annotated_cn, + file.path(results_dir, autosome_output_file)) #### X&Y ----------------------------------------------------------------------- @@ -383,37 +285,21 @@ if (xy_flag) { dplyr::filter(chr %in% c("chrX", "chrY")) # Merge and annotated no X&Y - sex_chrom_annotated_cn <- process_annotate_overlaps( - cnv_df = cnv_sex_chrom, - cds_gr = cds_gr - ) + sex_chrom_annotated_cn <- process_annotate_overlaps(cnv_df = cnv_sex_chrom, + txdb_exons = tx_exons) # Add germline sex estimate into this data.frame sex_chrom_annotated_cn <- sex_chrom_annotated_cn %>% - dplyr::inner_join(dplyr::select( - histologies_df, - Kids_First_Biospecimen_ID, - germline_sex_estimate - ), - by = c("biospecimen_id" = "Kids_First_Biospecimen_ID") - ) %>% + dplyr::inner_join(dplyr::select(histologies_df, + Kids_First_Biospecimen_ID, + germline_sex_estimate), + by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")) %>% dplyr::select(-germline_sex_estimate, dplyr::everything()) - if (opt$gistic) { - #### Wrangle GISTIC data for sex chromosome data.frame----------------------- - - # Add GISTIC data to final data.frame and select only the sex chromsomes - sex_chrom_annotated_cn <- transposed_gistic_df %>% - dplyr::filter(arm %in% c("Xp", "Xq", "Yp", "Yq")) %>% - dplyr::inner_join(sex_chrom_annotated_cn, by = "biospecimen_id") - } - # Output file name - sex_chrom_output_file <- paste0(opt$filename_lead, "_x_and_y.tsv.bz2") + sex_chrom_output_file <- paste0(opt$filename_lead, "_x_and_y.tsv.gz") # Save final data.frame to a tsv file - readr::write_tsv( - sex_chrom_annotated_cn, - file.path(results_dir, sex_chrom_output_file) - ) + readr::write_tsv(sex_chrom_annotated_cn, + file.path(results_dir, sex_chrom_output_file)) }