diff --git a/DESCRIPTION b/DESCRIPTION index 3b63fd2..9f8399f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Maintainer: Stefan Dentro License: GPL-3 Type: Package Title: Battenberg subclonal copy number caller -Version: 2.2.8 +Version: 2.2.9 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"), person("Peter", "Van Loo", role=c("aut")), person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")), @@ -19,17 +19,17 @@ Depends: grDevices Imports: RColorBrewer, - ASCAT (>= 2.5), + ASCAT (>= 2.5.1), ggplot2, readr, gtools, gridExtra, doParallel, parallel, - foreach + foreach, + splines URL: https://github.com/Wedge-Oxford/battenberg LazyLoad: yes -License: file LICENSE Suggests: testthat -RoxygenNote: 6.0.1 +RoxygenNote: 7.1.0 diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 index 4d5576a..6f311ea --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,13 @@ -#FROM r-base FROM ubuntu:16.04 +USER root + # Add dependencies RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl git libssl-dev curl RUN mkdir /tmp/downloads -RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archive/1.2.1.tar.gz && \ +RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archive/1.7.tar.gz && \ mkdir /tmp/downloads/htslib && \ tar -C /tmp/downloads/htslib --strip-components 1 -zxf tmp.tar.gz && \ make -C /tmp/downloads/htslib && \ @@ -14,7 +15,7 @@ RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archiv ENV HTSLIB /tmp/downloads/htslib -RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/cancerit/alleleCount/archive/v2.1.2.tar.gz && \ +RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/cancerit/alleleCount/archive/v4.0.0.tar.gz && \ mkdir /tmp/downloads/alleleCount && \ tar -C /tmp/downloads/alleleCount --strip-components 1 -zxf tmp.tar.gz && \ cd /tmp/downloads/alleleCount/c && \ @@ -30,21 +31,34 @@ RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/imp cp /tmp/downloads/impute2/impute2 /usr/local/bin && \ rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz -RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel","foreach"))' +RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("gtools", "optparse", "devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel","foreach", "splines"))' RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' -RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' -RUN curl -sSL -o battenberg_wgs.R https://raw.githubusercontent.com/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R +RUN mkdir -p /opt/battenberg +COPY . 
/opt/battenberg/ +RUN R -q -e 'install.packages("/opt/battenberg", repos=NULL, type="source")' + # modify paths to reference files -RUN cat battenberg_wgs.R | \ +RUN cat /opt/battenberg/inst/example/battenberg_wgs.R | \ sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \ sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \ sed 's|G1000PREFIX_AC = \".*|G1000PREFIX_AC = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesloci2012_chr\"|' | \ sed 's|GCCORRECTPREFIX = \".*|GCCORRECTPREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_gcContent/1000_genomes_GC_corr_chr_\"|' | \ - sed 's|PROBLEMLOCI = \".*|PROBLEMLOCI = \"/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz\"|' > /usr/local/bin/battenberg_wgs.R && rm -f battenberg_wgs.R + sed 's|PROBLEMLOCI = \".*|PROBLEMLOCI = \"/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz\"|' | \ + sed 's|REPLICCORRECTPREFIX = \".*|REPLICCORRECTPREFIX = \"/opt/battenberg_reference/battenberg_wgs_replic_correction_1000g_v3/1000_genomes_replication_timing_chr_\"|' > /usr/local/bin/battenberg_wgs.R + +RUN cp /opt/battenberg/inst/example/filter_sv_brass.R /usr/local/bin/filter_sv_brass.R +RUN cp /opt/battenberg/inst/example/battenberg_cleanup.sh /usr/local/bin/battenberg_cleanup.sh -#RUN curl -sSL -o battenberg_snp6.R https://raw.githubusercontent.com/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R -#RUN cat battenberg_snp6.R | \ +#RUN cat /opt/battenberg/inst/example/battenberg_snp6.R | \ # sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \ # sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \ -# sed 's|SNP6_REF_INFO_FILE = \".*|SNP6_REF_INFO_FILE = \"/opt/battenberg_reference/battenberg_snp6/snp6_ref_info_file.txt\"|' > /usr/local/bin/battenberg_snp6.R && rm -f battenberg_wgs.R +# sed 's|SNP6_REF_INFO_FILE = \".*|SNP6_REF_INFO_FILE = \"/opt/battenberg_reference/battenberg_snp6/snp6_ref_info_file.txt\"|' > /usr/local/bin/battenberg_snp6.R + +## USER CONFIGURATION +RUN adduser --disabled-password --gecos '' ubuntu && chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu + +CMD ["/bin/bash"] diff --git a/NAMESPACE b/NAMESPACE index d715f18..c420e17 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,10 @@ # Generated by roxygen2: do not edit by hand -export(plot.haplotype.data) +export() export(GetChromosomeBAFs) export(GetChromosomeBAFs_SNP6) export(allele_ratio_plot) export(battenberg) -export(calc_psi_t) export(calc_rho_psi_refit) export(callSubclones) export(cel2baf.logr) @@ -13,7 +12,6 @@ export(cnfit_to_refit_suggestions) export(combine.baf.files) export(combine.impute.output) export(coverage_plot) -export(find_centroid_of_global_minima) export(fit.copy.number) export(gc.correct) export(gc.correct.wgs) @@ -25,14 +23,14 @@ export(getBAFsAndLogRs) export(infer_gender_birdseed) export(make_posthoc_plots) export(parse.imputeinfofile) +export(plot.haplotype.data) export(prepare_snp6) export(prepare_wgs) export(read_table_generic) export(run.impute) -export(runASCAT) -export(run_clonal_ASCAT) export(run_haplotyping) export(segment.baf.phased) +export(segment.baf.phased.legacy) export(segment.baf.phased.sv) export(squaresplot) export(suggest_refit) @@ -55,4 +53,7 @@ 
importFrom(gridExtra,grid.arrange) importFrom(gtools,mixedsort) importFrom(parallel,makeCluster) importFrom(parallel,stopCluster) +importFrom(readr,cols) importFrom(readr,read_table) +importFrom(readr,write_tsv) +importFrom(splines,ns) diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index 0445e5d..ad54e51 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -1,10 +1,11 @@ #' @import stats graphics grDevices utils ggplot2 #' @importFrom RColorBrewer brewer.pal -#' @importFrom readr read_table +#' @importFrom readr read_table write_tsv cols #' @importFrom gridExtra grid.arrange arrangeGrob #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort #' @importFrom parallel makeCluster stopCluster #' @importFrom doParallel registerDoParallel #' @importFrom foreach foreach %dopar% +#' @importFrom splines ns NULL diff --git a/R/battenberg.R b/R/battenberg.R old mode 100644 new mode 100755 index d27d457..2c4810f --- a/R/battenberg.R +++ b/R/battenberg.R @@ -8,7 +8,8 @@ #' @param imputeinfofile Full path to a Battenberg impute info file with pointers to Impute2 reference data #' @param g1000prefix Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data #' @param problemloci Full path to a problem loci file that contains SNP loci that should be filtered out -#' @param gccorrectprefix Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA) +#' @param gccorrectprefix Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NULL) +#' @param repliccorrectprefix Full prefix path to replication timing files, as part of the Battenberg reference data, not required for SNP6 data (Default: NULL) #' @param g1000allelesprefix Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA) #' @param ismale A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA) #' @param data_type String that contains either wgs or snp6 depending on the supplied input data (Default: wgs) @@ -30,7 +31,7 @@ #' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10) #' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20) #' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35) -#' @param calc_seg_baf_option Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1) +#' @param calc_seg_baf_option Sets way to calculate BAF per segment: 1=mean, 2=median, 3=ifelse median==0 | 1, mean, median (Default: 3) #' @param skip_allele_counting Provide TRUE when allele counting can be skipped (i.e. 
its already done) (Default: FALSE) #' @param skip_preprocessing Provide TRUE when preprocessing is already complete (Default: FALSE) #' @param skip_phasing Provide TRUE when phasing is already complete (Default: FALSE) @@ -40,35 +41,45 @@ #' @param norm.geno.clust.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl) #' @param birdseed_report_file Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt) #' @param heterozygousFilter Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none") +#' @param prior_breakpoints_file A two column file with prior breakpoints to be used during segmentation (Default: NULL) #' @author sd11 #' @export -battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, - gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, +battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, gccorrectprefix=NULL, + repliccorrectprefix=NULL, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6, max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, - min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, skip_phasing=F, + min_map_qual=35, calc_seg_baf_option=3, skip_allele_counting=F, skip_preprocessing=F, skip_phasing=F, snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", - norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") { + norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none", + prior_breakpoints_file=NULL) { requireNamespace("foreach") requireNamespace("doParallel") requireNamespace("parallel") if (data_type=="wgs" & is.na(ismale)) { - print("Please provide a boolean denominator whether this sample represents a male donor") - q(save="no", status=1) + stop("Please provide a boolean denominator whether this sample represents a male donor") } if (data_type=="wgs" & is.na(g1000allelesprefix)) { - print("Please provide a path to 1000 Genomes allele reference files") - q(save="no", status=1) + stop("Please provide a path to 1000 Genomes allele reference files") } - if (data_type=="wgs" & is.na(gccorrectprefix)) { - print("Please provide a path to GC content reference files") - q(save="no", status=1) + if (data_type=="wgs" & is.null(gccorrectprefix)) { + stop("Please provide a path to GC content reference files") } + if (!file.exists(problemloci)) { + stop("Please provide a path to a problematic loci file") + } + + if (!file.exists(imputeinfofile)) { + stop("Please provide a path to an impute info file") + } + + # check whether the impute_info.txt file contains correct paths + check.imputeinfofile(imputeinfofile, ismale) + if (data_type=="wgs" | data_type=="WGS") { chrom_names = get.chrom.names(imputeinfofile, ismale) logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="") @@ 
-92,7 +103,8 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file normalname=normalname, g1000allelesprefix=g1000allelesprefix, g1000prefix=g1000prefix, - gccorrectprefix=gccorrectprefix, + gccorrectprefix=gccorrectprefix, + repliccorrectprefix=repliccorrectprefix, min_base_qual=min_base_qual, min_map_qual=min_map_qual, allelecounter_exe=allelecounter_exe, @@ -164,6 +176,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file segment.baf.phased(samplename=tumourname, inputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), outputfile=paste(tumourname, ".BAFsegmented.txt", sep=""), + prior_breakpoints_file=prior_breakpoints_file, gamma=segmentation_gamma, phasegamma=phasing_gamma, kmin=segmentation_kmin, @@ -198,7 +211,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), output.gw.figures.prefix=paste(tumourname,"_BattenbergProfile", sep=""), masking_output_file=paste(tumourname, "_segment_masking_details.txt", sep=""), - sv_breakpoints_file="NA", + prior_breakpoints_file=prior_breakpoints_file, chr_names=chrom_names, gamma=platform_gamma, segmentation.gamma=NA, @@ -222,9 +235,5 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), gamma_param=platform_gamma) - - - - } diff --git a/R/clonal_ascat.R b/R/clonal_ascat.R index a7c96a5..49a5aad 100755 --- a/R/clonal_ascat.R +++ b/R/clonal_ascat.R @@ -1423,6 +1423,8 @@ runASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choi psi = NA ploidy = NA rho = NA + psi_opt1_plot = -1 + rho_opt1_plot = -1 } # separated plotting from logic: create distanceplot here @@ -1634,6 +1636,12 @@ run_clonal_ASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, s # Recalculate the psi_t for this rho using only clonal segments psi_t = recalc_psi_t(psi_without_ref, rho_without_ref, gamma_param, lrrsegmented, segBAF.table, siglevel_BAF, maxdist_BAF, include_subcl_segments=F) + + # If there aren't any clonally fit segments, the above yields NA. In this case, revert to the original grid search psi_t + if (is.na(psi_t)) { + print("Recalculated psi_t was NA, reverting to grid search solution. 
This occurs when no segment could be fit with a clonal state, check sample for contamination") + psi_t = psi_without_ref + } output_optimum_pair = list(psi = psi_opt1, rho = rho_opt1, ploidy = ploidy_opt1) #output_optimum_pair_without_ref = list(psi = psi_without_ref, rho = rho_without_ref, ploidy = ploidy_without_ref) diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index 99614d0..cf21d49 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -39,9 +39,9 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente } # Read in the required data - segmented.BAF.data = read.table(inputfile.baf.segmented, header=T, stringsAsFactors=F) - raw.BAF.data = as.data.frame(read_table_generic(inputfile.baf)) - raw.logR.data = as.data.frame(read_table_generic(inputfile.logr)) + segmented.BAF.data = as.data.frame(read_bafsegmented(inputfile.baf.segmented)) + raw.BAF.data = as.data.frame(read_baf(inputfile.baf)) + raw.logR.data = as.data.frame(read_logr(inputfile.logr)) # Assign rownames as those are required by various clonal_ascat.R functions # If there are duplicates (possible with old versions of BB) then remove those @@ -65,20 +65,24 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente # raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1]) #} - BAF.data = NULL - logR.data = NULL - segmented.logR.data = NULL - matched.segmented.BAF.data = NULL + BAF.data = list() + logR.data = list() + segmented.logR.data = list() + matched.segmented.BAF.data = list() chr.names = unique(segmented.BAF.data[,1]) + baf_segmented_split = split(segmented.BAF.data, f=segmented.BAF.data$Chromosome) + baf_split = split(raw.BAF.data, f=raw.BAF.data$Chromosome) + logr_split = split(raw.logR.data, f=raw.logR.data$Chromosome) + # For each chromosome for(chr in chr.names){ - chr.BAF.data = raw.BAF.data[raw.BAF.data$Chromosome==chr,] + chr.BAF.data = baf_split[[chr]] + # Skip the rest if there is no data for this chromosome if(nrow(chr.BAF.data)==0){ next } # Match segments with chromosome position - # TODO: Make sure there are column names for this file at the output of segmentation - chr.segmented.BAF.data = segmented.BAF.data[segmented.BAF.data[,1]==chr,] + chr.segmented.BAF.data = baf_segmented_split[[chr]] indices = match(chr.segmented.BAF.data[,2],chr.BAF.data$Position ) if (sum(is.na(indices))==length(indices) | length(indices)==0) { @@ -89,13 +93,13 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente chr.segmented.BAF.data = chr.segmented.BAF.data[!is.na(indices),] # Append the segmented data - matched.segmented.BAF.data = rbind(matched.segmented.BAF.data, chr.segmented.BAF.data) - BAF.data = rbind(BAF.data, chr.BAF.data[indices[!is.na(indices)],]) + matched.segmented.BAF.data[[chr]] = chr.segmented.BAF.data + BAF.data[[chr]] = chr.BAF.data[indices[!is.na(indices)],] # Append raw LogR - chr.logR.data = raw.logR.data[raw.logR.data$Chromosome==chr,] + chr.logR.data = logr_split[[chr]] indices = match(chr.segmented.BAF.data[,2],chr.logR.data$Position) - logR.data = rbind(logR.data, chr.logR.data[indices[!is.na(indices)],]) + logR.data[[chr]] = chr.logR.data[indices[!is.na(indices)],] chr.segmented.logR.data = chr.logR.data[indices[!is.na(indices)],] # Append segmented LogR @@ -104,24 +108,33 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente for(s in 1:length(segs)){ chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3] = mean(chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3], na.rm=T) } - 
segmented.logR.data = rbind(segmented.logR.data,chr.segmented.logR.data) + segmented.logR.data[[chr]] = chr.segmented.logR.data } - names(matched.segmented.BAF.data)[5] = samplename - + # Sync the dataframes selection = c() for (chrom in chr.names) { - matched.segmented.BAF.data.chr = matched.segmented.BAF.data[matched.segmented.BAF.data[,1]==chrom,] - logR.data.chr = logR.data[logR.data[,1]==chrom,] - selection = c(selection, matched.segmented.BAF.data.chr[,2] %in% logR.data.chr[,2]) + matched.segmented.BAF.data.chr = matched.segmented.BAF.data[[chrom]] #matched.segmented.BAF.data[matched.segmented.BAF.data[,1]==chrom,] + logR.data.chr = logR.data[[chrom]] #logR.data[logR.data[,1]==chrom,] + + selection = matched.segmented.BAF.data.chr[,2] %in% logR.data.chr[,2] + matched.segmented.BAF.data[[chrom]] = matched.segmented.BAF.data.chr[selection,] + segmented.logR.data[[chrom]] = segmented.logR.data[[chrom]][selection,] } + + # Combine the split data frames into a single for the subsequent steps + matched.segmented.BAF.data = do.call(rbind, matched.segmented.BAF.data) + segmented.logR.data = do.call(rbind, segmented.logR.data) + BAF.data = do.call(rbind, BAF.data) + logR.data = do.call(rbind, logR.data) + names(matched.segmented.BAF.data)[5] = samplename - matched.segmented.BAF.data = matched.segmented.BAF.data[selection,] - segmented.logR.data = segmented.logR.data[selection,] + # write out the segmented logR data row.names(segmented.logR.data) = row.names(matched.segmented.BAF.data) row.names(logR.data) = row.names(matched.segmented.BAF.data) write.table(segmented.logR.data,paste(samplename,".logRsegmented.txt",sep=""),sep="\t",quote=F,col.names=F,row.names=F) + # Prepare the data for going into the runASCAT functions segBAF = 1-matched.segmented.BAF.data[,5] segLogR = segmented.logR.data[,3] logR = logR.data[,3] @@ -178,17 +191,17 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente #' @param chr_names Vector of allowed chromosome names #' @param masking_output_file Filename of where the masking details need to be written. Masking is performed to remove very high copy number state segments #' @param max_allowed_state The maximum CN state allowed (Default 100) -#' @param sv_breakpoints_file A two column file with breakpoints from structural variants. These are used when making the figures +#' @param prior_breakpoints_file A two column file with prior breakpoints, possibly from structural variants. This file must contain two columns: chromosome and position. These are used when making the figures #' @param gamma Technology specific scaling parameter for LogR (Default 1) #' @param segmentation.gamma Legacy parameter that is no longer used (Default NA) #' @param siglevel Threshold under which a p-value becomes significant. When it is significant a second copy number state will be fitted (Default 0.05) #' @param maxdist Slack in BAF space to allow a segment to be off it's optimum before becoming significant. A segment becomes significant very quickly when a breakpoint is missed, this parameter alleviates the effect (Default 0.01) #' @param noperms The number of permutations to be run when bootstrapping the confidence intervals on the copy number state of each segment (Default 1000) #' @param seed Seed to set when performing bootstrapping (Default: Current time) -#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. 
(Default: 1) +#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean, 3 - ifelse median==0|1, mean, median. (Default: 3) #' @author dw9, sd11 #' @export -callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.file, output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, masking_output_file, max_allowed_state=250, sv_breakpoints_file=NULL, gamma=1, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, noperms=1000, seed=as.integer(Sys.time()), calc_seg_baf_option=1) { +callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.file, output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, masking_output_file, max_allowed_state=250, prior_breakpoints_file=NULL, gamma=1, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, noperms=1000, seed=as.integer(Sys.time()), calc_seg_baf_option=3) { set.seed(seed) @@ -200,7 +213,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil goodness = res$goodness # Load the BAF segmented data - BAFvals = read.table(baf.segmented.file, sep="\t", header=T, stringsAsFactors=F) #, row.names=F + BAFvals = as.data.frame(read_bafsegmented(baf.segmented.file)) if (colnames(BAFvals)[1] == "X") { # If there were rownames, then delete this column. Should not be an issue with new BB runs BAFvals = BAFvals[,-1] @@ -214,8 +227,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil SNPpos = BAFvals[,c(1,2)] # Load the raw LogR data - # LogRvals = read.table(logr.file,sep="\t", header=T, stringsAsFactors=F) - LogRvals = as.data.frame(read_table_generic(logr.file)) + LogRvals = as.data.frame(read_logr(logr.file)) if (colnames(LogRvals)[1] == "X") { # If there were rownames, then delete this column. 
Should not be an issue with new BB runs LogRvals = LogRvals[,-1] @@ -234,8 +246,6 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil # = as.vector(ctrans.logR[as.vector(LogRvals[,1])]*1000000000+LogRvals[,2]) BAFpos = as.vector(ctrans[as.vector(BAFvals[,1])]*1000000000+BAFvals[,2]) - #save(file="subclones_temp.RData", BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms) - ################################################################################################ # Determine copy number for each segment ################################################################################################ @@ -267,8 +277,8 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil ################################################################################################ # Collapse the BAFsegmented into breakpoints to be used in plotting segment_breakpoints = collapse_bafsegmented_to_segments(BAFvals) - if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) { - svs = read.table(sv_breakpoints_file, header=T, stringsAsFactors=F) + if (!is.null(prior_breakpoints_file) & !ifelse(is.null(prior_breakpoints_file), TRUE, prior_breakpoints_file=="NA") & !ifelse(is.null(prior_breakpoints_file), TRUE, is.na(prior_breakpoints_file))) { + svs = read.table(prior_breakpoints_file, header=T, stringsAsFactors=F) } # Create a plot per chromosome that shows the segments with their CN state in text @@ -277,7 +287,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil #if no points to plot, skip if (length(pos)==0) { next } - if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) { + if (!is.null(prior_breakpoints_file) & !ifelse(is.null(prior_breakpoints_file), TRUE, prior_breakpoints_file=="NA") & !ifelse(is.null(prior_breakpoints_file), TRUE, is.na(prior_breakpoints_file))) { svs_pos = svs[svs$chromosome==chr,]$position / 1000000 } else { svs_pos = NULL @@ -513,13 +523,13 @@ determine_copynumber = function(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctra #' @param rho The rho estimate that the profile was fit with #' @param psi the psi estimate that the profile was fit with #' @param platform_gamma The gamma parameter for this platform -#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. (Default: 1) +#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean, 3 - ifelse median== 0|1, mean, median. (Default: 3) #' @return A list with two fields: bafsegmented and subclones. The subclones field contains a data.frame in #' Battenberg output format with the merged segments. The bafsegmented field contains the BAFsegmented data #' corresponding to the provided subclones data.frame. 
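The prior_breakpoints_file referenced above is, per the callSubclones() documentation and the read.table() call that consumes it, a headered tab-separated table with chromosome and position columns. A minimal sketch of writing and supplying such a file (the file name and breakpoint coordinates here are illustrative only, not from the package):

```r
# Illustrative prior breakpoints, e.g. taken from SV calls
bkps = data.frame(chromosome = c("1", "8"),
                  position = c(32100000, 128750000))
write.table(bkps, file = "tumour_prior_breakpoints.txt",
            sep = "\t", quote = FALSE, row.names = FALSE)

# battenberg(..., prior_breakpoints_file = "tumour_prior_breakpoints.txt")
# passes the file through to segment.baf.phased() and callSubclones()
```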
#' @author sd11 #' @noRd -merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamma, calc_seg_baf_option=1) { +merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamma, calc_seg_baf_option=3) { subclones_cleaned = data.frame() merged = T counter = 1 @@ -567,15 +577,33 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm new_entry = data.frame(subclones[i-1,]) new_entry$endpos = subclones[i,]$endpos + # + # Note: When adding options, also add to segment.baf.phased + # if (calc_seg_baf_option==1) { new_entry$BAF = median(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) - } else if (calc_seg_baf_option==2 | ! calc_seg_baf_option %in% c(1,2)) { + } else if (calc_seg_baf_option==2) { new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) - if (! calc_seg_baf_option %in% c(1,2)) { - warning("Supplied calc_seg_baf_option to callSubclones not valid, using mean BAF by default") + } else if (calc_seg_baf_option==3) { + mean_baf = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], + bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) + median_baf = median(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], + bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) + # We'll prefer the median BAF as a segment summary + # but change to the mean when the median is extreme + # as at 0 or 1 the BAF is uninformative for the fitting + if (median_baf!=0 & mean_baf!=1) { + new_entry$BAF = median_baf + } else { + new_entry$BAF = mean_baf } + } else { + warning("Supplied calc_seg_baf_option to callSubclones not valid, using mean BAF by default") + # The mean is taken here as that has originally been the default. 
This behaviour is consistent with the segmentation functions + new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], + bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) } new_entry$LogR = mean(c(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3], diff --git a/R/haplotype.R b/R/haplotype.R index 7531785..1e2850e 100644 --- a/R/haplotype.R +++ b/R/haplotype.R @@ -118,13 +118,21 @@ plot.haplotype.data = function(haplotyped.baf.file, imageFileName, samplename, c chr_name = chr_names[chrom] mut_data = read.table(haplotyped.baf.file,sep="\t",header=T) + if (nrow(mut_data) > 0) { + x_min = min(mut_data$Position,na.rm=T) + x_max = max(mut_data$Position,na.rm=T) + } else { + x_min = 1 + x_max = 2 + } + png(filename = imageFileName, width = 10000, height = 2500, res = 500) create.haplotype.plot(chrom.position=mut_data$Position, points.blue=mut_data[,3], points.red=1-mut_data[,3], - x.min=min(mut_data$Position,na.rm=T), - x.max=max(mut_data$Position,na.rm=T), - title=paste(samplename,", chromosome",mut_data[1,1], sep=" "), + x.min=x_min, + x.max=x_max, + title=paste(samplename,", chromosome",chr_name, sep=" "), xlab="pos", ylab="BAF") dev.off() diff --git a/R/impute.R b/R/impute.R index 29356a4..0d58f3b 100644 --- a/R/impute.R +++ b/R/impute.R @@ -73,6 +73,17 @@ parse.imputeinfofile = function(imputeinfofile, is.male, chrom=NA) { return(impute.info) } +#' Check impute info file consistency +#' @param imputeinfofile Path to the imputeinfofile on disk. +#' @author sd11 +check.imputeinfofile = function(imputeinfofile, is.male) { + impute.info = parse.imputeinfofile(imputeinfofile, is.male) + if (any(!file.exists(impute.info$impute_legend) | !file.exists(impute.info$genetic_map) | !file.exists(impute.info$impute_hap))) { + print("Could not find reference files, make sure paths in impute_info.txt point to the correct location") + stop("Could not find reference files, make sure paths in impute_info.txt point to the correct location") + } +} + #' Returns the chromosome names that are supported #' @param imputeinfofile Path to the imputeinfofile on disk. #' @param is.male A boolean describing whether the sample under study is male. @@ -204,5 +215,5 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile chr_names=chrom_names) # Cleanup temp Impute output - unlink(paste(tumourname, "_impute_output_chr", chrom, "*K.txt*", sep="")) + unlink(paste(tumourname, "_impute_output_chr", chrom, ".txt*K.txt*", sep="")) } diff --git a/R/plotting.R b/R/plotting.R index fb3c947..13e1ae2 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -4,21 +4,23 @@ create.haplotype.plot = function(chrom.position, points.blue, points.red, x.min, x.max, title, xlab, ylab) { par(pch=".", cex=1, cex.main=0.8, cex.axis = 0.6, cex.lab=0.7,yaxp=c(-0.05,1.05,6)) plot(c(x.min,x.max), c(0,1), type="n", main=title, xlab=xlab, ylab=ylab) - points(chrom.position, points.blue, col="blue") - points(chrom.position, points.red, col="red") + if (length(chrom.position) > 0) { + points(chrom.position, points.blue, col="blue") + points(chrom.position, points.red, col="red") + } } #' Function that plots two types of data points against it's chromosomal location. #' Note: This is a plot PER chromosome. 
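The new calc_seg_baf_option==3 introduced in merge_segments() above (and in segment.baf.phased() further down) summarises a segment by the median phased BAF and falls back to the mean when the median sits at an uninformative extreme of exactly 0 or 1. Note that the merge_segments() condition in the patch compares median_baf!=0 & mean_baf!=1, so the sketch below follows the documented intent rather than that exact line; the helper name is hypothetical:

```r
# Sketch of the intended option 3 rule: prefer the median phased BAF,
# use the mean when the median is at an uninformative extreme (0 or 1)
summarise_segment_baf = function(baf_phased) {
  median_baf = median(baf_phased, na.rm = TRUE)
  mean_baf = mean(baf_phased, na.rm = TRUE)
  if (median_baf != 0 & median_baf != 1) median_baf else mean_baf
}

summarise_segment_baf(c(0.95, 0.97, 1, 1, 1))  # median is 1, so the mean (0.984) is returned
```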
#' @noRd -create.segmented.plot = function(chrom.position, points.red, points.green, x.min, x.max, title, xlab, ylab, svs_pos=NULL) { +create.segmented.plot = function(chrom.position, points.red, points.green, x.min, x.max, title, xlab, ylab, prior_bkps_pos=NULL) { par(mar = c(5,5,5,0.5), cex = 0.4, cex.main=3, cex.axis = 2, cex.lab = 2) plot(c(x.min,x.max), c(0,1), pch=".", type="n", main=title, xlab=xlab, ylab=ylab) points(chrom.position, points.red, pch=".", col="red", cex=2) points(chrom.position, points.green, pch=19, cex=0.5, col="green") - if (!is.null(svs_pos)) { - for (i in 1:length(svs_pos)) { - abline(v=svs_pos[i]) + if (!is.null(prior_bkps_pos)) { + for (i in 1:length(prior_bkps_pos)) { + abline(v=prior_bkps_pos[i]) } } } @@ -26,15 +28,15 @@ create.segmented.plot = function(chrom.position, points.red, points.green, x.min #' Function that plots two types of data points against it's chromosomal location. #' Note: This is a plot PER chromosome. #' @noRd -create.baf.plot = function(chrom.position, points.red.blue, plot.red, points.darkred, points.darkblue, x.min, x.max, title, xlab, ylab, svs_pos=NULL) { +create.baf.plot = function(chrom.position, points.red.blue, plot.red, points.darkred, points.darkblue, x.min, x.max, title, xlab, ylab, prior_bkps_pos=NULL) { par(mar = c(5,5,5,0.5), cex = 0.4, cex.main=3, cex.axis = 2, cex.lab = 2) plot(c(x.min,x.max), c(0,1), pch=".", type = "n", main=title, xlab=xlab, ylab=ylab) points(chrom.position, points.red.blue, pch=".", col=ifelse(plot.red, "red", "blue"), cex=2) points(chrom.position, points.darkred, pch=19, cex=0.5, col="darkred") points(chrom.position, points.darkblue, pch=19, cex=0.5, col="darkblue") - if (!is.null(svs_pos)) { - for (i in 1:length(svs_pos)) { - abline(v=svs_pos[i]) + if (!is.null(prior_bkps_pos)) { + for (i in 1:length(prior_bkps_pos)) { + abline(v=prior_bkps_pos[i]) } } } @@ -366,6 +368,11 @@ totalcn_chrom_plot = function(samplename, subclones, logr, outputfile, purity) { max_cn_plot = ifelse(max_cn_plot_fit > max_cn_plot_data, max_cn_plot_fit, max_cn_plot_data) maxpos = max(logr$Position) + # catch case when there is no clonal CNA called + if (is.na(max_cn_plot) | max_cn_plot < 4) { + max_cn_plot = 4 + } + # These are the grey lines in the background background = data.frame(xmin=rep(0, (max_cn_plot/2)+1), xmax=rep(max(logr$Position), (max_cn_plot/2)+1), diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 7db9779..02dcc12 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -16,6 +16,12 @@ getAlleleCounts = function(bam.file, output.file, g1000.loci, min.base.qual=20, "-o", output.file, "-m", min.base.qual, "-q", min.map.qual) + + + # alleleCount >= v4.0.0 is sped up considerably on 1000G loci when run in dense-snp mode + counter_version = system(paste(allelecounter.exe, "--version"), intern = T) + if (as.integer(substr(x = counter_version, start = 1, stop = 1)) >= 4) + cmd = paste(cmd, "--dense-snps") system(cmd, wait=T) } @@ -243,83 +249,123 @@ generate.impute.input.wgs = function(chrom, tumour.allele.counts.file, normal.al #' @param gc_content_file_prefix String pointing to where GC windows for this reference genome can be #' found. These files should be split per chromosome and this prefix must contain the full path until #' chr in its name. The .txt extension is automatically added. 
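The rewritten gc.correct.wgs() that follows replaces the old polynomial fit with natural splines (splines::ns, df=5) on the best-correlated short and long GC windows and, when reference files are supplied, on replication timing, keeping the regression residuals as the corrected logR. A minimal, self-contained sketch of that model on synthetic data (all values and variable names are illustrative; the patch additionally passes intercept=T to ns()):

```r
library(splines)

set.seed(42)
n = 5000
gc_insert = runif(n, 0.3, 0.6)  # GC fraction in the best short window
gc_amplic = runif(n, 0.3, 0.6)  # GC fraction in the best long window
replic = runif(n)               # replication timing at each SNP
logr = 0.8 * gc_insert^2 - 0.5 * replic + rnorm(n, sd = 0.1)

corrdata = data.frame(logr, GC_insert = gc_insert, GC_amplic = gc_amplic, replic)
model = lm(logr ~ ns(GC_insert, df = 5) + ns(GC_amplic, df = 5) + ns(replic, df = 5),
           data = corrdata, na.action = "na.exclude")

# The residuals become the GC- and replication-timing-corrected logR
logr_corrected = residuals(model)
```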
+#' @param replic_timing_file_prefix Like the gc_content_file_prefix, containing replication timing info (supply NULL if no replication timing correction is to be applied) #' @param chrom_names A vector containing chromosome names to be considered -#' @param recalc_corr_afterwards Set to TRUE to recalculate correlations with GC content after correction -#' @author sd11 +#' @param recalc_corr_afterwards Set to TRUE to recalculate correlations after correction +#' @author jonas demeulemeester, sd11 #' @export -gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_content_file_prefix, chrom_names, recalc_corr_afterwards=F) { +gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_content_file_prefix, replic_timing_file_prefix, chrom_names, recalc_corr_afterwards=F) { + + if (is.null(gc_content_file_prefix)) { + stop("GC content reference files must be supplied to WGS GC content correction") + } + + Tumor_LogR = read_logr(Tumour_LogR_file) + + print("Processing GC content data") + chrom_idx = 1:length(chrom_names) + gc_files = paste0(gc_content_file_prefix, chrom_idx, ".txt.gz") + GC_data = do.call(rbind, lapply(gc_files, read_gccontent)) + colnames(GC_data) = c("chr", "Position", paste0(c(25,50,100,200,500), "bp"), + paste0(c(1,2,5,10,20,50,100), "kb"))#,200,500), "kb"), + # paste0(c(1,2,5,10), "Mb")) - Tumor_LogR = read.table(Tumour_LogR_file, stringsAsFactors=F, header=T) + if (!is.null(replic_timing_file_prefix)) { + print("Processing replication timing data") + replic_files = paste0(replic_timing_file_prefix, chrom_idx, ".txt.gz") + replic_data = do.call(rbind, lapply(replic_files, read_replication)) + } + + # omit non-matching loci, replication data generated at exactly same GC loci + locimatches = match(x = paste0(Tumor_LogR$Chromosome, "_", Tumor_LogR$Position), + table = paste0(GC_data$chr, "_", GC_data$Position)) + Tumor_LogR = Tumor_LogR[which(!is.na(locimatches)), ] + GC_data = GC_data[na.omit(locimatches), ] + if (!is.null(replic_timing_file_prefix)) { + replic_data = replic_data[na.omit(locimatches), ] + } + rm(locimatches) + + corr = abs(cor(GC_data[, 3:ncol(GC_data)], Tumor_LogR[,3], use="complete.obs")[,1]) + if (!is.null(replic_timing_file_prefix)) { + corr_rep = abs(cor(replic_data[, 3:ncol(replic_data)], Tumor_LogR[,3], use="complete.obs")[,1]) + } + + index_1kb = which(names(corr)=="1kb") + maxGCcol_insert = names(which.max(corr[1:index_1kb])) + index_100kb = which(names(corr)=="100kb") + # start large window sizes at 5kb rather than 2kb to avoid overly correlated expl variables + maxGCcol_amplic = names(which.max(corr[(index_1kb+2):index_100kb])) + if (!is.null(replic_timing_file_prefix)) { + maxreplic = names(which.max(corr_rep)) + } + + if (!is.null(replic_timing_file_prefix)) { + cat("Replication timing correlation: ",paste(names(corr_rep),format(corr_rep,digits=2), ";"),"\n") + cat("Replication dataset: " ,maxreplic,"\n") + } + cat("GC correlation: ",paste(names(corr),format(corr,digits=2), ";"),"\n") + cat("Short window size: ",maxGCcol_insert,"\n") + cat("Long window size: ",maxGCcol_amplic,"\n") + + if (!is.null(replic_timing_file_prefix)) { + # Multiple regression - with replication timing + corrdata = data.frame(logr = Tumor_LogR[,3, drop = T], + GC_insert = GC_data[,maxGCcol_insert, drop = T], + GC_amplic = GC_data[,maxGCcol_amplic, drop = T], + replic = replic_data[, maxreplic, drop = T]) + colnames(corrdata) = c("logr", "GC_insert", "GC_amplic", "replic") + if (!recalc_corr_afterwards) + rm(GC_data, replic_data) -
print("Processing GC content data") - GC_data = list() - Tumor_LogR_new = list() - for (chrindex in 1:length(chrom_names)) { - chrom = chrom_names[chrindex] - print(paste("chr =", chrindex)) - Tumor_LogR_chr = Tumor_LogR[Tumor_LogR$Chromosome==chrom,] - GC_newlist = read.table(paste(gc_content_file_prefix, chrindex, ".txt.gz", sep=""), header=T, stringsAsFactors=F) - colnames(GC_newlist)[c(1,2)] = c("Chr","Position") - GC_newlist = GC_newlist[GC_newlist$Position %in% Tumor_LogR_chr$Position,] - GC_data[[length(GC_data)+1]] = GC_newlist - Tumor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[!is.na(match(Tumor_LogR_chr$Position, GC_newlist$Position)),] + model = lm(logr ~ splines::ns(x = GC_insert, df = 5, intercept = T) + splines::ns(x = GC_amplic, df = 5, intercept = T) + splines::ns(x = replic, df = 5, intercept = T), y=F, model = F, data = corrdata, na.action="na.exclude") + + corr = data.frame(windowsize=c(names(corr), names(corr_rep)), correlation=c(corr, corr_rep)) + write.table(corr, file=gsub(".txt", "_beforeCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) + + } else { + # Multiple regression - without replication timing + corrdata = data.frame(logr = Tumor_LogR[,3, drop = T], + GC_insert = GC_data[,maxGCcol_insert, drop = T], + GC_amplic = GC_data[,maxGCcol_amplic, drop = T]) + colnames(corrdata) = c("logr", "GC_insert", "GC_amplic") + if (!recalc_corr_afterwards) + rm(GC_data) + + model = lm(logr ~ splines::ns(x = GC_insert, df = 5, intercept = T) + splines::ns(x = GC_amplic, df = 5, intercept = T), y=F, model = F, data = corrdata, na.action="na.exclude") + + corr = data.frame(windowsize=names(corr), correlation=corr) + write.table(corr, file=gsub(".txt", "_beforeCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) } - Tumor_LogR = do.call(rbind, Tumor_LogR_new) - rm(Tumor_LogR_new) - GC_data = do.call(rbind, GC_data) - - flag_nona = is.finite(Tumor_LogR[,3]) - corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs") - length = nrow(Tumor_LogR) - - corr = apply(corr, 1, function(x) sum(abs(x*length))/sum(length)) - index_1M = c(which(names(corr)=="X1M"), which(names(corr)=="X1Mb")) - maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)]))[1] - maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)]))[1] - maxGCcol_long = maxGCcol_long+(index_1M-1) - - cat("weighted correlation: ",paste(names(corr),format(corr,digits=2), ";"),"\n") - cat("Short window size: ",names(GC_data)[maxGCcol_short+2],"\n") - cat("Long window size: ",names(GC_data)[maxGCcol_long+2],"\n") - - # Multiple regression - flag_NA = (is.infinite(Tumor_LogR[,3])) | (is.na(GC_data[,2+maxGCcol_short])) | (is.na(GC_data[,2+maxGCcol_long])) - td_select = Tumor_LogR[!flag_NA,3] - GC_data_select = GC_data[!flag_NA,] - x1 = GC_data_select[,2+maxGCcol_short] - x2 = GC_data_select[,2+maxGCcol_long] - x3 = (x1)^2 - x4 = (x2)^2 - model = lm(td_select ~ x1+x2+x3+x4, y=T, na.action="na.omit") - - GCcorrected = Tumor_LogR[!flag_NA,] - GCcorrected[,3] = model$residuals - rm(Tumor_LogR) - - corr = data.frame(windowsize=names(corr), correlation=corr) - write.table(corr, file=gsub(".txt", "_beforeCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) - write.table(GCcorrected, file=outfile, sep="\t", quote=F, row.names=F) + + Tumor_LogR[,3] = residuals(model) + rm(model, corrdata) + + readr::write_tsv(x=Tumor_LogR[which(!is.na(Tumor_LogR[,3])), ], path=outfile) if (recalc_corr_afterwards) { - 
# Recalculate the correlations to see how much there is left - snps_gccorrected = paste(GCcorrected[,1], GCcorrected[,2], sep="_") - snps_gcdata = paste(GC_data[,1], GC_data[,2], sep="_") - snps_intersect = intersect(snps_gccorrected, snps_gcdata) - GCcorrected = GCcorrected[snps_gccorrected %in% snps_intersect, ] - GC_data = GC_data[snps_gcdata %in% snps_intersect, ] - - flag_nona = is.finite(GCcorrected[,3]) - corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], GCcorrected[flag_nona,3], use="complete.obs") - length = nrow(GCcorrected) - corr = apply(corr, 1, function(x) sum(abs(x*length))/sum(length)) - corr = data.frame(windowsize=names(corr), correlation=corr) - write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) + # Recalculate the correlations to see how much there is left + corr = abs(cor(GC_data[, 3:ncol(GC_data)], Tumor_LogR[,3], use="complete.obs")[,1]) + if (!is.null(replic_timing_file_prefix)) { + corr_rep = abs(cor(replic_data[, 3:ncol(replic_data)], Tumor_LogR[,3], use="complete.obs")[,1]) + cat("Replication timing correlation post correction: ",paste(names(corr_rep),format(corr_rep,digits=2), ";"),"\n") + } + cat("GC correlation post correction: ",paste(names(corr),format(corr,digits=2), ";"),"\n") + + if (!is.null(replic_timing_file_prefix)) { + corr = data.frame(windowsize=c(names(corr), names(corr_rep)), correlation=c(corr, corr_rep)) + write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) + } else { + corr = data.frame(windowsize=c(names(corr)), correlation=corr) + write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) + } } else { - corr = data.frame(windowsize=names(corr), correlation=rep(NA, length(corr))) - write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) + corr$correlation = NA + write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) } } + #' Prepare WGS data for haplotype construction #' #' This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR @@ -333,6 +379,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co #' @param g1000allelesprefix Prefix path to the 1000 Genomes alleles reference files #' @param g1000prefix Prefix path to the 1000 Genomes SNP reference files #' @param gccorrectprefix Prefix path to GC content reference data +#' @param repliccorrectprefix Prefix path to replication timing reference data (supply NULL if no replication timing correction is to be applied) #' @param min_base_qual Minimum base quality required for a read to be counted #' @param min_map_qual Minimum mapping quality required for a read to be counted #' @param allelecounter_exe Path to the allele counter executable (can be found in $PATH) @@ -342,7 +389,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co #' @author sd11 #' @export prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, - min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) { + repliccorrectprefix, min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) { requireNamespace("foreach") requireNamespace("doParallel") @@ -385,5 +432,6 @@ 
prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname outfile=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), correlations_outfile=paste(tumourname, "_GCwindowCorrelations.txt", sep=""), gc_content_file_prefix=gccorrectprefix, + replic_timing_file_prefix=repliccorrectprefix, chrom_names=chrom_names) } diff --git a/R/segmentation.R b/R/segmentation.R index 41cc7e0..65e8189 100644 --- a/R/segmentation.R +++ b/R/segmentation.R @@ -23,8 +23,7 @@ adjustSegmValues = function(baf_chrom) { return(baf_chrom) } - -#' Segment the haplotyped and phased data using fastPCF. +#' Segment the haplotyped and phased data using fastPCF. This is the legacy segmentation function as it was used in the original Battenberg versions #' #' This function performs segmentation. This is done in two steps. First a segmentation step #' that aims to find short segments. These are used to find haplotype blocks that have been @@ -42,8 +41,30 @@ adjustSegmValues = function(baf_chrom) { #' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. (Default: 1) #' @author dw9 #' @export -segment.baf.phased = function(samplename, inputfile, outputfile, gamma=10, phasegamma=3, kmin=3, phasekmin=3, calc_seg_baf_option=1) { - BAFraw = read.table(inputfile,sep="\t",header=T, stringsAsFactors=F) +segment.baf.phased.legacy = function(samplename, inputfile, outputfile, gamma=10, phasegamma=3, kmin=3, phasekmin=3, calc_seg_baf_option=1) { + + +} + +#' Segment the haplotyped and phased data using fastPCF. This is the legacy segmentation function as it was used in the original Battenberg versions +#' +#' This function performs segmentation. This is done in two steps. First a segmentation step +#' that aims to find short segments. These are used to find haplotype blocks that have been +#' switched. These blocks are switched into the correct order first after which the second +#' segmentation step is performed. This second step aims to segment the data that will go into +#' fit.copy.number. This function produces a BAF segmented file with 5 columns: chromosome, position, +#' original BAF, switched BAF and BAF segment. The BAF segment column should be used subsequently +#' @param samplename Name of the sample, which is used to name output figures +#' @param inputfile String that points to the output from the \code{combine.baf.files} function. This contains the phased SNPs with their BAF values +#' @param outputfile String where the segmentation output will be written +#' @param gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. 
It is therefore the key parameter for controlling the number of segments (Default: 10) +#' @param kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3) +#' @param phasegamma Gamma parameter used when correcting phasing mistakes (Default: 3) +#' @param phasekmin Kmin parameter used when correcting phasing mistakes (Default: 3) +#' @author dw9 +#' @export +segment.baf.phased.legacy = function(samplename, inputfile, outputfile, gamma=10, phasegamma=3, kmin=3, phasekmin=3) { + BAFraw = as.data.frame(read_baf(inputfile)) BAFoutput = NULL for (chr in unique(BAFraw[,1])) { @@ -95,16 +116,6 @@ segment.baf.phased = function(samplename, inputfile, outputfile, gamma=10, phase BAFphseg = res$yhat } - # Recalculate the BAF of each segment, if required - if (calc_seg_baf_option==1) { - # Adjust the segment BAF to not take the mean as that is sensitive to improperly phased segments - BAFphseg = adjustSegmValues(data.frame(BAFphased=BAFphased, BAFseg=BAFphseg))$BAFseg - } else if (calc_seg_baf_option==2) { - # Don't do anything, the BAF is already the mean - } else { - warning("Supplied calc_seg_baf_option to segment.baf.phased not valid, using mean BAF by default") - } - png(filename = paste(samplename,"_segment_chr",chr,".png",sep=""), width = 2000, height = 1000, res = 200) create.baf.plot(chrom.position=pos/1000000, points.red.blue=BAF, @@ -126,7 +137,7 @@ segment.baf.phased = function(samplename, inputfile, outputfile, gamma=10, phase write.table(BAFoutput, outputfile, sep="\t", row.names=F, col.names=T, quote=F) } -#' Segment BAF with the inclusion of structural variant breakpoints +#' Segment BAF with the inclusion of structural variant breakpoints - This function is now deprecated, call segment.baf.phased instead #' #' This function takes the SV breakpoints as initial segments and runs PCF on each #' of those independently. The SVs must be supplied as a simple data.frame with columns @@ -134,7 +145,7 @@ segment.baf.phased = function(samplename, inputfile, outputfile, gamma=10, phase #' @param samplename Name of the sample, which is used to name output figures #' @param inputfile String that points to the output from the \code{combine.baf.files} function. This contains the phased SNPs with their BAF values #' @param outputfile String where the segmentation output will be written -#' @param svs Data.frame with chromosome and position columns +#' @param svs Data.frame with chromosome and position columns (Default: NULL) #' @param gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default 10) #' @param kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default 3) #' @param phasegamma Gamma parameter used when correcting phasing mistakes (Default 3) @@ -143,7 +154,28 @@ segment.baf.phased = function(samplename, inputfile, outputfile, gamma=10, phase #' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. 
(Default: 1) #' @author sd11 #' @export -segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=10, phasegamma=3, kmin=3, phasekmin=3, no_segmentation=F, calc_seg_baf_option=1) { +segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs=NULL, gamma=10, phasegamma=3, kmin=3, phasekmin=3, no_segmentation=F, calc_seg_baf_option=1) { + .Deprecated("segment.baf.phased") + print("Stopping now") +} + +#' Segment BAF, with the possible inclusion of structural variant breakpoints +#' +#' This function breaks the genome up into chromosomes, possibly further when SV breakpoints +#' are provided, and runs PCF on each to segment the chromosomes independently. +#' @param samplename Name of the sample, which is used to name output figures +#' @param inputfile String that points to the output from the \code{combine.baf.files} function. This contains the phased SNPs with their BAF values +#' @param outputfile String where the segmentation output will be written +#' @param prior_breakpoints_file String that points to a file with prior breakpoints (from SVs for example) with chromosome and position columns (Default: NULL) +#' @param gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default 10) +#' @param kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default 3) +#' @param phasegamma Gamma parameter used when correcting phasing mistakes (Default 3) +#' @param phasekmin Kmin parameter used when correcting phasing mistakes (Default 3) +#' @param no_segmentation Do not perform segmentation. This step will switch the haplotype blocks, but then just takes the mean BAFphased as BAFsegm +#' @param calc_seg_baf_option Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean, 3 - ifelse median==0 or 1, median, mean. (Default: 3) +#' @author sd11 +#' @export +segment.baf.phased = function(samplename, inputfile, outputfile, prior_breakpoints_file=NULL, gamma=10, phasegamma=3, kmin=3, phasekmin=3, no_segmentation=F, calc_seg_baf_option=3) { # Function that takes SNPs that belong to a single segment and looks for big holes between # each pair of SNPs. 
If there is a big hole it will add another breakpoint to the breakpoints data.frame addin_bigholes = function(breakpoints, positions, chrom, startpos, maxsnpdist) { @@ -162,30 +194,30 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 } # Helper function that creates segment breakpoints from SV calls - # @param svs_chrom Structural variant breakpoints for a single chromosome + # @param bkps_chrom Breakpoints for a single chromosome # @param BAFrawchr Raw BAF values of germline heterozygous SNPs on a single chromosome # @param addin_bigholes Flag whether bog holes in data are to be added as breakpoints # @return A data.frame with chrom, start and end columns # @author sd11 - svs_to_presegment_breakpoints = function(chrom, svs_chrom, BAFrawchr, addin_bigholes) { + bkps_to_presegment_breakpoints = function(chrom, bkps_chrom, BAFrawchr, addin_bigholes) { maxsnpdist = 3000000 - svs_breakpoints = svs_chrom$position + bkps_breakpoints = bkps_chrom$position - # If there are no SVs we cannot insert any breakpoints - if (length(svs_breakpoints) > 0) { + # If there are no prior breakpoints, we cannot insert any + if (length(bkps_breakpoints) > 0) { breakpoints = data.frame() # check which comes first, the breakpoint or the first SNP - if (BAFrawchr$Position[1] < svs_breakpoints[1]) { + if (BAFrawchr$Position[1] < bkps_breakpoints[1]) { startpos = BAFrawchr$Position[1] startfromsv = 1 # We're starting from SNP data, so the first SV should be added first } else { - startpos = svs_breakpoints[1] + startpos = bkps_breakpoints[1] startfromsv = 2 # We've just added the first SV, don't use it again } - for (svposition in svs_breakpoints[startfromsv:length(svs_breakpoints)]) { + for (svposition in bkps_breakpoints[startfromsv:length(bkps_breakpoints)]) { selectedsnps = BAFrawchr$Position >= startpos & BAFrawchr$Position <= svposition if (sum(selectedsnps, na.rm=T) > 0) { @@ -204,13 +236,13 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 } # Add the remainder of the chromosome, if available - if (BAFrawchr$Position[nrow(BAFrawchr)] > svs_breakpoints[length(svs_breakpoints)]) { + if (BAFrawchr$Position[nrow(BAFrawchr)] > bkps_breakpoints[length(bkps_breakpoints)]) { endindex = nrow(BAFrawchr) breakpoints = rbind(breakpoints, data.frame(chrom=chrom, start=startpos, end=BAFrawchr$Position[endindex])) } } else { # There are no SVs, so create one big segment - print("No SVs") + print("No prior breakpoints found") startpos = BAFrawchr$Position[1] breakpoints = data.frame() @@ -275,14 +307,29 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 } if (length(BAF) > 0) { + + # + # Note: When adding options, also add to merge_segments + # + # Recalculate the BAF of each segment, if required if (calc_seg_baf_option==1) { # Adjust the segment BAF to not take the mean as that is sensitive to improperly phased segments BAFphseg = adjustSegmValues(data.frame(BAFphased=BAFphased, BAFseg=BAFphseg))$BAFseg } else if (calc_seg_baf_option==2) { # Don't do anything, the BAF is already the mean + } else if (calc_seg_baf_option==3) { + # Take the median, unless the median is exactly 0 or 1. At the extreme + # there is no difference between lets say 40 and 41 copies and BB cannot + # fit a copy number state. 
The mean is less prone to become exactly 0 or 1 + # but the median is generally a better estimate that is less sensitive to + # how well the haplotypes have been reconstructed + BAFphseg_median = adjustSegmValues(data.frame(BAFphased=BAFphased, BAFseg=BAFphseg))$BAFseg + if (BAFphseg_median!=0 & BAFphseg_median!=1) { + BAFphseg = BAFphseg_median + } } else { - warning("Supplied calc_seg_baf_option to segment.baf.phased.sv not valid, using mean BAF by default") + warning("Supplied calc_seg_baf_option to segment.baf.phased not valid, using mean BAF by default") } } @@ -294,8 +341,8 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 tempBAFsegm=BAFsegm)) # Keep track of BAFsegm for the plot below } - # bafsegments, breakpoints, kmin, gamma_param, samplename, filename_suffix="jabba_aspcf" - BAFraw = read.table(inputfile,sep="\t",header=T, stringsAsFactors=F) + BAFraw = as.data.frame(read_baf(inputfile)) + if (!is.null(prior_breakpoints_file)) { bkps = read.table(prior_breakpoints_file, header=T, stringsAsFactors=F) } else { bkps = NULL } BAFoutput = NULL for (chr in unique(BAFraw[,1])) { @@ -303,9 +350,13 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 BAFrawchr = BAFraw[BAFraw[,1]==chr,c(2,3)] # BAFrawchr = bafsegments[bafsegments$Chromosome==chr, c(2,3)] BAFrawchr = BAFrawchr[!is.na(BAFrawchr[,2]),] - svs_chrom = svs[svs$chromosome==chr,] + if (!is.null(bkps)) { + bkps_chrom = bkps[bkps$chromosome==chr,] + } else { + bkps_chrom = data.frame(chromosome=character(), position=numeric()) + } - breakpoints_chrom = svs_to_presegment_breakpoints(chr, svs_chrom, BAFrawchr, addin_bigholes=T) + breakpoints_chrom = bkps_to_presegment_breakpoints(chr, bkps_chrom, BAFrawchr, addin_bigholes=T) BAFoutputchr = NULL for (r in 1:nrow(breakpoints_chrom)) { @@ -322,7 +373,7 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 title=paste(samplename,", chromosome ", chr, sep=""), xlab="Position (Mb)", ylab="BAF (phased)", - svs_pos=svs_chrom$position/1000000) + prior_bkps_pos=bkps_chrom$position/1000000) dev.off() png(filename = paste(samplename,"_segment_chr",chr,".png",sep=""), width = 2000, height = 1000, res = 200) @@ -336,7 +387,7 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 title=paste(samplename,", chromosome ", chr, sep=""), xlab="Position (Mb)", ylab="BAF (phased)", - svs_pos=svs_chrom$position/1000000) + prior_bkps_pos=bkps_chrom$position/1000000) dev.off() BAFoutputchr$BAFphased = ifelse(BAFoutputchr$tempBAFsegm>0.5, BAFoutputchr$BAF, 1-BAFoutputchr$BAF) diff --git a/R/util.R b/R/util.R index be77422..f9fb169 100644 --- a/R/util.R +++ b/R/util.R @@ -15,7 +15,7 @@ read_table_generic = function(file, header=T, row.names=F, stringsAsFactor=F, se # stringsAsFactor is not needed here, but kept for legacy purposes # Read in first line to obtain the header - d = readr::read_delim(file=file, delim=sep, col_names=header, n_max=1, skip=skip) + d = readr::read_delim(file=file, delim=sep, col_names=header, n_max=1, skip=skip, col_types = readr::cols()) # fetch the name of the first column to set its col_type for reading in the whole file # this is needed as readr does not understand the chromosome column properly @@ -36,6 +36,44 @@ read_table_generic = function(file, header=T, row.names=F, stringsAsFactor=F, se return(d) } +#' Parser for logR data +#' @param filename Filename of the file to read in +#' @param header Whether the file contains a header (Default: TRUE) +#' 
@return A data frame with logR content +read_logr = function(filename, header=T) { + return(readr::read_tsv(file = filename, col_names = header, col_types = "cin")) +} + +#' Parser for BAF data +#' @param filename Filename of the file to read in +#' @param header Whether the file contains a header (Default: TRUE) +#' @return A data frame with BAF content +read_baf = function(filename, header=T) { + return(readr::read_tsv(file = filename, col_names = header, col_types = "cin")) +} + +#' Parser for GC content reference data +#' @param filename Filename of the file to read in +#' @return A data frame with GC content +read_gccontent = function(filename) { + return(readr::read_tsv(file=filename, skip = 1, col_names = F, col_types = "-cinnnnnnnnnnnn------")) +} + +#' Parser for replication timing reference data +#' @param filename Filename of the file to read in +#' @return A data frame with replication timing +read_replication = function(filename) { + return(readr::read_tsv(file=filename, col_types = paste0("ci", paste0(rep("n", 15), collapse = "")))) +} + +#' Parser for BAFsegmented data +#' @param filename Filename of the file to read in +#' @param header Whether the file contains a header (Default: TRUE) +#' @return A data frame with BAFsegmented content +read_bafsegmented = function(filename, header=T) { + return(readr::read_tsv(file = filename, col_names = header, col_types = "cinnn")) +} + ######################################################################################## # Concatenate files ######################################################################################## @@ -222,6 +260,10 @@ cnfit_to_refit_suggestions = function(samplename, subclones_file, rho_psi_file, is_subclonal = subclones$frac1_A < 1 subclones_clonal_cna = subset(subclones, !is_subclonal & subclones$is_cna) subclones_clonal_cna = subclones_clonal_cna[with(subclones_clonal_cna, order(len, decreasing=T)),] + + if (nrow(subclones_clonal_cna)==0) { + output = data.frame(project=NA, samplename=samplename, qc=NA, cellularity_refit=T, chrom=NA, pos=NA, maj=NA, min=NA, baf=NA, logr=NA, rho_estimate=NA, psi_t_estimate=NA, rho_diff=NA, psi_t_diff=NA) + } else { # Generate a couple of solutions, but not more than are possibly available max_solutions = ifelse(nrow(subclones_clonal_cna) >= 5, 5, nrow(subclones_clonal_cna)) @@ -251,7 +293,7 @@ cnfit_to_refit_suggestions = function(samplename, subclones_file, rho_psi_file, output$psi_t_estimate = res$psi_t output$rho_diff = abs(rho-output$rho_estimate) output$psi_t_diff = abs(psi_t-output$psi_t_estimate) - + } } else { # No large clonal alteration, save a suggestion that should use an external purity value output = data.frame(project=NA, samplename=samplename, qc=NA, cellularity_refit=T, chrom=NA, pos=NA, maj=NA, min=NA, baf=NA, logr=NA, rho_estimate=NA, psi_t_estimate=NA, rho_diff=NA, psi_t_diff=NA) diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..e932213 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,3 @@ +.onLoad <- function(libname, pkgname) { + options(scipen = 999) +} diff --git a/README.md b/README.md old mode 100644 new mode 100755 index b6c9d70..366dedf --- a/README.md +++ b/README.md @@ -1,23 +1,17 @@ # Battenberg ------ -## Advised installation and running instructions +This repository contains code for the whole genome sequencing subclonal copy number caller Battenberg, as described in [Nik-Zainal, Van Loo, Wedge, et al. (2012), Cell](https://www.ncbi.nlm.nih.gov/pubmed/22608083). 
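The readr-based parsers added to `R/util.R` above pin their column types instead of letting readr guess them. As a rough illustration of what the `"cin"` specification amounts to (the file names are hypothetical placeholders that follow the pipeline's naming scheme), the equivalent direct readr calls would be:

```
library(readr)

# Equivalent of the new read_baf()/read_logr() helpers in R/util.R:
# col_types = "cin" reads the chromosome as character, the position as integer
# and the BAF/logR value as numeric, so chromosomes such as "X" parse consistently.
# The file names below are placeholders.
baf = read_tsv("SAMPLE_mutantBAF.tab", col_names=T, col_types="cin")
logr = read_tsv("SAMPLE_mutantLogR.tab", col_names=T, col_types="cin")
```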
-Please visit the [cgpBattenberg page](https://github.com/cancerit/cgpBattenberg), download the code from there and run the setup.sh script. The battenberg.pl script can then be used to run the pipeline. +## Installation instructions - -## Advanced installation instructions - -The instructions below will install the latest stable Battenberg version. Please take this approach only when you'd like to do something not covered by cgpBattenberg. - -### Standalone +The instructions below will install the latest stable Battenberg version. #### Prerequisites -Installing from Github requires devtools and Battenberg requires readr, RColorBrewer and ASCAT. The pipeline requires doParallel. From the command line run: +Installing from Github requires devtools and Battenberg requires readr, splines, RColorBrewer and ASCAT, while the pipeline requires parallel and doParallel. From the command line run: ``` -R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));' +R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "splines", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools", "parallel"));' R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' ``` @@ -31,33 +25,103 @@ R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' #### Required reference files -Battenberg requires a number of reference files that should be downloaded. +Battenberg requires reference files, for now for GRCh37 only, that can be downloaded from here: https://ora.ox.ac.uk/objects/uuid:2c1fec09-a504-49ab-9ce9-3f17bac531bc + +The bundle contains the following files: - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_1000genomesloci2012_v3.tar.gz - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v3.tar.gz - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/probloci_270415.txt.gz - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_wgs_gc_correction_1000g_v3.tar.gz - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_exe.tgz (SNP6 only) - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_ref.tgz (SNP6 only) + * battenberg_1000genomesloci2012_v3.tar.gz + * battenberg_impute_1000G_v3.tar.gz + * probloci_270415.txt.gz + * battenberg_wgs_gc_correction_1000g_v3.tar.gz + * battenberg_wgs_replic_correction_1000g_v3.tar.gz + * battenberg_snp6_exe.tgz (SNP6 only) + * battenberg_snp6_ref.tgz (SNP6 only) #### Pipeline -Go into inst/example for example WGS and SNP6 R-only pipelines. - -### Docker - experimental +Go into ```inst/example``` for example WGS and SNP6 R-only pipelines. + +## Description of the output + +### Key output files + +* `[samplename]_subclones.txt` contains the copy number data (see table below) +* `[samplename]_rho_and_psi.txt` contains the purity estimate (make sure to use the FRAC_genome, rho field in the second row, first column) +* `[samplename]_BattenbergProfile*png` shows the profile (the two variants show subclonal copy number in a different way) +* `[samplename]_subclones_chr*.png` show detailed figures of the copy number calls per chromosome +* `[samplename]_distance.png` This shows the purity and ploidy solution space and can be used to pick alternative solutions + +The copy number profile saved in the `[samplename]_subclones.txt` is a tab delimited file in text format. Within this file there is a line for each segment in the tumour genome. 
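As a minimal, hedged sketch of how this file can be inspected in R (the file name is a placeholder; the columns are those documented in the table below):

```
# Read a Battenberg copy number profile; the file name is a placeholder
subclones = read.table("SAMPLE_subclones.txt", header=T, sep="\t", stringsAsFactors=F)

# Segments whose first state is carried by less than 100% of tumour cells are subclonal
is_subclonal = subclones$frac1_A < 1

# Total copy number of the first state of each segment
total_cn_state1 = subclones$nMaj1_A + subclones$nMin1_A

head(subclones[, c("chr", "startpos", "endpos", "nMaj1_A", "nMin1_A", "frac1_A")])
```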
+Each segment will have either one or two copy number states:
+
+* If there is one state that line represents the clonal copy number (i.e. all tumour cells have this state)
+* If there are two states that line represents subclonal copy number (i.e. there are two populations of cells, each with a different state)
+
+A copy number state consists of a major and a minor allele and their frequencies, which together give the total copy number for that segment and an estimated fraction of tumour cells that carry each allele.
+
+The following columns are available in the Battenberg output:
+
+| Column | Description |
+| ------------- | ------------- |
+| chr | The chromosome of the segment |
+| startpos | Start position on the chromosome |
+| endpos | End position on the chromosome |
+| BAF | The B-allele frequency of the segment |
+| pval | P-value that is obtained when testing whether this segment should be represented by one or two states. A low p-value will result in the fitting of a second copy number state |
+| LogR | The log ratio of normalised tumour coverage versus its matched normal sequencing sample |
+| ntot | An internal total copy number value used to determine the priority of solutions. NOTE: This is not the total copy number of this segment! |
+| nMaj1_A | The major allele copy number of state 1 from solution A |
+| nMin1_A | The minor allele copy number of state 1 from solution A |
+| frac1_A | Fraction of tumour cells carrying state 1 in solution A |
+| nMaj2_A | The major allele copy number of state 2 from solution A. This value can be NA |
+| nMin2_A | The minor allele copy number of state 2 from solution A. This value can be NA |
+| frac2_A | Fraction of tumour cells carrying state 2 in solution A. This value can be NA |
+| SDfrac_A | Standard deviation on the BAF of SNPs in this segment, can be used as a measure of uncertainty |
+| SDfrac_A_BS | Bootstrapped standard deviation |
+| frac1_A_0.025 | Associated 95% confidence interval of the bootstrap measure of uncertainty |
+
+This is followed by possible equivalent solutions B to F with the same columns as defined above for solution A (due to the way a profile is fit, Battenberg can generate a series of equivalent solutions that are reported separately in the output).
+
+### Plots for QC
+
+It also produces a number of plots that show the raw data and are useful for QC (their raw data files are denoted by *.tab):
+
+* `[samplename].tumour.png` and `[samplename].germline.png` show the raw BAF and logR
+* `[samplename]_coverage.png` contains coverage divided by the mean coverage of both tumour and normal
+* `[samplename]_alleleratio.png` shows BAF*logR, a rough approximation of what the data looks like shortly before copy number calling
+
+### Intermediate figures
+
+Finally, a range of plots show intermediate steps and can occasionally be useful:
+
+* `[samplename]_chr*_heterozygousData.png` shows reconstructed haplotype blocks in the characteristic Battenberg cake pattern
+* `[samplename]_RAFseg_chr*.png` and `[samplename]_segment_chr*.png` contain segmentation data for step 1 and step 2 respectively
+* `[samplename]_nonroundedprofile.png` shows the copy number profile without rounding to integers
+* `[samplename]_copynumberprofile.png` shows the copy number profile with rounding to integers (including subclonal copy number)
+
+## Advice for including structural variant breakpoints
+
+Battenberg can take prior breakpoints, from structural variants (SVs) for example, as input.
SV breakpoints are typically much more precise and a pair of SVs can be closer together than what can typically be obtained from a BAF or coverage track. It is therefore advantageous to include prior breakpoints in a Battenberg run. However, including too many (as in 100s) incorrect breakpoints can have adverse effects, by allowing many small segments to be affected by noise where there isn't any signal and by increasing the runtime of the pipeline. It is therefore advised to `filter prior breakpoints from SVs such that the genome is slightly oversegmented.` Finally, some SV types, such as inversions, do not constitute a change in copy number and therefore also add breakpoints that should not be considered. It is therefore also advised to `filter breakpoints from SVs that do not cause a change in copy number, such as inversions`.
+
+## Docker - experimental
 
 Battenberg can be run inside a Docker container. Please follow the instructions below.
 
 #### Installation
 
 ```
-wget https://raw.githubusercontent.com/Wedge-Oxford/battenberg/dev/Dockerfile
-docker build -t battenberg:2.2.8 .
+git clone git@github.com:Wedge-Oxford/battenberg.git
+cd battenberg
+docker build -t battenberg:2.2.9 .
 ```
 
-#### Download reference data
+#### Reference data
 
-To do
+First, download the Battenberg reference data from the URL provided further in this README. Then in the ```impute_info.txt``` file, replace the paths to the reference files with ```/opt/battenberg_reference```. I.e. the path to the first legend file should become:
+
+```
+/opt/battenberg_reference/1000genomes_2012_v3_impute/ALL_1000G_phase1integrated_v3_chr1_impute.legend
+```
 
 #### Run interactively
 
 ```
 docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenbe
@@ -70,14 +134,14 @@ docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenbe
 ```
 
 Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar).
``` -R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout +R CMD BATCH '--no-restore-data --no-save --args -t HCC1143 -n HCC1143_BL --nb /mnt/battenberg/HCC1143_BL.bam --tb /mnt/battenberg/HCC1143.bam --sex female -o /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout ``` ### Building a release In RStudio: In the Build tab, click Check Package -Then open the NAMESPACE file and edit: +Then open the ```NAMESPACE``` file and edit: ``` S3method(plot,haplotype.data) diff --git a/inst/example/battenberg_allelecount.R b/inst/example/battenberg_allelecount.R new file mode 100644 index 0000000..7269992 --- /dev/null +++ b/inst/example/battenberg_allelecount.R @@ -0,0 +1,67 @@ +library(Battenberg) +library(optparse) + +option_list = list( + make_option(c("-t", "--tumourname"), type="character", default=NULL, help="Samplename of the tumour", metavar="character"), + make_option(c("-n", "--normalname"), type="character", default=NULL, help="Samplename of the normal", metavar="character"), + make_option(c("--tb"), type="character", default=NULL, help="Tumour BAM file", metavar="character"), + make_option(c("--nb"), type="character", default=NULL, help="Normal BAM file", metavar="character"), + make_option(c("--sex"), type="character", default=NULL, help="Sex of the sample", metavar="character"), + make_option(c("-o", "--output"), type="character", default=NULL, help="Directory where output will be written", metavar="character"), + make_option(c("--cpu"), type="numeric", default=8, help="The number of CPU cores to be used by the pipeline (Default: 8)", metavar="character") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +tumourname = opt$tumourname +normalname = opt$normalname +normalbam = opt$nb +tumourbam = opt$tb +ismale = opt$sex=="male" | opt$sex=="Male" +run_dir = opt$output +nthreads = opt$cpu + +############################################################################### +# 2019-04-29 +# A pure R Battenberg v2.2.9 allele counting pipeline implementation. 
+# sd11 [at] sanger.ac.uk +############################################################################### + +# Reference files +imputeinfofile = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" +g1000allelesprefix = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr" + +# allele counter parameters +min_base_qual = 20 +min_map_qual = 35 +allelecounter_exe = "alleleCounter" + +setwd(run_dir) + +# get all required chromosomes +chrom_names = get.chrom.names(imputeinfofile, ismale) + +# Parallel computing setup +clp = parallel::makeCluster(nthreads) +doParallel::registerDoParallel(clp) + +# run allele counter +foreach::foreach(i=1:length(chrom_names)) %dopar% { + getAlleleCounts(bam.file=tumourbam, + output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""), + g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), + min.base.qual=min_base_qual, + min.map.qual=min_map_qual, + allelecounter.exe=allelecounter_exe) + + getAlleleCounts(bam.file=normalbam, + output.file=paste(normalname,"_alleleFrequencies_chr", i, ".txt", sep=""), + g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), + min.base.qual=min_base_qual, + min.map.qual=min_map_qual, + allelecounter.exe=allelecounter_exe) +} + +# Kill the threads +parallel::stopCluster(clp) diff --git a/inst/example/battenberg_cleanup.sh b/inst/example/battenberg_cleanup.sh new file mode 100644 index 0000000..ea1a0b8 --- /dev/null +++ b/inst/example/battenberg_cleanup.sh @@ -0,0 +1,15 @@ +#!/bin/bash +samplename=$1 + +tar czf ${samplename}_battenberg_segmentation.tar.gz *egmented*txt *_segment_chr*png *RAFseg*png && rm *egmented*txt *_segment_chr*png *RAFseg*png +tar czf ${samplename}_battenberg_haplotyping.tar.gz *heterozygousMutBAFs_haplotyped.txt *_heterozygousData.png *impute_input* *_allHaplotypeInfo.txt && rm *heterozygousMutBAFs_haplotyped.txt *_heterozygousData.png *impute_input* *_allHaplotypeInfo.txt +tar czf ${samplename}_battenberg_alleleCounts.tar.gz *alleleFreq*chr* && rm *alleleFreq*chr* +if `ls | grep "mutant\|germline" | grep txt > /dev/null`; then + # SNP6 + tar czf ${samplename}_battenberg_raw_baf_logr.tar.gz *mutant*txt *germline*txt *mutant*tab *germline*tab *lleleCounts.tab && rm *mutant*txt *germline*txt *mutant*tab *germline*tab *lleleCounts.tab +else + # WGS + tar czf ${samplename}_battenberg_raw_baf_logr.tar.gz *mutant*tab *normal*tab *lleleCounts.tab && rm *mutant*tab *normal*tab *lleleCounts.tab +fi +tar czf ${samplename}_battenberg_other.tar.gz *solution_status.txt *GCwindowCorrelations* *segment_masking_details.txt sample_g.txt *subclones_1.txt *second_distance.png ${samplename}_nonroundedprofile.png ${samplename}_copynumberprofile.png && rm *solution_status.txt *GCwindowCorrelations* *segment_masking_details.txt sample_g.txt *subclones_1.txt *second_distance.png ${samplename}_nonroundedprofile.png ${samplename}_copynumberprofile.png +tar czf ${samplename}_battenberg_copynumber.tar.gz *subclones_chr*png *subclones.txt *_second_*png *rho_and_psi.txt *tumour.png *alleleratio.png *BattenbergProfile*png *coverage.png *distance.png *germline.png *totalcn_chrom_plot.png *cellularity_ploidy.txt *refit_suggestion.txt *Rout && rm *subclones_chr*png *subclones.txt *_second_*png *rho_and_psi.txt *tumour.png *alleleratio.png *BattenbergProfile*png *coverage.png *distance.png *germline.png *totalcn_chrom_plot.png *cellularity_ploidy.txt *refit_suggestion.txt *Rout \ No newline at end of file diff --git 
a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R old mode 100644 new mode 100755 index 8fc2cdc..f4004ce --- a/inst/example/battenberg_wgs.R +++ b/inst/example/battenberg_wgs.R @@ -1,34 +1,49 @@ -args = commandArgs(TRUE) -TUMOURNAME = toString(args[1]) -NORMALNAME = toString(args[2]) -NORMALBAM = toString(args[3]) -TUMOURBAM = toString(args[4]) -IS.MALE = as.logical(args[5]) -RUN_DIR = toString(args[6]) +library(Battenberg) +library(optparse) -SKIP_ALLELECOUNTING = F -SKIP_PREPROCESSING = F -SKIP_PHASING = F +option_list = list( + make_option(c("-t", "--tumourname"), type="character", default=NULL, help="Samplename of the tumour", metavar="character"), + make_option(c("-n", "--normalname"), type="character", default=NULL, help="Samplename of the normal", metavar="character"), + make_option(c("--tb"), type="character", default=NULL, help="Tumour BAM file", metavar="character"), + make_option(c("--nb"), type="character", default=NULL, help="Normal BAM file", metavar="character"), + make_option(c("--sex"), type="character", default=NULL, help="Sex of the sample", metavar="character"), + make_option(c("-o", "--output"), type="character", default=NULL, help="Directory where output will be written", metavar="character"), + make_option(c("--skip_allelecount"), type="logical", default=FALSE, action="store_true", help="Provide when alleles don't have to be counted. This expects allelecount files on disk", metavar="character"), + make_option(c("--skip_preprocessing"), type="logical", default=FALSE, action="store_true", help="Provide when pre-processing has previously completed. This expects the files on disk", metavar="character"), + make_option(c("--skip_phasing"), type="logical", default=FALSE, action="store_true", help="Provide when phasing has previously completed. This expects the files on disk", metavar="character"), + make_option(c("--cpu"), type="numeric", default=8, help="The number of CPU cores to be used by the pipeline (Default: 8)", metavar="character"), + make_option(c("--bp"), type="character", default=NULL, help="Optional two column file (chromosome and position) specifying prior breakpoints to be used during segmentation", metavar="character") +) -library(Battenberg) +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +TUMOURNAME = opt$tumourname +NORMALNAME = opt$normalname +NORMALBAM = opt$nb +TUMOURBAM = opt$tb +IS.MALE = opt$sex=="male" | opt$sex=="Male" +RUN_DIR = opt$output +SKIP_ALLELECOUNTING = opt$skip_allelecount +SKIP_PREPROCESSING = opt$skip_preprocessing +SKIP_PHASING = opt$skip_phasing +NTHREADS = opt$cpu +PRIOR_BREAKPOINTS_FILE = opt$bp ############################################################################### -# 2017-11-05 -# A pure R Battenberg v2.2.8 WGS pipeline implementation. -# sd11@sanger.ac.uk +# 2018-11-01 +# A pure R Battenberg v2.2.9 WGS pipeline implementation. 
+# sd11 [at] sanger.ac.uk ############################################################################### -# Parallelism parameters -NTHREADS = 8 - # General static -IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" -G1000PREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr" -G1000PREFIX_AC = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr" -GCCORRECTPREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_wgs_gc_correction_1000g_v3/1000_genomes_GC_corr_chr_" +IMPUTEINFOFILE = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" +G1000PREFIX = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr" +G1000PREFIX_AC = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr" +GCCORRECTPREFIX = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_wgs_gc_correction_1000g_v3_noNA/1000_genomes_GC_corr_chr_" +REPLICCORRECTPREFIX = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_wgs_replic_correction_1000g_v3/1000_genomes_replication_timing_chr_" IMPUTE_EXE = "impute2" - PLATFORM_GAMMA = 1 PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 @@ -44,11 +59,11 @@ BALANCED_THRESHOLD = 0.51 MIN_NORMAL_DEPTH = 10 MIN_BASE_QUAL = 20 MIN_MAP_QUAL = 35 -CALC_SEG_BAF_OPTION = 1 +CALC_SEG_BAF_OPTION = 3 # WGS specific static ALLELECOUNTER = "alleleCounter" -PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" +PROBLEMLOCI = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" # Change to work directory and load the chromosome information setwd(RUN_DIR) @@ -62,6 +77,7 @@ battenberg(tumourname=TUMOURNAME, g1000prefix=G1000PREFIX, g1000allelesprefix=G1000PREFIX_AC, gccorrectprefix=GCCORRECTPREFIX, + repliccorrectprefix=REPLICCORRECTPREFIX, problemloci=PROBLEMLOCI, data_type="wgs", impute_exe=IMPUTE_EXE, @@ -85,173 +101,5 @@ battenberg(tumourname=TUMOURNAME, calc_seg_baf_option=CALC_SEG_BAF_OPTION, skip_allele_counting=SKIP_ALLELECOUNTING, skip_preprocessing=SKIP_PREPROCESSING, - skip_phasing=SKIP_PHASING) - - - - -# chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE) -# -# # Setup for parallel computing -# clp = makeCluster(NTHREADS) -# registerDoParallel(clp) -# -# # Obtain allele counts for 1000 Genomes locations for both tumour and normal -# foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { -# getAlleleCounts(bam.file=TUMOURBAM, -# output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), -# g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), -# min.base.qual=MIN_BASE_QUAL, -# min.map.qual=MIN_MAP_QUAL, -# allelecounter.exe=ALLELECOUNTER) -# -# getAlleleCounts(bam.file=NORMALBAM, -# output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), -# g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), -# min.base.qual=MIN_BASE_QUAL, -# min.map.qual=MIN_MAP_QUAL, -# allelecounter.exe=ALLELECOUNTER) -# -# } -# # Obtain BAF and LogR from the raw allele counts -# getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), -# 
normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""), -# figuresFile.prefix=paste(TUMOURNAME, "_", sep=''), -# BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), -# BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), -# logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), -# logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), -# combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""), -# chr_names=chrom_names, -# g1000file.prefix=G1000PREFIX, -# minCounts=MIN_NORMAL_DEPTH, -# samplename=TUMOURNAME) -# # Perform GC correction -# gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), -# outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), -# correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""), -# gc_content_file_prefix=GCCORRECTPREFIX, -# chrom_names=chrom_names) -# -# -# # These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster. -# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% { -# print(chrom) -# # Transform the allele counts into something that impute understands -# generate.impute.input.wgs(chrom=chrom, -# tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), -# normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), -# output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), -# imputeinfofile=IMPUTEINFOFILE, -# is.male=IS.MALE, -# problemLociFile=PROBLEMLOCI, -# useLociFile=NA) -# -# # Run impute on the files -# run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), -# outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), -# is.male=IS.MALE, -# imputeinfofile=IMPUTEINFOFILE, -# impute.exe=IMPUTE_EXE, -# region.size=5000000, -# chrom=chrom) -# -# # As impute runs in windows across a chromosome we need to assemble the output -# combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), -# outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), -# is.male=IS.MALE, -# imputeinfofile=IMPUTEINFOFILE, -# region.size=5000000, -# chrom=chrom) -# -# # Transform the impute output into haplotyped BAFs -# GetChromosomeBAFs(chrom=chrom, -# SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), -# haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), -# samplename=TUMOURNAME, -# outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), -# chr_names=chrom_names, -# minCounts=MIN_NORMAL_DEPTH) -# -# # Plot what we have until this point -# plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), -# imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), -# samplename=TUMOURNAME, -# chrom=chrom, -# chr_names=chrom_names) -# -# # Cleanup temp Impute output -# unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) -# } -# -# # Kill the threads as from here its all single core -# stopCluster(clp) -# -# # Combine all the BAF output into a single file -# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), -# 
inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", -# outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), -# no.chrs=length(chrom_names)) -# -# # Segment the phased and haplotyped BAF data -# segment.baf.phased(samplename=TUMOURNAME, -# inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), -# outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), -# gamma=SEGMENTATION_GAMMA, -# phasegamma=PHASING_GAMMA, -# kmin=3, -# phasekmin=1, -# calc_seg_baf_option=CALC_SEG_BAF_OPTION) -# -# # Fit a clonal copy number profile -# fit.copy.number(samplename=TUMOURNAME, -# outputfile.prefix=paste(TUMOURNAME, "_", sep=""), -# inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), -# inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), -# inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), -# dist_choice=CLONALITY_DIST_METRIC, -# ascat_dist_choice=ASCAT_DIST_METRIC, -# min.ploidy=MIN_PLOIDY, -# max.ploidy=MAX_PLOIDY, -# min.rho=MIN_RHO, -# min.goodness=MIN_GOODNESS_OF_FIT, -# uninformative_BAF_threshold=BALANCED_THRESHOLD, -# gamma_param=PLATFORM_GAMMA, -# use_preset_rho_psi=F, -# preset_rho=NA, -# preset_psi=NA, -# read_depth=30) -# -# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state -# callSubclones(sample.name=TUMOURNAME, -# baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), -# logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), -# rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), -# output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), -# output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), -# output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), -# masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""), -# sv_breakpoints_file="NA", -# chr_names=chrom_names, -# gamma=PLATFORM_GAMMA, -# segmentation.gamma=NA, -# siglevel=0.05, -# maxdist=0.01, -# noperms=1000, -# calc_seg_baf_option=CALC_SEG_BAF_OPTION) -# -# # Make some post-hoc plots -# make_posthoc_plots(samplename=TUMOURNAME, -# logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), -# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), -# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), -# bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), -# logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), -# allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep="")) -# -# # Save refit suggestions for a future rerun -# cnfit_to_refit_suggestions(samplename=TUMOURNAME, -# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), -# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), -# gamma_param=PLATFORM_GAMMA) -# + skip_phasing=SKIP_PHASING, + prior_breakpoints_file=PRIOR_BREAKPOINTS_FILE) diff --git a/inst/example/filter_sv_brass.R b/inst/example/filter_sv_brass.R new file mode 100644 index 0000000..f57a946 --- /dev/null +++ b/inst/example/filter_sv_brass.R @@ -0,0 +1,45 @@ + +library(optparse) +library(gtools) + +option_list = list( + make_option(c("-i", "--input"), type="character", default=NULL, help="Input VCF file", metavar="character"), + make_option(c("-o", "--output"), type="character", default=NULL, help="Output VCF file", metavar="character") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +infile = opt$input +outfile = opt$output +genome = opt$genome + +brass = read.table(infile, 
header=F, comment.char="#", stringsAsFactor=F) + +# fetch TRDS entry +trds_data = lapply(brass$V8, function(x) { r=unlist(strsplit(as.character(x), ";")); sel=grepl("TRDS", r); if (any(sel)) { unlist(strsplit(gsub("TRDS=", "", r[sel]), ",")) } else { NULL } }) +brass$tumour_support = unlist(lapply(trds_data, length)) + +brass_filter = brass[brass$tumour_support > 0,] + +# fetch second breakpoint, sometimes it is not mentioned as a first breakpoint +#second_chrpos = unlist(as.list(brass)$ALT)) +second_chrpos = brass_filter$V5 +second_chrpos = unlist(lapply(second_chrpos, function(x) { + if (grepl("\\[", x)) { unlist(strsplit(x, "\\["))[2] + } else { unlist(strsplit(x, "\\]"))[2] }} + )) + +brass_breakpoints = data.frame(chromosome=brass_filter$V1, position=brass_filter$V2, stringsAsFactors=F) +brass_breakpoints = rbind(brass_breakpoints, data.frame(chromosome=unlist(lapply(second_chrpos, function(x) unlist(strsplit(x, ":"))[1])), position=as.numeric(unlist(lapply(second_chrpos, function(x) unlist(strsplit(x, ":"))[2]))), stringsAsFactors=F)) +brass_breakpoints = unique(brass_breakpoints) + +# sort +brass_breakpoints_ordered = df = data.frame(matrix(ncol = 2, nrow = 0)) +colnames(brass_breakpoints_ordered) = c("chromosome", "position") +for (chrom in gtools::mixedsort(unique(brass_breakpoints$chromosome))) { + b_chrom = brass_breakpoints[brass_breakpoints$chromosome==chrom,] + b_chrom = b_chrom[order(b_chrom$position),] + brass_breakpoints_ordered = rbind(brass_breakpoints_ordered, b_chrom) +} +write.table(brass_breakpoints_ordered, file=outfile, quote=F, row.names=F, sep="\t") diff --git a/man/GetChromosomeBAFs.Rd b/man/GetChromosomeBAFs.Rd index c25c752..bc4e0f0 100644 --- a/man/GetChromosomeBAFs.Rd +++ b/man/GetChromosomeBAFs.Rd @@ -4,8 +4,15 @@ \alias{GetChromosomeBAFs} \title{Morphs phased SNPs from WGS input into haplotype blocks} \usage{ -GetChromosomeBAFs(chrom, SNP_file, haplotypeFile, samplename, outfile, - chr_names, minCounts = 1) +GetChromosomeBAFs( + chrom, + SNP_file, + haplotypeFile, + samplename, + outfile, + chr_names, + minCounts = 1 +) } \arguments{ \item{chrom}{The chromosome number for which this function is called.} diff --git a/man/GetChromosomeBAFs_SNP6.Rd b/man/GetChromosomeBAFs_SNP6.Rd index 5db5c98..aaadfa8 100644 --- a/man/GetChromosomeBAFs_SNP6.Rd +++ b/man/GetChromosomeBAFs_SNP6.Rd @@ -4,8 +4,14 @@ \alias{GetChromosomeBAFs_SNP6} \title{Morphs phased SNPs from SNP6 input into haplotype blocks} \usage{ -GetChromosomeBAFs_SNP6(chrom, alleleFreqFile, haplotypeFile, samplename, - outputfile, chr_names) +GetChromosomeBAFs_SNP6( + chrom, + alleleFreqFile, + haplotypeFile, + samplename, + outputfile, + chr_names +) } \arguments{ \item{chrom}{The chromosome number for which this function should run.} diff --git a/man/allele_ratio_plot.Rd b/man/allele_ratio_plot.Rd index f7c768c..37f0bb6 100644 --- a/man/allele_ratio_plot.Rd +++ b/man/allele_ratio_plot.Rd @@ -4,8 +4,14 @@ \alias{allele_ratio_plot} \title{Plot allele ratios from raw segmented data} \usage{ -allele_ratio_plot(samplename, bafsegmented, logrsegmented, outputfile, logr, - max.plot.cn = 5) +allele_ratio_plot( + samplename, + bafsegmented, + logrsegmented, + outputfile, + logr, + max.plot.cn = 5 +) } \arguments{ \item{samplename}{Name of the sample for the plot title} diff --git a/man/battenberg.Rd b/man/battenberg.Rd index a8d75d9..202bac2 100644 --- a/man/battenberg.Rd +++ b/man/battenberg.Rd @@ -4,22 +4,49 @@ \alias{battenberg} \title{Run the Battenberg pipeline} \usage{ -battenberg(tumourname, normalname, 
tumour_data_file, normal_data_file, - imputeinfofile, g1000prefix, problemloci, gccorrectprefix = NA, - g1000allelesprefix = NA, ismale = NA, data_type = "wgs", - impute_exe = "impute2", allelecounter_exe = "alleleCounter", - nthreads = 8, platform_gamma = 1, phasing_gamma = 1, - segmentation_gamma = 10, segmentation_kmin = 3, phasing_kmin = 1, - clonality_dist_metric = 0, ascat_dist_metric = 1, min_ploidy = 1.6, - max_ploidy = 4.8, min_rho = 0.1, min_goodness = 0.63, - uninformative_BAF_threshold = 0.51, min_normal_depth = 10, - min_base_qual = 20, min_map_qual = 35, calc_seg_baf_option = 1, - skip_allele_counting = F, skip_preprocessing = F, skip_phasing = F, +battenberg( + tumourname, + normalname, + tumour_data_file, + normal_data_file, + imputeinfofile, + g1000prefix, + problemloci, + gccorrectprefix = NULL, + repliccorrectprefix = NULL, + g1000allelesprefix = NA, + ismale = NA, + data_type = "wgs", + impute_exe = "impute2", + allelecounter_exe = "alleleCounter", + nthreads = 8, + platform_gamma = 1, + phasing_gamma = 1, + segmentation_gamma = 10, + segmentation_kmin = 3, + phasing_kmin = 1, + clonality_dist_metric = 0, + ascat_dist_metric = 1, + min_ploidy = 1.6, + max_ploidy = 4.8, + min_rho = 0.1, + min_goodness = 0.63, + uninformative_BAF_threshold = 0.51, + min_normal_depth = 10, + min_base_qual = 20, + min_map_qual = 35, + calc_seg_baf_option = 3, + skip_allele_counting = F, + skip_preprocessing = F, + skip_phasing = F, snp6_reference_info_file = NA, apt.probeset.genotype.exe = "apt-probeset-genotype", apt.probeset.summarize.exe = "apt-probeset-summarize", norm.geno.clust.exe = "normalize_affy_geno_cluster.pl", - birdseed_report_file = "birdseed.report.txt", heterozygousFilter = "none") + birdseed_report_file = "birdseed.report.txt", + heterozygousFilter = "none", + prior_breakpoints_file = NULL +) } \arguments{ \item{tumourname}{Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.} @@ -36,7 +63,9 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file, \item{problemloci}{Full path to a problem loci file that contains SNP loci that should be filtered out} -\item{gccorrectprefix}{Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)} +\item{gccorrectprefix}{Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NULL)} + +\item{repliccorrectprefix}{Full prefix path to replication timing files, as part of the Battenberg reference data, not required for SNP6 data (Default: NULL)} \item{g1000allelesprefix}{Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)} @@ -80,7 +109,7 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file, \item{min_map_qual}{Minimum mapping quality required for a read to be counted when allele counting (Default: 35)} -\item{calc_seg_baf_option}{Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1)} +\item{calc_seg_baf_option}{Sets way to calculate BAF per segment: 1=mean, 2=median, 3=ifelse median==0 | 1, mean, median (Default: 3)} \item{skip_allele_counting}{Provide TRUE when allele counting can be skipped (i.e. 
its already done) (Default: FALSE)} @@ -99,6 +128,8 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file, \item{birdseed_report_file}{Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt)} \item{heterozygousFilter}{Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none")} + +\item{prior_breakpoints_file}{A two column file with prior breakpoints to be used during segmentation (Default: NULL)} } \description{ Run the Battenberg pipeline diff --git a/man/calc_psi_t.Rd b/man/calc_psi_t.Rd deleted file mode 100644 index 5b15594..0000000 --- a/man/calc_psi_t.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clonal_ascat.R -\name{calc_psi_t} -\alias{calc_psi_t} -\title{Calculate psi based on a reference segment and its associated logr} -\usage{ -calc_psi_t(total_cn, r, rho, gamma_param) -} -\arguments{ -\item{total_cn}{Integer representing the total clonal copynumber (i.e. nMajor+nMinor)} - -\item{r}{The LogR of the segment with the total_cn copy number} - -\item{rho}{A cellularity estimate} - -\item{gamma_param}{Platform gamma parameter} -} -\description{ -Calculate psi based on a reference segment and its associated logr -} -\author{ -sd11 -} diff --git a/man/callSubclones.Rd b/man/callSubclones.Rd index de8c18f..95e0bff 100644 --- a/man/callSubclones.Rd +++ b/man/callSubclones.Rd @@ -4,11 +4,26 @@ \alias{callSubclones} \title{Fit subclonal copy number} \usage{ -callSubclones(sample.name, baf.segmented.file, logr.file, rho.psi.file, - output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, - masking_output_file, max_allowed_state = 250, sv_breakpoints_file = NULL, - gamma = 1, segmentation.gamma = NA, siglevel = 0.05, maxdist = 0.01, - noperms = 1000, seed = as.integer(Sys.time()), calc_seg_baf_option = 1) +callSubclones( + sample.name, + baf.segmented.file, + logr.file, + rho.psi.file, + output.file, + output.figures.prefix, + output.gw.figures.prefix, + chr_names, + masking_output_file, + max_allowed_state = 250, + prior_breakpoints_file = NULL, + gamma = 1, + segmentation.gamma = NA, + siglevel = 0.05, + maxdist = 0.01, + noperms = 1000, + seed = as.integer(Sys.time()), + calc_seg_baf_option = 3 +) } \arguments{ \item{sample.name}{Name of the sample, used in figures} @@ -31,7 +46,7 @@ callSubclones(sample.name, baf.segmented.file, logr.file, rho.psi.file, \item{max_allowed_state}{The maximum CN state allowed (Default 100)} -\item{sv_breakpoints_file}{A two column file with breakpoints from structural variants. These are used when making the figures} +\item{prior_breakpoints_file}{A two column file with prior breakpoints, possibly from structural variants. This file must contain two columns: chromosome and position. These are used when making the figures} \item{gamma}{Technology specific scaling parameter for LogR (Default 1)} @@ -45,7 +60,7 @@ callSubclones(sample.name, baf.segmented.file, logr.file, rho.psi.file, \item{seed}{Seed to set when performing bootstrapping (Default: Current time)} -\item{calc_seg_baf_option}{Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. (Default: 1)} +\item{calc_seg_baf_option}{Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean, 3 - ifelse median==0|1, mean, median. (Default: 3)} } \description{ This function fits a subclonal copy number profile where a clonal profile is unlikely. 
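To make the `prior_breakpoints_file` format documented above concrete, here is a small hedged sketch that writes the expected two-column (chromosome, position) tab-separated table, matching what `inst/example/filter_sv_brass.R` produces; the breakpoint coordinates below are invented:

```
# Hypothetical prior breakpoints; segment.baf.phased() and callSubclones()
# expect a header line and columns named chromosome and position
bkps = data.frame(chromosome=c("1", "1", "8"),
                  position=c(32100000, 145300000, 48700000))
write.table(bkps, file="prior_breakpoints.txt", quote=F, row.names=F, sep="\t")
```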
diff --git a/man/cel2baf.logr.Rd b/man/cel2baf.logr.Rd index 8bf4f29..1fa085e 100644 --- a/man/cel2baf.logr.Rd +++ b/man/cel2baf.logr.Rd @@ -4,11 +4,15 @@ \alias{cel2baf.logr} \title{Transform cel files into BAF and LogR} \usage{ -cel2baf.logr(normal_cel_file, tumour_cel_file, output_file, +cel2baf.logr( + normal_cel_file, + tumour_cel_file, + output_file, snp6_reference_info_file, apt.probeset.genotype.exe = "apt-probeset-genotype", apt.probeset.summarize.exe = "apt-probeset-summarize", - norm.geno.clust.exe = "normalize_affy_geno_cluster.pl") + norm.geno.clust.exe = "normalize_affy_geno_cluster.pl" +) } \arguments{ \item{normal_cel_file}{String that points to the cel file containing the matched normal data} diff --git a/man/check.imputeinfofile.Rd b/man/check.imputeinfofile.Rd new file mode 100644 index 0000000..18cf0a7 --- /dev/null +++ b/man/check.imputeinfofile.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute.R +\name{check.imputeinfofile} +\alias{check.imputeinfofile} +\title{Check impute info file consistency} +\usage{ +check.imputeinfofile(imputeinfofile, is.male) +} +\arguments{ +\item{imputeinfofile}{Path to the imputeinfofile on disk.} +} +\description{ +Check impute info file consistency +} +\author{ +sd11 +} diff --git a/man/cnfit_to_refit_suggestions.Rd b/man/cnfit_to_refit_suggestions.Rd index a8238b6..9f04b34 100644 --- a/man/cnfit_to_refit_suggestions.Rd +++ b/man/cnfit_to_refit_suggestions.Rd @@ -4,8 +4,13 @@ \alias{cnfit_to_refit_suggestions} \title{Create refit suggestions for a fit copy number profile} \usage{ -cnfit_to_refit_suggestions(samplename, subclones_file, rho_psi_file, - gamma_param, min_segment_size_mb = 2) +cnfit_to_refit_suggestions( + samplename, + subclones_file, + rho_psi_file, + gamma_param, + min_segment_size_mb = 2 +) } \arguments{ \item{samplename}{Samplename for the output file} diff --git a/man/combine.impute.output.Rd b/man/combine.impute.output.Rd index 5e5e115..850e5ab 100644 --- a/man/combine.impute.output.Rd +++ b/man/combine.impute.output.Rd @@ -4,8 +4,14 @@ \alias{combine.impute.output} \title{Concatenate the impute output generated for each of the regions.} \usage{ -combine.impute.output(inputfile.prefix, outputfile, is.male, imputeinfofile, - region.size = 5e+06, chrom = NA) +combine.impute.output( + inputfile.prefix, + outputfile, + is.male, + imputeinfofile, + region.size = 5000000, + chrom = NA +) } \arguments{ \item{inputfile.prefix}{Prefix of the input files (this is typically the outputfile.prefix option supplied when calling run.impute).} diff --git a/man/find_centroid_of_global_minima.Rd b/man/find_centroid_of_global_minima.Rd deleted file mode 100644 index 91a6a1e..0000000 --- a/man/find_centroid_of_global_minima.Rd +++ /dev/null @@ -1,60 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clonal_ascat.R -\name{find_centroid_of_global_minima} -\alias{find_centroid_of_global_minima} -\title{This function is an alternative procedure for finding the optimum (psi, rho) pair. -This function first finds all the find all the global optima, -and then finds the centroid of this set of globla optima. -Then we find the global optimum which is nearest to the centroid. 
-(When the set of global optima is convex, we expect the selected optimum to be at the centroid.)} -\usage{ -find_centroid_of_global_minima(d, ref_seg_matrix, ref_major, ref_minor, s, - dist_choice, minimise, new_bounds, distancepng, gamma_param, siglevel_BAF, - maxdist_BAF, siglevel_LogR, maxdist_LogR, allow100percent, - uninformative_BAF_threshold, read_depth) -} -\arguments{ -\item{d}{A distance matrix} - -\item{ref_seg_matrix}{The corresponding ref seg matrix that belongs to d} - -\item{ref_major}{The corresponding major allele values with d} - -\item{ref_minor}{The corresponding minor allele values with d} - -\item{s}{A segmented BAF/LogR data.frame from \code{get_segment_info}} - -\item{dist_choice}{Some distance metrics require adaptation of the data (i.e. log transform)} - -\item{minimise}{Boolean whether we're minimising or maximising} - -\item{new_bounds}{The rho/psi boundaries between we are searching for a solution. This is a named list with values psi_min, psi_max, rho_min, rho_max} - -\item{distancepng}{String where the sunrise distance plot will be saved} - -\item{gamma_param}{The platform gamma} - -\item{siglevel_BAF}{The level at which BAF becomes significant TODO: this option is no longer used} - -\item{maxdist_BAF}{TODO: this option is no longer used} - -\item{siglevel_LogR}{The p-value at which logR becomes significant when establishing whether a segment should be subclonal} - -\item{maxdist_LogR}{The maximum distance allowed as slack when establishing the significance. This allows for the case when a breakpoint is missed, the segment would then not automatically become subclonal} - -\item{allow100percent}{Boolean whether to allow for a 100"\%" cellularity solution} - -\item{uninformative_BAF_threshold}{The threshold above which BAF becomes uninformative} - -\item{read_depth}{TODO: this option is no longer used} -} -\value{ -A list with fields optima_info_without_ref and optima_info -} -\description{ -This function is an alternative procedure for finding the optimum (psi, rho) pair. -This function first finds all the find all the global optima, -and then finds the centroid of this set of globla optima. -Then we find the global optimum which is nearest to the centroid. -(When the set of global optima is convex, we expect the selected optimum to be at the centroid.) 
-} diff --git a/man/fit.copy.number.Rd b/man/fit.copy.number.Rd index fe48b8e..fd8de33 100644 --- a/man/fit.copy.number.Rd +++ b/man/fit.copy.number.Rd @@ -4,12 +4,26 @@ \alias{fit.copy.number} \title{Fit copy number} \usage{ -fit.copy.number(samplename, outputfile.prefix, inputfile.baf.segmented, - inputfile.baf, inputfile.logr, dist_choice, ascat_dist_choice, - min.ploidy = 1.6, max.ploidy = 4.8, min.rho = 0.1, max.rho = 1, - min.goodness = 63, uninformative_BAF_threshold = 0.51, gamma_param = 1, - use_preset_rho_psi = F, preset_rho = NA, preset_psi = NA, - read_depth = 30) +fit.copy.number( + samplename, + outputfile.prefix, + inputfile.baf.segmented, + inputfile.baf, + inputfile.logr, + dist_choice, + ascat_dist_choice, + min.ploidy = 1.6, + max.ploidy = 4.8, + min.rho = 0.1, + max.rho = 1, + min.goodness = 63, + uninformative_BAF_threshold = 0.51, + gamma_param = 1, + use_preset_rho_psi = F, + preset_rho = NA, + preset_psi = NA, + read_depth = 30 +) } \arguments{ \item{samplename}{Samplename used to name the segmented logr output file} diff --git a/man/gc.correct.Rd b/man/gc.correct.Rd index db07337..a18be45 100644 --- a/man/gc.correct.Rd +++ b/man/gc.correct.Rd @@ -4,10 +4,18 @@ \alias{gc.correct} \title{Correct the LogR estimates for GC content} \usage{ -gc.correct(samplename, infile.logr.baf, outfile.tumor.LogR, outfile.tumor.BAF, - outfile.normal.LogR, outfile.normal.BAF, outfile.probeBAF, - snp6_reference_info_file, chr_names, - birdseed_report_file = "birdseed.report.txt") +gc.correct( + samplename, + infile.logr.baf, + outfile.tumor.LogR, + outfile.tumor.BAF, + outfile.normal.LogR, + outfile.normal.BAF, + outfile.probeBAF, + snp6_reference_info_file, + chr_names, + birdseed_report_file = "birdseed.report.txt" +) } \arguments{ \item{samplename}{Name of the sample to be used to name columns} diff --git a/man/gc.correct.wgs.Rd b/man/gc.correct.wgs.Rd index 3abbe52..9e56f2d 100644 --- a/man/gc.correct.wgs.Rd +++ b/man/gc.correct.wgs.Rd @@ -4,8 +4,15 @@ \alias{gc.correct.wgs} \title{Function to correct LogR for waivyness that correlates with GC content} \usage{ -gc.correct.wgs(Tumour_LogR_file, outfile, correlations_outfile, - gc_content_file_prefix, chrom_names, recalc_corr_afterwards = F) +gc.correct.wgs( + Tumour_LogR_file, + outfile, + correlations_outfile, + gc_content_file_prefix, + replic_timing_file_prefix, + chrom_names, + recalc_corr_afterwards = F +) } \arguments{ \item{Tumour_LogR_file}{String pointing to the tumour LogR output} @@ -18,13 +25,15 @@ gc.correct.wgs(Tumour_LogR_file, outfile, correlations_outfile, found. These files should be split per chromosome and this prefix must contain the full path until chr in its name. 
The .txt extension is automatically added.} +\item{replic_timing_file_prefix}{Like the gc_content_file_prefix, containing replication timing info (supply NULL if no replication timing correction is to be applied)} + \item{chrom_names}{A vector containing chromosome names to be considered} -\item{recalc_corr_afterwards}{Set to TRUE to recalculate correlations with GC content after correction} +\item{recalc_corr_afterwards}{Set to TRUE to recalculate correlations after correction} } \description{ Function to correct LogR for waivyness that correlates with GC content } \author{ -sd11 +jonas demeulemeester, sd11 } diff --git a/man/generate.impute.input.snp6.Rd b/man/generate.impute.input.snp6.Rd index 80a947c..e72abeb 100644 --- a/man/generate.impute.input.snp6.Rd +++ b/man/generate.impute.input.snp6.Rd @@ -4,9 +4,18 @@ \alias{generate.impute.input.snp6} \title{Prepares data for impute} \usage{ -generate.impute.input.snp6(infile.germlineBAF, infile.tumourBAF, outFileStart, - chrom, chr_names, problemLociFile, snp6_reference_info_file, imputeinfofile, - is.male, heterozygousFilter = "none") +generate.impute.input.snp6( + infile.germlineBAF, + infile.tumourBAF, + outFileStart, + chrom, + chr_names, + problemLociFile, + snp6_reference_info_file, + imputeinfofile, + is.male, + heterozygousFilter = "none" +) } \arguments{ \item{infile.germlineBAF}{Germline BAF file generated by \code{cel2baf.logr}} diff --git a/man/generate.impute.input.wgs.Rd b/man/generate.impute.input.wgs.Rd index 1975506..35c8bd9 100644 --- a/man/generate.impute.input.wgs.Rd +++ b/man/generate.impute.input.wgs.Rd @@ -4,9 +4,17 @@ \alias{generate.impute.input.wgs} \title{Prepare data for impute} \usage{ -generate.impute.input.wgs(chrom, tumour.allele.counts.file, - normal.allele.counts.file, output.file, imputeinfofile, is.male, - problemLociFile = NA, useLociFile = NA, heterozygousFilter = 0.1) +generate.impute.input.wgs( + chrom, + tumour.allele.counts.file, + normal.allele.counts.file, + output.file, + imputeinfofile, + is.male, + problemLociFile = NA, + useLociFile = NA, + heterozygousFilter = 0.1 +) } \arguments{ \item{chrom}{The chromosome for which impute input should be generated.} diff --git a/man/getAlleleCounts.Rd b/man/getAlleleCounts.Rd index b00b378..64d961f 100644 --- a/man/getAlleleCounts.Rd +++ b/man/getAlleleCounts.Rd @@ -4,8 +4,14 @@ \alias{getAlleleCounts} \title{Obtain allele counts for 1000 Genomes loci through external program alleleCount} \usage{ -getAlleleCounts(bam.file, output.file, g1000.loci, min.base.qual = 20, - min.map.qual = 35, allelecounter.exe = "alleleCounter") +getAlleleCounts( + bam.file, + output.file, + g1000.loci, + min.base.qual = 20, + min.map.qual = 35, + allelecounter.exe = "alleleCounter" +) } \arguments{ \item{bam.file}{A BAM alignment file on which the counter should be run.} diff --git a/man/getBAFsAndLogRs.Rd b/man/getBAFsAndLogRs.Rd index 1e36064..059ce2c 100644 --- a/man/getBAFsAndLogRs.Rd +++ b/man/getBAFsAndLogRs.Rd @@ -4,10 +4,21 @@ \alias{getBAFsAndLogRs} \title{Obtain BAF and LogR from the allele counts} \usage{ -getBAFsAndLogRs(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, - figuresFile.prefix, BAFnormalFile, BAFmutantFile, logRnormalFile, - logRmutantFile, combinedAlleleCountsFile, chr_names, g1000file.prefix, - minCounts = NA, samplename = "sample1", seed = as.integer(Sys.time())) +getBAFsAndLogRs( + tumourAlleleCountsFile.prefix, + normalAlleleCountsFile.prefix, + figuresFile.prefix, + BAFnormalFile, + BAFmutantFile, + logRnormalFile, + logRmutantFile, 
+ combinedAlleleCountsFile, + chr_names, + g1000file.prefix, + minCounts = NA, + samplename = "sample1", + seed = as.integer(Sys.time()) +) } \arguments{ \item{tumourAlleleCountsFile.prefix}{Prefix of the allele counts files for the tumour.} diff --git a/man/make_posthoc_plots.Rd b/man/make_posthoc_plots.Rd index a04f582..74e950f 100644 --- a/man/make_posthoc_plots.Rd +++ b/man/make_posthoc_plots.Rd @@ -4,8 +4,15 @@ \alias{make_posthoc_plots} \title{Function to make additional figures} \usage{ -make_posthoc_plots(samplename, logr_file, subclones_file, rho_psi_file, - bafsegmented_file, logrsegmented_file, allelecounts_file = NULL) +make_posthoc_plots( + samplename, + logr_file, + subclones_file, + rho_psi_file, + bafsegmented_file, + logrsegmented_file, + allelecounts_file = NULL +) } \arguments{ \item{samplename}{Name of the sample for the plot title} diff --git a/man/plot.haplotype.data.Rd b/man/plot.haplotype.data.Rd index de8d559..f56ae66 100644 --- a/man/plot.haplotype.data.Rd +++ b/man/plot.haplotype.data.Rd @@ -4,8 +4,7 @@ \alias{plot.haplotype.data} \title{Plot haplotyped SNPs} \usage{ -\method{plot}{haplotype.data}(haplotyped.baf.file, imageFileName, samplename, - chrom, chr_names) +\method{plot}{haplotype.data}(haplotyped.baf.file, imageFileName, samplename, chrom, chr_names) } \arguments{ \item{haplotyped.baf.file}{File containing the haplotyped SNP info.} diff --git a/man/prepare_snp6.Rd b/man/prepare_snp6.Rd index dfa23bc..deea287 100644 --- a/man/prepare_snp6.Rd +++ b/man/prepare_snp6.Rd @@ -4,12 +4,17 @@ \alias{prepare_snp6} \title{Prepare SNP6 data for haplotype construction} \usage{ -prepare_snp6(tumour_cel_file, normal_cel_file, tumourname, chrom_names, +prepare_snp6( + tumour_cel_file, + normal_cel_file, + tumourname, + chrom_names, snp6_reference_info_file, apt.probeset.genotype.exe = "apt-probeset-genotype", apt.probeset.summarize.exe = "apt-probeset-summarize", norm.geno.clust.exe = "normalize_affy_geno_cluster.pl", - birdseed_report_file = "birdseed.report.txt") + birdseed_report_file = "birdseed.report.txt" +) } \arguments{ \item{tumour_cel_file}{Full path to a CEL file containing the tumour raw data} diff --git a/man/prepare_wgs.Rd b/man/prepare_wgs.Rd index 638d030..488f0d3 100644 --- a/man/prepare_wgs.Rd +++ b/man/prepare_wgs.Rd @@ -4,9 +4,23 @@ \alias{prepare_wgs} \title{Prepare WGS data for haplotype construction} \usage{ -prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname, - g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual, - allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) +prepare_wgs( + chrom_names, + tumourbam, + normalbam, + tumourname, + normalname, + g1000allelesprefix, + g1000prefix, + gccorrectprefix, + repliccorrectprefix, + min_base_qual, + min_map_qual, + allelecounter_exe, + min_normal_depth, + nthreads, + skip_allele_counting +) } \arguments{ \item{chrom_names}{A vector containing the names of chromosomes to be included} @@ -25,6 +39,8 @@ prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname, \item{gccorrectprefix}{Prefix path to GC content reference data} +\item{repliccorrectprefix}{Prefix path to replication timing reference data (supply NULL if no replication timing correction is to be applied)} + \item{min_base_qual}{Minimum base quality required for a read to be counted} \item{min_map_qual}{Minimum mapping quality required for a read to be counted} diff --git a/man/read_baf.Rd b/man/read_baf.Rd new file mode 100644 index 0000000..bfaa3f8 --- /dev/null +++ 
b/man/read_baf.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{read_baf} +\alias{read_baf} +\title{Parser for BAF data} +\usage{ +read_baf(filename, header = T) +} +\arguments{ +\item{filename}{Filename of the file to read in} + +\item{header}{Whether the file contains a header (Default: TRUE)} +} +\value{ +A data frame with BAF content +} +\description{ +Parser for BAF data +} diff --git a/man/read_bafsegmented.Rd b/man/read_bafsegmented.Rd new file mode 100644 index 0000000..b5c5cd1 --- /dev/null +++ b/man/read_bafsegmented.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{read_bafsegmented} +\alias{read_bafsegmented} +\title{Parser for BAFsegmented data} +\usage{ +read_bafsegmented(filename, header = T) +} +\arguments{ +\item{filename}{Filename of the file to read in} + +\item{header}{Whether the file contains a header (Default: TRUE)} +} +\value{ +A data frame with BAFsegmented content +} +\description{ +Parser for BAFsegmented data +} diff --git a/man/read_gccontent.Rd b/man/read_gccontent.Rd new file mode 100644 index 0000000..d8abb04 --- /dev/null +++ b/man/read_gccontent.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{read_gccontent} +\alias{read_gccontent} +\title{Parser for GC content reference data} +\usage{ +read_gccontent(filename) +} +\arguments{ +\item{filename}{Filename of the file to read in} +} +\value{ +A data frame with GC content +} +\description{ +Parser for GC content reference data +} diff --git a/man/read_logr.Rd b/man/read_logr.Rd new file mode 100644 index 0000000..35407eb --- /dev/null +++ b/man/read_logr.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{read_logr} +\alias{read_logr} +\title{Parser for logR data} +\usage{ +read_logr(filename, header = T) +} +\arguments{ +\item{filename}{Filename of the file to read in} + +\item{header}{Whether the file contains a header (Default: TRUE)} +} +\value{ +A data frame with logR content +} +\description{ +Parser for logR data +} diff --git a/man/read_replication.Rd b/man/read_replication.Rd new file mode 100644 index 0000000..8fb29a5 --- /dev/null +++ b/man/read_replication.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{read_replication} +\alias{read_replication} +\title{Parser for replication timing reference data} +\usage{ +read_replication(filename) +} +\arguments{ +\item{filename}{Filename of the file to read in} +} +\value{ +A data frame with replication timing +} +\description{ +Parser for replication timing reference data +} diff --git a/man/read_table_generic.Rd b/man/read_table_generic.Rd index f193422..8719dd2 100644 --- a/man/read_table_generic.Rd +++ b/man/read_table_generic.Rd @@ -4,8 +4,15 @@ \alias{read_table_generic} \title{Generic reading function using the readr R package, tailored for reading in genomic data} \usage{ -read_table_generic(file, header = T, row.names = F, stringsAsFactor = F, - sep = "\\t", chrom_col = 1, skip = 0) +read_table_generic( + file, + header = T, + row.names = F, + stringsAsFactor = F, + sep = "\\t", + chrom_col = 1, + skip = 0 +) } \arguments{ \item{file}{Filename of the file to read in} diff --git a/man/run.impute.Rd b/man/run.impute.Rd index d3ff2f7..145a61b 100644 --- a/man/run.impute.Rd +++ b/man/run.impute.Rd @@ -4,9 +4,16 @@ \alias{run.impute} 
\title{Run impute on the specified inputfile} \usage{ -run.impute(inputfile, outputfile.prefix, is.male, imputeinfofile, - impute.exe = "impute2", region.size = 5e+06, chrom = NA, - seed = as.integer(Sys.time())) +run.impute( + inputfile, + outputfile.prefix, + is.male, + imputeinfofile, + impute.exe = "impute2", + region.size = 5000000, + chrom = NA, + seed = as.integer(Sys.time()) +) } \arguments{ \item{inputfile}{Full path to a csv file with columns: Physical.Position, Allele.A, Allele.B, allele.frequency, id ,position, a0, a1} diff --git a/man/runASCAT.Rd b/man/runASCAT.Rd deleted file mode 100644 index acbb14a..0000000 --- a/man/runASCAT.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clonal_ascat.R -\name{runASCAT} -\alias{runASCAT} -\title{A modified ASCAT main function to fit Battenberg} -\usage{ -runASCAT(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choice, - distancepng = NA, copynumberprofilespng = NA, nonroundedprofilepng = NA, - cnaStatusFile = "copynumber_solution_status.txt", gamma = 0.55, - allow100percent, reliabilityFile = NA, min.ploidy = 1.6, - max.ploidy = 4.8, min.rho = 0.1, max.rho = 1, min.goodness = 63, - uninformative_BAF_threshold = 0.51, chr.names) -} -\arguments{ -\item{lrr}{(unsegmented) log R, in genomic sequence (all probes), with probe IDs} - -\item{baf}{(unsegmented) B Allele Frequency, in genomic sequence (all probes), with probe IDs} - -\item{lrrsegmented}{log R, segmented, in genomic sequence (all probes), with probe IDs} - -\item{bafsegmented}{B Allele Frequency, segmented, in genomic sequence (only probes heterozygous in germline), with probe IDs} - -\item{chromosomes}{a list containing c vectors, where c is the number of chromosomes and every vector contains all probe numbers per chromosome} - -\item{dist_choice}{The distance metric to be used internally to penalise a copy number solution} - -\item{distancepng}{if NA: distance is plotted, if filename is given, the plot is written to a .png file (Default NA)} - -\item{copynumberprofilespng}{if NA: possible copy number profiles are plotted, if filename is given, the plot is written to a .png file (Default NA)} - -\item{nonroundedprofilepng}{if NA: copy number profile before rounding is plotted (total copy number as well as the copy number of the minor allele), if filename is given, the plot is written to a .png file (Default NA)} - -\item{cnaStatusFile}{File where the copy number profile status is written to. This contains either the message "No suitable copy number solution found" or "X copy number solutions found" (Default copynumber_solution_status.txt)} - -\item{gamma}{technology parameter, compaction of Log R profiles (expected decrease in case of deletion in diploid sample, 100 "\%" aberrant cells; 1 in ideal case, 0.55 of Illumina 109K arrays) (Default 0.55)} - -\item{allow100percent}{A boolean whether to allow a 100"\%" cellularity solution} - -\item{reliabilityFile}{String to where fit reliabilty information should be written. 
This file contains backtransformed BAF and LogR values for segments using the fitted copy number profile (Default NA)} - -\item{min.ploidy}{The minimum ploidy to consider (Default 1.6)} - -\item{max.ploidy}{The maximum ploidy to consider (Default 4.8)} - -\item{min.rho}{The minimum cellularity to consider (Default 0.1)} - -\item{max.rho}{The maximum cellularity to consider (Default 1.0)} - -\item{min.goodness}{The minimum goodness of fit for a solution to have to be considered (Default 63)} - -\item{uninformative_BAF_threshold}{The threshold beyond which BAF becomes uninformative (Default 0.51)} - -\item{chr.names}{A vector with chromosome names used for plotting} -} -\value{ -A list with fields psi, rho and ploidy -} -\description{ -This function returns an initial rho and psi estimate for a clonal copy number fit. It uses an internal distance metric to create a distance matrix. -Using that matrix it will search for a rho and psi combination that yields the least heavy penalty. -} diff --git a/man/run_clonal_ASCAT.Rd b/man/run_clonal_ASCAT.Rd deleted file mode 100644 index c8334e2..0000000 --- a/man/run_clonal_ASCAT.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clonal_ascat.R -\name{run_clonal_ASCAT} -\alias{run_clonal_ASCAT} -\title{ASCAT like function to obtain a clonal copy number profile} -\usage{ -run_clonal_ASCAT(lrr, baf, lrrsegmented, bafsegmented, chromosomes, - segBAF.table, input_optimum_pair, dist_choice, distancepng = NA, - copynumberprofilespng = NA, nonroundedprofilepng = NA, gamma_param, - read_depth, uninformative_BAF_threshold, allow100percent, - reliabilityFile = NA, psi_min_initial = 1, psi_max_initial = 5.4, - rho_min_initial = 0.1, rho_max_initial = 1.05, chr.names) -} -\arguments{ -\item{lrr}{(unsegmented) log R, in genomic sequence (all probes), with probe IDs} - -\item{baf}{(unsegmented) B Allele Frequency, in genomic sequence (all probes), with probe IDs} - -\item{lrrsegmented}{log R, segmented, in genomic sequence (all probes), with probe IDs} - -\item{bafsegmented}{B Allele Frequency, segmented, in genomic sequence (only probes heterozygous in germline), with probe IDs} - -\item{chromosomes}{a list containing c vectors, where c is the number of chromosomes and every vector contains all probe numbers per chromosome} - -\item{segBAF.table}{Segmented BAF data.frame from \code{get_segment_info}} - -\item{input_optimum_pair}{A list containing fields for rho, psi and ploidy, as is output from \code{runASCAT}} - -\item{dist_choice}{The distance metric to be used internally to penalise a copy number solution} - -\item{distancepng}{if NA: distance is plotted, if filename is given, the plot is written to a .png file (Default NA)} - -\item{copynumberprofilespng}{if NA: possible copy number profiles are plotted, if filename is given, the plot is written to a .png file (Default NA)} - -\item{nonroundedprofilepng}{if NA: copy number profile before rounding is plotted (total copy number as well as the copy number of the minor allele), if filename is given, the plot is written to a .png file (Default NA)} - -\item{gamma_param}{technology parameter, compaction of Log R profiles (expected decrease in case of deletion in diploid sample, 100 "\%" aberrant cells; 1 in ideal case, 0.55 of Illumina 109K arrays) (Default 0.55)} - -\item{read_depth}{TODO: unused parameter that should be removed} - -\item{uninformative_BAF_threshold}{The threshold beyond which BAF becomes uninformative} - -\item{allow100percent}{A 
boolean whether to allow a 100"\%" cellularity solution} - -\item{reliabilityFile}{String to where fit reliabilty information should be written. This file contains backtransformed BAF and LogR values for segments using the fitted copy number profile (Default NA)} - -\item{psi_min_initial}{Minimum psi value to be considered (Default: 1.0)} - -\item{psi_max_initial}{Maximum psi value to be considered (Default: 5.4)} - -\item{rho_min_initial}{Minimum rho value to be considered (Default: 0.1)} - -\item{rho_max_initial}{Maximum rho value to be considered (Default: 1.05)} - -\item{chr.names}{A vector with chromosome names used for plotting} -} -\value{ -A list with fields output_optimum_pair, output_optimum_pair_without_ref, distance, distance_without_ref, minimise and is.ref.better -} -\description{ -This function takes an initial optimum rho/psi pair and uses -an internal distance metric to calculate a score for each rho/psi pair allowed. -The solution with the best score is then taken to obtain a global copy number -profile. This function performs both a grid search and tries to find a reference -segment, but the grid search result is always used for now. -} diff --git a/man/run_haplotyping.Rd b/man/run_haplotyping.Rd index 3e727eb..64519c1 100644 --- a/man/run_haplotyping.Rd +++ b/man/run_haplotyping.Rd @@ -4,9 +4,19 @@ \alias{run_haplotyping} \title{Construct haplotypes for a chromosome} \usage{ -run_haplotyping(chrom, tumourname, normalname, ismale, imputeinfofile, - problemloci, impute_exe, min_normal_depth, chrom_names, - snp6_reference_info_file = NA, heterozygousFilter = NA) +run_haplotyping( + chrom, + tumourname, + normalname, + ismale, + imputeinfofile, + problemloci, + impute_exe, + min_normal_depth, + chrom_names, + snp6_reference_info_file = NA, + heterozygousFilter = NA +) } \arguments{ \item{chrom}{The chromosome for which to reconstruct haplotypes} diff --git a/man/segment.baf.phased.Rd b/man/segment.baf.phased.Rd index 4e86f18..fda8c83 100644 --- a/man/segment.baf.phased.Rd +++ b/man/segment.baf.phased.Rd @@ -2,10 +2,20 @@ % Please edit documentation in R/segmentation.R \name{segment.baf.phased} \alias{segment.baf.phased} -\title{Segment the haplotyped and phased data using fastPCF.} +\title{Segment BAF, with the possible inclusion of structural variant breakpoints} \usage{ -segment.baf.phased(samplename, inputfile, outputfile, gamma = 10, - phasegamma = 3, kmin = 3, phasekmin = 3, calc_seg_baf_option = 1) +segment.baf.phased( + samplename, + inputfile, + outputfile, + prior_breakpoints_file = NULL, + gamma = 10, + phasegamma = 3, + kmin = 3, + phasekmin = 3, + no_segmentation = F, + calc_seg_baf_option = 3 +) } \arguments{ \item{samplename}{Name of the sample, which is used to name output figures} @@ -14,24 +24,24 @@ segment.baf.phased(samplename, inputfile, outputfile, gamma = 10, \item{outputfile}{String where the segmentation output will be written} -\item{gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10)} +\item{prior_breakpoints_file}{String that points to a file with prior breakpoints (from SVs for example) with chromosome and position columns (Default: NULL)} -\item{phasegamma}{Gamma parameter used when correcting phasing mistakes (Default: 3)} +\item{gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. 
It is therefore the key parameter for controlling the number of segments (Default 10)} -\item{phasegamma}{Gamma parameter used when correcting phasing mistakes (Default: 3)} +\item{phasegamma}{Gamma parameter used when correcting phasing mistakes (Default 3)} -\item{kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)} +\item{kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default 3)} -\item{phasekmin}{Kmin parameter used when correcting phasing mistakes (Default: 3)} +\item{phasekmin}{Kmin parameter used when correcting phasing mistakes (Default 3)} -\item{calc_seg_baf_option}{Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. (Default: 1)} +\item{no_segmentation}{Do not perform segmentation: the haplotype blocks are still switched, but the mean of BAFphased is then taken as BAFsegm instead of running PCF} + +\item{calc_seg_baf_option}{Option specifying how the BAF of a segment is recalculated. Options are: 1 - median, 2 - mean, 3 - the median if it equals 0 or 1, otherwise the mean. (Default: 3)} } \description{ -This function performs segmentation. This is done in two steps. First a segmentation step -that aims to find short segments. These are used to find haplotype blocks that have been -switched. These blocks are switched into the correct order first after which the second -segmentation step is performed. This second step aims to segment the data that will go into -fit.copy.number. This function produces a BAF segmented file with 5 columns: chromosome, position, -original BAF, switched BAF and BAF segment. The BAF segment column should be used subsequently +This function splits the genome into chromosomes, and further into pieces when SV breakpoints +are provided, then runs PCF on each piece to segment it independently. } \author{ -dw9 +sd11 } diff --git a/man/segment.baf.phased.legacy.Rd b/man/segment.baf.phased.legacy.Rd new file mode 100644 index 0000000..b89d886 --- /dev/null +++ b/man/segment.baf.phased.legacy.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/segmentation.R +\name{segment.baf.phased.legacy} +\alias{segment.baf.phased.legacy} +\title{Segment the haplotyped and phased data using fastPCF. This is the legacy segmentation function as used in the original Battenberg versions} +\usage{ +segment.baf.phased.legacy( + samplename, + inputfile, + outputfile, + gamma = 10, + phasegamma = 3, + kmin = 3, + phasekmin = 3 +) +} +\arguments{ +\item{samplename}{Name of the sample, which is used to name output figures} + +\item{inputfile}{String that points to the output from the \code{combine.baf.files} function. This contains the phased SNPs with their BAF values} + +\item{outputfile}{String where the segmentation output will be written} + +\item{gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10)} + +\item{phasegamma}{Gamma parameter used when correcting phasing mistakes (Default: 3)} + +\item{kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)} + +\item{phasekmin}{Kmin parameter used when correcting phasing mistakes (Default: 3)} + +\item{calc_seg_baf_option}{Various options to recalculate the BAF of a segment. Options are: 1 - median, 2 - mean. 
(Default: 1)} +} +\description{ +This function performs segmentation. This is done in two steps. First a segmentation step +that aims to find short segments. These are used to find haplotype blocks that have been +switched. These blocks are switched into the correct order first, after which the second +segmentation step is performed. This second step aims to segment the data that will go into +fit.copy.number. This function produces a BAF segmented file with 5 columns: chromosome, position, +original BAF, switched BAF and BAF segment. The BAF segment column should be used subsequently +} +\author{ +dw9 +} diff --git a/man/segment.baf.phased.sv.Rd b/man/segment.baf.phased.sv.Rd index 8f5b02c..814dae3 100644 --- a/man/segment.baf.phased.sv.Rd +++ b/man/segment.baf.phased.sv.Rd @@ -2,11 +2,20 @@ % Please edit documentation in R/segmentation.R \name{segment.baf.phased.sv} \alias{segment.baf.phased.sv} -\title{Segment BAF with the inclusion of structural variant breakpoints} +\title{Segment BAF with the inclusion of structural variant breakpoints - This function is now deprecated; call segment.baf.phased instead} \usage{ -segment.baf.phased.sv(samplename, inputfile, outputfile, svs, gamma = 10, - phasegamma = 3, kmin = 3, phasekmin = 3, no_segmentation = F, - calc_seg_baf_option = 1) +segment.baf.phased.sv( + samplename, + inputfile, + outputfile, + svs = NULL, + gamma = 10, + phasegamma = 3, + kmin = 3, + phasekmin = 3, + no_segmentation = F, + calc_seg_baf_option = 1 +) } \arguments{ \item{samplename}{Name of the sample, which is used to name output figures} @@ -15,7 +24,7 @@ segment.baf.phased.sv(samplename, inputfile, outputfile, svs, gamma = 10, \item{outputfile}{String where the segmentation output will be written} -\item{svs}{Data.frame with chromosome and position columns} +\item{svs}{Data.frame with chromosome and position columns (Default: NULL)} \item{gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. 
It is therefore the key parameter for controlling the number of segments (Default 10)} diff --git a/man/squaresplot.Rd b/man/squaresplot.Rd index e1859ae..15b11f8 100644 --- a/man/squaresplot.Rd +++ b/man/squaresplot.Rd @@ -4,8 +4,16 @@ \alias{squaresplot} \title{Plot Battenberg copy number solutions for a segment} \usage{ -squaresplot(tumourname, run_dir, segment_chr, segment_pos, platform_gamma = 1, - pdf = 0, binwidth_baf = 0.25, xylimits = c(-0.2, 5)) +squaresplot( + tumourname, + run_dir, + segment_chr, + segment_pos, + platform_gamma = 1, + pdf = 0, + binwidth_baf = 0.25, + xylimits = c(-0.2, 5) +) } \arguments{ \item{tumourname}{Sample name} diff --git a/man/suggest_refit.Rd b/man/suggest_refit.Rd index 1c76a25..a42e3f0 100644 --- a/man/suggest_refit.Rd +++ b/man/suggest_refit.Rd @@ -4,8 +4,15 @@ \alias{suggest_refit} \title{Calculate refit values from a refit suggestion} \usage{ -suggest_refit(subclones_file, segment_chrom, segment_pos, new_nMaj, new_nMin, - rho, gamma_param) +suggest_refit( + subclones_file, + segment_chrom, + segment_pos, + new_nMaj, + new_nMin, + rho, + gamma_param +) } \arguments{ \item{subclones_file}{A Battenberg subclones.txt file}
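
Editor's note on the segmentation changes above: segment.baf.phased.sv is deprecated in favour of segment.baf.phased, which accepts a prior_breakpoints_file instead of an svs data.frame. The call below is a minimal sketch, assuming the package loads as Battenberg; the sample name and file paths are placeholders (a normal battenberg() run supplies them itself), while the parameter names and defaults follow the updated man pages in this diff.

library(Battenberg)

# Minimal sketch of the updated segmentation call (placeholder file names).
# prior_breakpoints_file must contain chromosome and position columns;
# pass NULL to segment without prior breakpoints.
segment.baf.phased(
  samplename = "tumour_sample",
  inputfile = "tumour_sample_heterozygous_BAFs_phased.txt",  # output of combine.baf.files
  outputfile = "tumour_sample.BAFsegmented.txt",
  prior_breakpoints_file = "tumour_sample_sv_breakpoints.txt",
  gamma = 10,
  phasegamma = 3,
  kmin = 3,
  phasekmin = 3,
  no_segmentation = FALSE,
  calc_seg_baf_option = 3
)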
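
A second note on calc_seg_baf_option: as documented above, 1 uses the median, 2 the mean, and 3 keeps the median when it equals 0 or 1 and otherwise uses the mean. The helper below is not Battenberg's internal code, only a plain R illustration of that mapping for a single segment's BAF values.

# Illustration of the documented calc_seg_baf_option mapping (not package code).
recalc_segment_baf <- function(baf_values, calc_seg_baf_option = 3) {
  med <- median(baf_values)
  if (calc_seg_baf_option == 1) {
    med                       # 1: median
  } else if (calc_seg_baf_option == 2) {
    mean(baf_values)          # 2: mean
  } else {
    # 3: keep the median when it is exactly 0 or 1 (fully homozygous-looking
    # segment), otherwise use the mean
    if (med == 0 || med == 1) med else mean(baf_values)
  }
}

recalc_segment_baf(c(0.52, 0.55, 0.54, 0.53))  # 0.535, via the mean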