From 34f5d972b9fb10500564655bc925a05992010ac4 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sat, 17 Oct 2015 16:10:24 +0100 Subject: [PATCH] Updating docs --- R/interconvertMutationBurdens.R | 4 +++ R/preprocessing.R | 31 ++++++++++++++++++++--- inst/example/preproc_pipeline_simple.R | 20 +++++++++------ man/alleleCount.Rd | 2 +- man/concat_files.Rd | 3 +-- man/dpIn2vcf.Rd | 3 +-- man/getAlleleCounts.Sanger.Rd | 2 +- man/getCountsNormal.Rd | 2 +- man/getCountsTumour.Rd | 2 +- man/mut_cn_phasing.Rd | 4 +-- man/mut_mut_phasing.Rd | 2 +- man/mutationBurdenToMutationCopyNumber.Rd | 2 +- man/mutationCopyNumberToMutationBurden.Rd | 2 +- man/parseFai.Rd | 3 +-- man/parseIgnore.Rd | 3 +-- man/runGetDirichletProcessInfo.Rd | 7 +++-- man/split_by_chrom.Rd | 4 +-- man/vcf2loci.Rd | 2 +- 18 files changed, 61 insertions(+), 37 deletions(-) diff --git a/R/interconvertMutationBurdens.R b/R/interconvertMutationBurdens.R index 85b97eb..399a61a 100755 --- a/R/interconvertMutationBurdens.R +++ b/R/interconvertMutationBurdens.R @@ -1,3 +1,5 @@ +#' Mutation burden to mutation copy number +#' #' Function to convert mutation burdens into mutation copy number #' @param burden A vector containing mutation burdens #' @param totalCopyNumber A vector with total tumour copynumber @@ -12,6 +14,8 @@ mutationBurdenToMutationCopyNumber = function(burden, totalCopyNumber, cellulari return(mutCopyNumber) } +#' Mutation copy number to mutation burden +#' #' Function to convert mutation copy number to mutation burden #' @param copyNumber A vector containing mutation copy number #' @param totalCopyNumber A vector with total tumour copynumber diff --git a/R/preprocessing.R b/R/preprocessing.R index 17f0aa5..b7fe346 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -1,6 +1,8 @@ ALLELECOUNTER = "alleleCounter" LINKAGEPULL = "Linkage_pull.pl" +#' Concatenate split files +#' #' Convenience function to concatenate a series of files specified in a file of file names. #' This function assumes all files have the same layout. #' @param fofn A file of file names to be concatenated @@ -27,7 +29,8 @@ concat_files = function(fofn, inputdir, outfile, haveHeader) { write.table(output, file=outfile, col.names=haveHeader, row.names=F, sep="\t", quote=F) } - +#' Split a file per chromosome +#' #' Convenience function to split an input file per chromosome. All it requires is that #' the infile has as first column chromosome specification. The output files will be named #' outdir/prefixCHROMNUMBERpostfix @@ -55,6 +58,8 @@ split_by_chrom = function(infile, prefix, postfix, outdir, chrom_file) { ############################################ # VCF 2 LOCI ############################################ +#' Parse genome index +#' #' Convenience function that parses a reference genome index as generated #' by samtools index #' @param fai_file The index @@ -67,6 +72,8 @@ parseFai = function(fai_file) { return(fai) } +#' Parse chromosomes to ignore file +#' #' Convenience function that parses an ignore file. This file #' is expected to have a single column with just chromosome names #' @param ignore_file The file specifying to be ignored chromosomes @@ -79,6 +86,8 @@ parseIgnore = function(ignore_file) { return(ign) } +#' Transform vcf to loci file +#' #' Function that dumps the loci of snvs from a series of vcf files into a single loci file #' @param vcf_files A vector of vcf files to be considered #' @param fai_file Reference genome index @@ -110,6 +119,8 @@ vcf2loci = function(vcf_files, fai_file, ign_file, outfile) { ############################################ # Allele counting ############################################ +#' Run alleleCount +#' #' Count the alleles for specified locations in the loci file. Expects alleleCount binary in $PATH #' @param locifile A file with at least chromsome and position columns of the locations to be counted #' @param bam A bam file @@ -161,6 +172,8 @@ formatOutput = function(counts_table, v) { return(output) } +#' Dump allele counts from vcf for normal +#' #' Returns an allele counts table for the normal sample #' @param v The vcf file #' @param centre The sequencing centre of which pipeline the vcf file originates @@ -174,6 +187,8 @@ getCountsNormal = function(v, centre="sanger") { return(getAlleleCounts.Sanger(v, 1)) } +#' Dump allele counts from vcf for tumour +#' #' Returns an allele counts table for the tumour sample #' @param v The vcf file #' @param centre The sequencing centre of which pipeline the vcf file originates @@ -187,6 +202,8 @@ getCountsTumour = function(v, centre="sanger") { return(getAlleleCounts.Sanger(v, 2)) } +#' Dump allele counts from Sanger pipeline vcf +#' #' Helper function that dumps the allele counts from a Sanger pipeline VCF file #' @param v The vcf file #' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 @@ -218,6 +235,8 @@ run_linkage_pull_mut = function(output, loci_file, bam_file, bai_file) { return(count.data) } +#' Phase mutation to mutation +#' #' Run mutation to mutation phasing. This function requires the Linkage_pull.pl script in $PATH. #' @param loci_file A list of loci #' @param phased_file File to save the output @@ -311,6 +330,8 @@ run_linkage_pull_snp = function(loci_file, bam_file, bai_file, chr, pos1, ref1, return(linked.muts) } +#' Phase mutation to SNP/copy number +#' #' Run mutation to copy number phasing. This function requires the Linkage_pull.pl script in $PATH. #' Note: This function should either be run separately per chromosome and then combined with \code{\link{concat_files}} #' or on all chromsomes in one go, but then the _allHaplotypeInfo.txt Battenberg files need to be concatenated first. @@ -626,6 +647,8 @@ GetWTandMutCount <- function(loci_file, allele_frequencies_file) { ############################################## # GetDirichletProcessInfo ############################################## +#' Create the DPClust input file +#' #' Function that takes allele counts and a copy number profile to estimate mutation copy number, #' cancer cell fraction and multiplicity for each point mutation. #' @param loci_file Simple four column file with chromosome, position, reference allele and alternative allele @@ -633,8 +656,8 @@ GetWTandMutCount <- function(loci_file, allele_frequencies_file) { #' @param cellularity_file Full path to a Battenberg rho_and_psi output file #' @param subclone_file Full path to a Battenberg subclones.txt output file #' @param gender Specify male or female -#' @param SNP.phase.file Output file from mut_mut_phasing -#' @param mut.phase.file Output file from mut_cn_phasing +#' @param SNP.phase.file Output file from mut_mut_phasing, supply NA (as char) when not available +#' @param mut.phase.file Output file from mut_cn_phasing, supply NA (as char) when not available #' @param output_file Name of the output file #' @author sd11 #' @export @@ -654,6 +677,8 @@ runGetDirichletProcessInfo = function(loci_file, allele_frequencies_file, cellul ############################################## # dpIn to VCF ############################################## +#' DPClust input file to vcf +#' #' Transform a dirichlet input file into a VCF with the same info. It filters out mutations in areas that are not contained in the supplied genome index (fai file) or are contained in the ignore file (ign file) #' It takes the DP input file created by runGetDirichletProcessInfo and combines the columns with the vcf file supplied. Finally it gzips and indexes the file #' @param vcf_infile Filename of the VCF file to use as a base diff --git a/inst/example/preproc_pipeline_simple.R b/inst/example/preproc_pipeline_simple.R index 4f97e24..eb2078f 100644 --- a/inst/example/preproc_pipeline_simple.R +++ b/inst/example/preproc_pipeline_simple.R @@ -15,19 +15,23 @@ vcf_file = toString(args[3]) # Full path to the vcf file with SNV calls. All cal rho_and_psi_file = toString(args[4]) # Full path to a rho_and_psi output file from Battenberg subclones_file = toString(args[5]) # Full path to the subclones output file from Battenberg sex = toString(args[6]) # Specify male or female -output_dir = toString(args[7]) # Full path to where the output must be written +output_dir = toString(args[7]) # Full path to where the output should be written fai_file = toString(args[8]) # Full path to the reference genome index file used for this sample ign_file = toString(args[9]) # Full path to simple list of chromosome names to ignore (must contain at least Y and MT) -# Define various files +library(dpclust3p) + +# Define the final output file +dpoutput_file = paste(output_dir, "/", samplename, "_allDirichletProcessInfo.txt", sep="") + +# Define various temp files loci_file = paste(output_dir, "/", samplename, "_loci.txt", sep="") allelecounts_file = paste(output_dir, "/", samplename, "_alleleFrequencies.txt", sep="") -dpoutput_file = paste(output_dir, "/", samplename, "_allDirichletProcessInfo.txt", sep="") # Dump loci - this function can take multiple vcf files when multiple samples from same donor -vcf2loci(vcf_file=vcf_file, fai_file=fai_file, ign_file=ign_file, loci_file=loci_file) +vcf2loci(vcf_file=vcf_file, fai_file=fai_file, ign_file=ign_file, outfile=loci_file) -# Count alleles +# Fetch allele counts alleleCount(locifile=loci_file, bam=bam_file, outfile=allelecounts_file, min_baq=20, min_maq=35) # Create dpIn file @@ -36,6 +40,6 @@ runGetDirichletProcessInfo(loci_file=loci_file, cellularity_file=rho_and_psi_file, subclone_file=subclones_file, gender=sex, - SNP.phase.file=NA, - mut.phase.file=NA, - output_file=dpoutput_file) \ No newline at end of file + SNP.phase.file="NA", + mut.phase.file="NA", + output_file=dpoutput_file) diff --git a/man/alleleCount.Rd b/man/alleleCount.Rd index 455c74f..bf4ab48 100644 --- a/man/alleleCount.Rd +++ b/man/alleleCount.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{alleleCount} \alias{alleleCount} -\title{Count the alleles for specified locations in the loci file. Expects alleleCount binary in $PATH} +\title{Run alleleCount} \usage{ alleleCount(locifile, bam, outfile, min_baq = 20, min_maq = 35) } diff --git a/man/concat_files.Rd b/man/concat_files.Rd index 5bb9eff..64dd53a 100644 --- a/man/concat_files.Rd +++ b/man/concat_files.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{concat_files} \alias{concat_files} -\title{Convenience function to concatenate a series of files specified in a file of file names. -This function assumes all files have the same layout.} +\title{Concatenate split files} \usage{ concat_files(fofn, inputdir, outfile, haveHeader) } diff --git a/man/dpIn2vcf.Rd b/man/dpIn2vcf.Rd index 08db5cd..b4854a1 100644 --- a/man/dpIn2vcf.Rd +++ b/man/dpIn2vcf.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{dpIn2vcf} \alias{dpIn2vcf} -\title{Transform a dirichlet input file into a VCF with the same info. It filters out mutations in areas that are not contained in the supplied genome index (fai file) or are contained in the ignore file (ign file) -It takes the DP input file created by runGetDirichletProcessInfo and combines the columns with the vcf file supplied. Finally it gzips and indexes the file} +\title{DPClust input file to vcf} \usage{ dpIn2vcf(vcf_infile, dpIn_file, vcf_outfile, fai_file, ign_file, genome = "hg19") diff --git a/man/getAlleleCounts.Sanger.Rd b/man/getAlleleCounts.Sanger.Rd index e13c7bc..0b023e1 100644 --- a/man/getAlleleCounts.Sanger.Rd +++ b/man/getAlleleCounts.Sanger.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{getAlleleCounts.Sanger} \alias{getAlleleCounts.Sanger} -\title{Helper function that dumps the allele counts from a Sanger pipeline VCF file} +\title{Dump allele counts from Sanger pipeline vcf} \usage{ getAlleleCounts.Sanger(v, sample_col) } diff --git a/man/getCountsNormal.Rd b/man/getCountsNormal.Rd index bd83936..5608d22 100644 --- a/man/getCountsNormal.Rd +++ b/man/getCountsNormal.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{getCountsNormal} \alias{getCountsNormal} -\title{Returns an allele counts table for the normal sample} +\title{Dump allele counts from vcf for normal} \usage{ getCountsNormal(v, centre = "sanger") } diff --git a/man/getCountsTumour.Rd b/man/getCountsTumour.Rd index d8cbc5e..9835e9e 100644 --- a/man/getCountsTumour.Rd +++ b/man/getCountsTumour.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{getCountsTumour} \alias{getCountsTumour} -\title{Returns an allele counts table for the tumour sample} +\title{Dump allele counts from vcf for tumour} \usage{ getCountsTumour(v, centre = "sanger") } diff --git a/man/mut_cn_phasing.Rd b/man/mut_cn_phasing.Rd index ceb9c9f..c8d56ee 100644 --- a/man/mut_cn_phasing.Rd +++ b/man/mut_cn_phasing.Rd @@ -2,9 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{mut_cn_phasing} \alias{mut_cn_phasing} -\title{Run mutation to copy number phasing. This function requires the Linkage_pull.pl script in $PATH. -Note: This function should either be run separately per chromosome and then combined with \code{\link{concat_files}} -or on all chromsomes in one go, but then the _allHaplotypeInfo.txt Battenberg files need to be concatenated first.} +\title{Phase mutation to SNP/copy number} \usage{ mut_cn_phasing(loci_file, phased_file, hap_file, bam_file, bai_file, outfile, max_distance) diff --git a/man/mut_mut_phasing.Rd b/man/mut_mut_phasing.Rd index 09bf1df..84c1051 100644 --- a/man/mut_mut_phasing.Rd +++ b/man/mut_mut_phasing.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{mut_mut_phasing} \alias{mut_mut_phasing} -\title{Run mutation to mutation phasing. This function requires the Linkage_pull.pl script in $PATH.} +\title{Phase mutation to mutation} \usage{ mut_mut_phasing(loci_file, phased_file, bam_file, bai_file, max_distance) } diff --git a/man/mutationBurdenToMutationCopyNumber.Rd b/man/mutationBurdenToMutationCopyNumber.Rd index 5d836ca..976ea46 100644 --- a/man/mutationBurdenToMutationCopyNumber.Rd +++ b/man/mutationBurdenToMutationCopyNumber.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/interconvertMutationBurdens.R \name{mutationBurdenToMutationCopyNumber} \alias{mutationBurdenToMutationCopyNumber} -\title{Function to convert mutation burdens into mutation copy number} +\title{Mutation burden to mutation copy number} \usage{ mutationBurdenToMutationCopyNumber(burden, totalCopyNumber, cellularity, normalCopyNumber = rep(2, length(burden))) diff --git a/man/mutationCopyNumberToMutationBurden.Rd b/man/mutationCopyNumberToMutationBurden.Rd index 898462b..15daeb0 100644 --- a/man/mutationCopyNumberToMutationBurden.Rd +++ b/man/mutationCopyNumberToMutationBurden.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/interconvertMutationBurdens.R \name{mutationCopyNumberToMutationBurden} \alias{mutationCopyNumberToMutationBurden} -\title{Function to convert mutation copy number to mutation burden} +\title{Mutation copy number to mutation burden} \usage{ mutationCopyNumberToMutationBurden(copyNumber, totalCopyNumber, cellularity, normalCopyNumber = rep(2, length(copyNumber))) diff --git a/man/parseFai.Rd b/man/parseFai.Rd index 853b0af..86e5c0a 100644 --- a/man/parseFai.Rd +++ b/man/parseFai.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{parseFai} \alias{parseFai} -\title{Convenience function that parses a reference genome index as generated -by samtools index} +\title{Parse genome index} \usage{ parseFai(fai_file) } diff --git a/man/parseIgnore.Rd b/man/parseIgnore.Rd index ff05e16..4356b89 100644 --- a/man/parseIgnore.Rd +++ b/man/parseIgnore.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{parseIgnore} \alias{parseIgnore} -\title{Convenience function that parses an ignore file. This file -is expected to have a single column with just chromosome names} +\title{Parse chromosomes to ignore file} \usage{ parseIgnore(ignore_file) } diff --git a/man/runGetDirichletProcessInfo.Rd b/man/runGetDirichletProcessInfo.Rd index eeb5bcb..5eae17f 100644 --- a/man/runGetDirichletProcessInfo.Rd +++ b/man/runGetDirichletProcessInfo.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{runGetDirichletProcessInfo} \alias{runGetDirichletProcessInfo} -\title{Function that takes allele counts and a copy number profile to estimate mutation copy number, -cancer cell fraction and multiplicity for each point mutation.} +\title{Create the DPClust input file} \usage{ runGetDirichletProcessInfo(loci_file, allele_frequencies_file, cellularity_file, subclone_file, gender, SNP.phase.file, mut.phase.file, output_file) @@ -19,9 +18,9 @@ runGetDirichletProcessInfo(loci_file, allele_frequencies_file, cellularity_file, \item{gender}{Specify male or female} -\item{SNP.phase.file}{Output file from mut_mut_phasing} +\item{SNP.phase.file}{Output file from mut_mut_phasing, supply NA (as char) when not available} -\item{mut.phase.file}{Output file from mut_cn_phasing} +\item{mut.phase.file}{Output file from mut_cn_phasing, supply NA (as char) when not available} \item{output_file}{Name of the output file} } diff --git a/man/split_by_chrom.Rd b/man/split_by_chrom.Rd index 6463e53..788521f 100644 --- a/man/split_by_chrom.Rd +++ b/man/split_by_chrom.Rd @@ -2,9 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{split_by_chrom} \alias{split_by_chrom} -\title{Convenience function to split an input file per chromosome. All it requires is that -the infile has as first column chromosome specification. The output files will be named -outdir/prefixCHROMNUMBERpostfix} +\title{Split a file per chromosome} \usage{ split_by_chrom(infile, prefix, postfix, outdir, chrom_file) } diff --git a/man/vcf2loci.Rd b/man/vcf2loci.Rd index c863877..5d3d141 100644 --- a/man/vcf2loci.Rd +++ b/man/vcf2loci.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/preprocessing.R \name{vcf2loci} \alias{vcf2loci} -\title{Function that dumps the loci of snvs from a series of vcf files into a single loci file} +\title{Transform vcf to loci file} \usage{ vcf2loci(vcf_files, fai_file, ign_file, outfile) }