diff --git a/README.md b/README.md
index f8d779b..6058b89 100644
--- a/README.md
+++ b/README.md
@@ -99,11 +99,16 @@ After installing, SAVANA can be run on long-read data with a minimum set of arguments:

```
savana --tumour <tumour.bam> --normal <normal.bam> --outdir <outdir> --ref <ref.fasta>
```

-This will call somatic SVs. We recommend running with the --contigs argument (`--contigs example/contigs.chr.hg38.txt`) to only examine chromosomes of interest. To compute copy number aberrations, you must provide a phased VCF for the germline sample (generated using whatshap - see [Generating Phased VCF](#generating-phased-vcf)). Then, to call both SVs and CNAs you can run savana with:
+This will call somatic SVs. We recommend running with the --contigs argument (`--contigs example/contigs.chr.hg38.txt`) to only examine chromosomes of interest. To compute copy number aberrations, you must provide a phased VCF for the germline sample (see [Generating Phased VCF](#generating-phased-vcf)). Then, to call both SVs and CNAs, you can run SAVANA with:
```
savana --tumour <tumour.bam> --normal <normal.bam> --outdir <outdir> --ref <ref.fasta> --phased_vcf <phased.vcf> --blacklist <blacklist.bed>
```
-> Note, that if you do not want to use a blacklist to compute copy number aberrations, you will have to specify the `--no_blacklist` flag instead.
+Note that if you do not want to use a blacklist to compute copy number aberrations, you must specify the `--no_blacklist` flag instead.
+Additionally, if you have already generated the heterozygous SNP allele counts using the above command, you can skip this step by providing that file instead of the phased VCF via the `--allele_counts_het_snps` flag.
+An example command to run SAVANA with both of the above:
+```
+savana --tumour <tumour.bam> --normal <normal.bam> --outdir <outdir> --ref <ref.fasta> --allele_counts_het_snps <allele_counts.bed> --no_blacklist
+```

### Quickstart
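(Aside on `--contigs` above: judging by the example path `example/contigs.chr.hg38.txt`, the contigs file appears to be a plain-text list of contig names, one per line. A minimal sketch, to be confirmed against the file shipped in the repository:)

```
chr1
chr2
...
chr22
chrX
chrY
```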
@@ -142,7 +147,7 @@ Argument|Description
--cna_rescue| Copy number aberration output file for this sample (used to rescue variants)
--cna_rescue_distance| Maximum distance from a copy number aberration for a variant to be rescued by it
--threads| Number of threads to use (default is maximum available)
---cna_threads| Number of threads to use for CNA calling (default is to use `--threads`)
+--cna_threads| Number of threads to use for CNA calling (default is maximum available)
--ref_index| Full path to reference genome fasta index (ref path + ".fai" used by default)
--single_bnd| Report single breakend variants in addition to standard types (False by default)
--single_bnd_min_length| Minimum length of single breakend to consider (default=100)
@@ -155,20 +160,22 @@ Argument|Description
--coverage_binsize | Length used for coverage bins (default=5)
--chunksize | Chunksize to use when splitting genome for parallel analysis (default=1000000)
*CNA Algorithm Arguments*
+--tmpdir| Temp directory for allele counting temp files (default: `tmp` subdirectory within outdir)
--allele_counts_het_snps| If allele counting has already been performed, provide the path to the allele counts of heterozygous SNPs to skip this step
---allele_mapq| Mapping quality threshold for reads to be included in the allele counting (default = 0)
+--allele_mapq| Mapping quality threshold for reads to be included in the allele counting (default = 5)
--allele_min_reads| Minimum number of reads required per het SNP site for allele counting (default = 10)
+--ac_window| Window size for allele counting to parallelise (default = 1000000)
--cn_binsize| Bin window size in kbp (default=10)
--blacklist| Path to the blacklist file
--breakpoints| Path to SAVANA VCF file to incorporate SAVANA breakpoints into copy number analysis
---chromosomes| Chromosomes to analyse. To run on all chromosomes leave unspecified (default). To run on a subset of chromosomes only specify the chromosome numbers separated by spaces. For x and y chromosomes, use 23 and 24, respectively. E.g. use "-c 1 4 23 24" to run chromosomes 1, 4, X and Y
+--chromosomes| Chromosomes to analyse. To run on all chromosomes, leave unspecified (default). To run on a subset of chromosomes only, specify the chromosome numbers separated by spaces. For X and Y chromosomes, use 23 and 24, respectively. E.g. use "--chromosomes 1 4 23 24" to run chromosomes 1, 4, X and Y
--readcount_mapq| Mapping quality threshold for reads to be included in the read counting (default = 5)
--no_blacklist| Don't use a blacklist
--bl_threshold| Percentage overlap between bin and blacklist threshold to tolerate for read counting (default = 5). Please specify the percentage threshold as an integer, e.g. "-blt 5". Set "-blt 0" if no overlap with the blacklist is to be tolerated
--no_basesfilter| Do not filter bases
--bases_threshold| Percentage of known bases per bin required for read counting (default = 75). Please specify the percentage threshold as an integer, e.g. "-bt 95"
---smoothing_level| Size of neighbourhood for smoothing
---trim| Trimming percentage to be used
+--smoothing_level| Size of neighbourhood for smoothing (default = 10)
+--trim| Trimming percentage to be used (default = 0.025)
--min_segment_size| Minimum size for a segment to be considered a segment (default = 5)
--shuffles| Number of permutations (shuffles) to be performed during CBS (default = 1000)
--p_seg| p-value used to test the segmentation statistic for a given interval during CBS using (shuffles) permutations (default = 0.05)
@@ -181,12 +188,14 @@ Argument|Description
--max_cellularity| Maximum cellularity to be considered for copy number fitting. If hetSNP allele counts are provided, this is estimated during copy number fitting. Alternatively, a purity value can be provided if the purity of the sample is already known
--cellularity_step| Cellularity step size for the grid search space used during copy number fitting (default = 0.01)
--cellularity_buffer| Cellularity buffer to define the purity grid search space during copy number fitting (default = 0.1)
+--overrule_cellularity| Set to the sample's purity if known. This value will overrule the cellularity estimated using hetSNP allele counts (not used by default)
--distance_function| Distance function to be used for copy number fitting. choices=[RMSD, MAD] (default = RMSD)
--distance_filter_scale_factor| Distance filter scale factor to only include solutions with distances < scale factor * min(distance)
--distance_precision| Number of digits to round distance functions to (default = 3)
--max_proportion_zero| Maximum proportion of fitted copy numbers to be tolerated in the zero or negative copy number state (default = 0.1)
--min_proportion_close_to_whole_number| Minimum proportion of fitted copy numbers sufficiently close to a whole number to be tolerated for a given fit (default = 0.5)
--max_distance_from_whole_number| Distance from a whole number for a fitted value to be considered sufficiently close to the nearest copy number integer (default = 0.25)
+--main_cn_step_change| Maximum main copy number step change across the genome to be tolerated for a given solution (default = 1)
--min_ps_size| Minimum size (number of SNPs) for a phaseset to be considered for purity estimation (default = 10)
--min_ps_length| Minimum length (bps) for a phaseset to be considered for purity estimation (default = 500000)
*Additional Arguments*
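(The fitting criteria above gain `--main_cn_step_change`. A minimal sketch of the acceptance check it controls, mirroring the updated `is_acceptable_fit` in `savana/cn_functions.py` further down this diff; the function name and the single-state guard are illustrative additions:)

```python
import statistics

def passes_main_cn_step_check(fitted_integer_cns, main_cn_step_change=1):
    # Reject fits where the two most common integer copy number states sit
    # further apart than main_cn_step_change, a sign of copy number state
    # skipping caused by oversegmentation.
    most_common = statistics.mode(fitted_integer_cns)
    others = [x for x in fitted_integer_cns if x != most_common]
    if not others:
        return True  # a single CN state genome-wide trivially passes
    second_most_common = statistics.mode(others)
    return abs(most_common - second_most_common) <= main_cn_step_change
```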
@@ -271,7 +280,7 @@ To enable the examination of inserted sequence, `{sample}.inserted_sequences.fa`

### Output Files CNA Algorithm

#### Raw read counts TSV
-`{sample}_{cn_binsize}_read_counts.tsv` contains all raw and unfiltered read counts for each bin across the reference genome for the tumour and matched normal sample. In addition, SAVANA also outputs other intermediate files during copy number processing, including the filtered read counts (`{sample}_{cn_binsize}_read_counts_filtered.tsv`) and the, if provided, matched-normal normalised log2 transformed read counts (`{sample}_{cn_binsize}_read_counts_mnorm_log2r.tsv`).
+`{sample}_{cn_binsize}_raw_read_counts.tsv` contains all raw and unfiltered read counts for each bin across the reference genome for the tumour and matched normal sample. In addition, SAVANA also outputs other intermediate files during copy number processing, including the filtered and matched-normal normalised log2 transformed read counts (`{sample}_{cn_binsize}_read_counts_mnorm_log2r.tsv`).

#### Segmented log2r relative copy number TSV
`{sample}_{cn_binsize}_read_counts_mnorm_log2r_segmented.tsv` contains the final relative copy number (log2r) data post CBS segmentation. This includes the log2r relative copy number for each bin across the reference genome, as well as the segment IDs and corresponding segmented log2r relative copy number values.

@@ -285,7 +294,7 @@ The final and main SAVANA CNA output file is `{sample}_{cn_binsize}_segmented_ab

## Phasing Information

### Generating Phased VCF
-We recommend using [WhatsHap](https://whatshap.readthedocs.io/en/stable/index.html) to generate phased VCF from matched normal samples. As an example, WhatsHap can be run using the following command:
+We recommend using [LongPhase](https://github.com/twolinin/longphase) or [WhatsHap](https://whatshap.readthedocs.io/en/stable/index.html) to generate a phased VCF from matched normal samples. As an example, WhatsHap can be run using the following command:

```
whatshap phase --ignore-read-groups -o <phased.vcf> --reference=<ref.fasta> <germline_snps.vcf> <normal.bam>
```

Germline SNPs (`<germline_snps.vcf>`) can, for example, be obtained using [Clair3](https://github.com/HKU-BAL/Clair3).

### Generating Phased BAMs

-Again, we recommend using [WhatsHap](https://whatshap.readthedocs.io/en/stable/index.html) for tagging sequencing reads by haplotype to generate phased BAMs (see example code below) using the `<phased.vcf>` obtained in the previous step.
+Again, we recommend using [LongPhase](https://github.com/twolinin/longphase) or [WhatsHap](https://whatshap.readthedocs.io/en/stable/index.html) for tagging sequencing reads by haplotype to generate phased BAMs (see example code below) using the `<phased.vcf>` obtained in the previous step.

```
whatshap haplotag --ignore-read-groups -o <phased.bam> --reference <ref.fasta> <phased.vcf> <input.bam> && samtools index <phased.bam>
```
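(For reuse via `--allele_counts_het_snps`: per `process_allele_counts` in the `savana/allele_counter.py` changes below, the `{sample}_allele_counts_hetSNPs.bed` file SAVANA writes is tab-separated with columns chrom, start, end, ref, alt, the A/C/G/T/N base counts, the reference and alternate allele fractions (AF_0, AF_1), the phased genotype, and the phaseset ID. An illustrative row, with made-up counts:)

```
chr2	61791980	61791980	G	A	0	0	14	11	0	0.56	0.44	1|0	58907085
```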
diff --git a/environment.yml b/environment.yml
index 608ce23..c9bc6fb 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,9 +4,8 @@
channels:
  - conda-forge
  - bioconda
-  - defaults
dependencies:
-  - python=3.9.6
+  - python=3.9
  - pybedtools=0.9.0
  - pysam=0.21.0
  - cyvcf2=0.30.16
diff --git a/savana/allele_counter.py b/savana/allele_counter.py
index 3027aef..0252b8c 100644
--- a/savana/allele_counter.py
+++ b/savana/allele_counter.py
@@ -8,21 +8,21 @@ from multiprocessing import Pool
from multiprocessing import cpu_count

-import argparse
-import timeit
import cyvcf2
import pysam
import pybedtools
-import math
import os
+import glob
+import math
+

-def extract_hets(phased_vcf):
+def extract_hets(phased_vcf, window):
	'''
-	Extracts heterozygous SNPs from phased.vcf (e.g. from matched normal bam).
+	Extracts heterozygous SNPs from phased.vcf (i.e. from the matched normal bam).
	'''
-	print(f"	Extracting phased heterozygous SNPs from {phased_vcf} ...")
+	print(f"	... Extracting heterozygous SNPs from {phased_vcf} ... ")
	vcf_reader = cyvcf2.VCF(phased_vcf)
-	hets = []
+	hets_dict = {}
	# iterate through variants
	for variant in vcf_reader:
		# check if variant is a SNP
@@ -31,163 +31,237 @@
			for s_idx, gt in enumerate(variant.genotypes):
				# only get heterozygous snps
				if gt[0] != gt[1]:
-					# print(s_idx, gt)
-					# sample = vcf_reader.samples[s_idx]
					ps = int(variant.format('PS')[s_idx]) if 'PS' in variant.FORMAT else None
					gt_str = f"{gt[0]}|{gt[1]}" if gt[2] == 1 else f"{gt[0]}/{gt[1]}"
-					# dp = int(variant.format('DP')[s_idx]) if 'DP' in variant.FORMAT else None
-					# ad = int(variant.format('AD')[s_idx]) if 'AD' in variant.FORMAT else None
-					var_out = [variant.CHROM,str(variant.POS),str(variant.POS),variant.REF,variant.ALT[0],gt_str,str(ps)]
-					hets.append(var_out)
-	return hets
-
-def chunkify_bed(bed_file, chunk_size):
-	'''
-	Divides bed file into chunks based on threads available for multiprocessing.
-	'''
-	chunks = []
-	current_chunk = []
-	for i, feature in enumerate(bed_file):
-		current_chunk.append(feature)
-		if (i + 1) % chunk_size == 0:
-			chunks.append(pybedtools.BedTool(current_chunk))
-			current_chunk = []
-	if current_chunk:
-		chunks.append(pybedtools.BedTool(current_chunk))
-	return chunks
+					CHROM=variant.CHROM
+					key=f"{str(variant.POS)}_{str(variant.REF)}"
+					var_out = [variant.ALT[0],gt_str,str(ps)]
+					# nested dictionary - by chromosome and by position (in window-sized increments)
+					pos_cat = math.floor(int(variant.POS)/window) * window
+					if (CHROM not in hets_dict):
+						hets_dict[CHROM] = {}
+						if (pos_cat not in hets_dict[CHROM]):
+							hets_dict[CHROM][pos_cat] = {}
+							hets_dict[CHROM][pos_cat][key] = var_out
+					elif (pos_cat not in hets_dict[CHROM]):
+						hets_dict[CHROM][pos_cat] = {}
+						hets_dict[CHROM][pos_cat][key] = var_out
+					else:
+						hets_dict[CHROM][pos_cat][key] = var_out
+	print(f" ... Heterozygous SNPs from {phased_vcf} extracted. 
Extracting allele counts for heterozygous SNPs ...") + return hets_dict -def allele_count(curr_chunk, sites, bam, allele_mapq, allele_min_reads): +def process_allele_counts(allele_counts_path, hets_dict, window): ''' - Count alleles across heterozygous SNP positions from phased normal bam + Extracts and process allele counts for heterozygous SNPs dictionary. ''' - print(f" Read counting {curr_chunk} ...") - # open tumour bam - with pysam.AlignmentFile(bam, "rb") as bamfile: - list_out = [] - for site in sites: - # print(site) - dict_out = {} - # if len(site[0])==len(chr_now): - # chr = site[0];#[3]; - # else: - # chr = site[0]#[3:5]; - # if chr != chr_now: - # continue - # chr2 61791980 61791980 G A 1|0 58907085 - chrom = site[0] - start = int(site[1])-1 - end = int(site[2]) # start and end cannot be the same position or otherwise the fetching does not work - ref = site[3] - alt = site[4] - GT = site[5] - PS = site[6] + ac_bed = pybedtools.BedTool(allele_counts_path) + bed_out = [] + for site in ac_bed: + chrom,pos,ref,dp= site[0], site[2], site[3], site[9] + bases = { 'A': site[4], 'C': site[5], 'G': site[6], 'T': site[7], 'N': site[8]} + key=f"{pos}_{ref}" + pos_cat = math.floor(int(pos)/window) * window # same key as above + # try: + # hets_dict[chrom][pos_cat][key] + # pass + # except: + # continue + if chrom not in hets_dict.keys(): + continue + if pos_cat not in hets_dict[chrom].keys(): + continue + if key not in hets_dict[chrom][pos_cat].keys(): + continue + key_out=hets_dict[chrom][pos_cat][key] + alt,gt,ps=key_out[0],key_out[1],key_out[2] + # estimate AFs + AF_0 = float(bases[ref]) / float(dp) + AF_1 = float(bases[alt]) / float(dp) + # define output list + out = [chrom, pos, pos, ref, alt, bases['A'],bases['C'],bases['G'],bases['T'],bases['N'], str(AF_0), str(AF_1), gt, ps] + bed_out.append(out) + return bed_out - reads = [ - read for read in bamfile.fetch( - chrom, - start, - end, - multiple_iterators=True - ) - ] - # filter reads - reads = [ - read for read in reads if - read.mapping_quality >= allele_mapq and not read.is_secondary and not read.is_supplementary - ] - # perform base counting - try: - if len(reads) >= allele_min_reads: - key_now = f"{chr}_{start}_{end}_{ref}_{alt}" - # key_now = "{}\t{}\t{}\t{}\t{}".format(chr, start, end, ref ,alt) - for read in reads: - read_sequence = read.seq +# the following five functions were adapted and taken from Fran Muyas +def collect_result(result): + if (result[1] != ''): + VARIANTS = result[1] + OUT = result[0] + out = open(OUT,'w') + out.write(VARIANTS) + out.close() - aligned_pos = read.get_reference_positions( - full_length=True - ) - try: - idx = aligned_pos.index(start) - #DP+=1 - snp_read = read_sequence[idx] - if key_now in dict_out: #.has_key(key_now): - dict_out[key_now][snp_read] = dict_out[key_now][snp_read] + 1 - else: - dict_out[key_now]={"A":0, "C":0, "G":0, "T":0, "N":0} - dict_out[key_now][snp_read] = dict_out[key_now][snp_read] + 1 - except: - continue - DP = dict_out[key_now]["A"] + dict_out[key_now]["C"] + dict_out[key_now]["G"] + dict_out[key_now]["T"] - if GT == "0|1": - AF_0 = dict_out[key_now][ref] / DP - AF_1 = dict_out[key_now][alt] / DP - if GT == "1|0": - AF_0 = dict_out[key_now][alt] / DP - AF_1 = dict_out[key_now][ref] / DP - #print("{} {}".format(AF_0, AF_1)) - dict_out[key_now]["AF_0"] = AF_0 - dict_out[key_now]["AF_1"] = AF_1 +def concatenate_sort_temp_files_and_write(out_file, tmp_dir): + # Get file paths of temp files + print(" ... Concatenating temp files into interim allele count results ... 
") + all_files = glob.glob(tmp_dir + '/*.hetsnps_allelecounts.temp') + # Load temp files + if (len(all_files) > 0): + # Organise files in dictionaries of chromosomes and start positions + Dictionary_of_files = {} + for filename in all_files: + basename = os.path.basename(filename) + basename = basename.split(".") + # positions + coordinates = basename[-3] + CHROM, START, END = coordinates.split("__") + START = int(START) + if (CHROM not in Dictionary_of_files): + Dictionary_of_files[CHROM] = {} + Dictionary_of_files[CHROM][START] = filename + else: + Dictionary_of_files[CHROM][START] = filename + ## Write to final output file + out = open(out_file,'w') + # Header = ['#CHROM', 'Start', 'End','REF', 'A', 'C', 'G', 'T', 'N', 'DP'] + # Header = '\t'.join(Header) + '\n' + # out.write(Header) + # Move through filenames by sorted coordinates + for chrom in sorted(Dictionary_of_files.keys()): + for start in sorted(Dictionary_of_files[chrom].keys()): + filename = Dictionary_of_files[chrom][start] + with open(filename, 'r') as f: + out.write(f.read()) + out.write('\n') + # Remove temp file + os.remove(filename) + out.close() + else: + # If temp files not found, print message + print ('No temporary files found') - out = [chrom, str(start), str(end), ref, alt, str(dict_out[key_now]["A"]), str(dict_out[key_now]["C"]), str(dict_out[key_now]["G"]), str(dict_out[key_now]["T"]), str(dict_out[key_now]["N"]) , str(dict_out[key_now]["AF_0"]), str(dict_out[key_now]["AF_1"]), GT, str(PS)] +def MakeWindows(CONTIG, FASTA, window): + inFasta = pysam.FastaFile(FASTA) + # Generate bed file based on all coordenates from reference fasta file + # only allow canonical contigs -- might update to parameters later... + contigs = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y'] + ref_contigs = inFasta.references + if 'chr1' in ref_contigs: + contigs = [f'chr{x}' for x in contigs] + if (CONTIG != 'all'): + CONTIG_Names = [contigs[(int(x)-1)] for x in CONTIG] + else: + CONTIG_Names = contigs + # print(CONTIG_Names) + LIST = [ (x, 1, inFasta.get_reference_length(x)) for x in CONTIG_Names] + # print(LIST) + a = pybedtools.BedTool(LIST) + # final bed to be used for allele counting with windows + final_bed = a.window_maker(a,w=window) + inFasta.close() + return(final_bed) - list_out.append(out) - except: - continue - return(list_out) +def BaseCount(LIST): + Bases=['A','C','G','T','N'] + # Dictinary with base counts + NUCLEOTIDES = { 'A': 0, 'C': 0, 'G': 0, 'T': 0, 'N': 0} + # Update dictionary counts + for x in LIST: + if x.upper() in Bases: + NUCLEOTIDES[x.upper()] += 1 + return (NUCLEOTIDES) +def run_interval(interval, BAM, FASTA, MQ, MIN_COV, tmp_dir): + # Coordinates to analyse + CHROM = interval[0] + START = int(interval[1]) + END = int(interval[2]) + # Get pileup read counts from coordinates + bam = pysam.AlignmentFile(BAM) + i = bam.pileup(CHROM, START, END, min_mapping_quality = MQ, ignore_overlaps = False) + # Load reference file. 
Mandatory to be done inside function to avoid overlap problems during multiprocessing + inFasta = pysam.FastaFile(FASTA) + # Run it for each position in pileup + POSITIONS = [] + for p in i: + POS=p.pos + if POS >= START and POS < END: + # Get reference base from fasta file + ref_base = inFasta.fetch(CHROM, POS, POS+1).upper() + # Get coverage + DP = p.get_num_aligned() + # Run only if coverage is more than minimum (arg parameter) + if (DP >= MIN_COV and ref_base != 'N'): + # Get pileup list + PILEUP_LIST = p.get_query_sequences(mark_matches=True, add_indels=True) + BASE_COUNTS = BaseCount(PILEUP_LIST) + # only include output if ALT != REF + total_count = sum(BASE_COUNTS.values()) + if BASE_COUNTS[ref_base] + BASE_COUNTS['N'] < total_count: + # Pysam provides a 0-based coordinate. We must sum 1 to this value + POS_print = POS + 1 + # Lines to print + LINE_0 = '\t'.join([str(BASE_COUNTS['A']), str(BASE_COUNTS['C']), str(BASE_COUNTS['G']), str(BASE_COUNTS['T']), str(BASE_COUNTS['N'])]) + LINE_1 = '\t'.join([str(CHROM), str(POS_print), str(POS_print), str(ref_base), LINE_0 , str(total_count)]) + # Save results in list of positions + POSITIONS.append(LINE_1) + inFasta.close() + bam.close() + # Return list of positions + ID = '__'.join([str(CHROM), str(START), str(END)]) + out_temp = tmp_dir + '/' + ID + '.hetsnps_allelecounts.temp' + return([out_temp,'\n'.join(POSITIONS)]) #---- -# 3. Prepare input and run allele counter across heterozygous SNPs +# Prepare input and run allele counter across heterozygous SNPs #---- -def perform_allele_counting(outdir, sample, phased_vcf, tumour, allele_mapq, allele_min_reads, threads): - """ extract heterozygous SNPs """ +def perform_allele_counting(outdir, sample, contigs, fasta_file_path, phased_vcf, tumour, ac_window, allele_mapq, allele_min_reads, tmp_dir, threads): + """ extract allele counts for heterozygous SNPs """ # check and define threads new_threads = min(threads, cpu_count()) print(f"... Allele counter will use threads = {new_threads}. (threads = {threads} defined; threads = {cpu_count()} available) ...") threads = new_threads - hets_bed_path = f"{outdir}/{sample}_phased_het_snps.bed" - outfile = open(hets_bed_path, "w") - het_snps = extract_hets(phased_vcf) - for r in het_snps: - Line = '\t'.join(r) + '\n' - outfile.write(Line) - outfile.close() - - ## RUN IN CHUNKS TO SPEED UP - # Define bed_file chunks - bed_file = pybedtools.BedTool(hets_bed_path) - bed_length = sum(1 for _ in bed_file) - chunk_size = bed_length // threads + (bed_length % threads > 0) - print(f" splitting bed file into n = {math.ceil(bed_length / chunk_size)} chunks ...") - # Split the BED file into chunks - chunks = chunkify_bed(bed_file, chunk_size) - - # only use multiprocessing if more than 1 thread available/being used. - if threads == 1: - # loop through chromosomes - print("multithreading skipped.") - allele_counts = [] - for idx,bed_chunk in enumerate(chunks): - curr_chunk = f"chunk {idx+1}" - allele_counts_chunk = allele_count(curr_chunk, bed_chunk, tumour, allele_mapq, allele_min_reads) - allele_counts.append(allele_counts_chunk) - allele_counts = [x for xs in allele_counts for x in xs] - + #---- + # 1. Create bed file and windows for allele counting + #---- + print(f" ... splitting helper bed into windows for allele counting ...") + bed = MakeWindows(contigs, fasta_file_path, ac_window) + #---- + # 2. Multiprocess and run bed bins in parallel + #---- + print(f" ... 
Allele counting ...")
+	if (threads > 1):
+		pool = Pool(threads)
+		# Use loop to parallelize
+		for row in bed:
+			# This function writes out temp files
+			pool.apply_async(run_interval, args=(row, tumour, fasta_file_path, allele_mapq, allele_min_reads, tmp_dir), callback=collect_result)
+		# Close Pool and let all the processes complete
+		pool.close()
+		pool.join()
+		print(" ... interim finished...")
	else:
-		print(f"multithreading using {threads} threads.")
-		args_in = [[str(f"chunk {idx+1}"),bed_chunk,tumour,allele_mapq,allele_min_reads] for idx,bed_chunk in enumerate(chunks)]
-		with Pool(processes=threads) as pool:
-			allele_counts = [x for xs in list(pool.starmap(allele_count, args_in)) for x in xs]
-
+		for row in bed:
+			# This function writes the results to temp files
+			collect_result(run_interval(row, tumour, fasta_file_path, allele_mapq, allele_min_reads, tmp_dir))
+	#----
+	# 3. Get interim results and write out
+	#----
+	# print(f" ... Concatenating temp files into interim allele count results ... ")
+	out_path_interim = f"{outdir}/{sample}_allele_counts_hetSNPs_INTERIM.bed"
+	concatenate_sort_temp_files_and_write(out_path_interim, tmp_dir)
+	#----
+	# 4. Generate dictionary of heterozygous SNPs from normal VCF
+	#----
+	# print(f" ... Extracting heterozygous SNPs from {phased_vcf} ... ")
+	het_snps = extract_hets(phased_vcf, ac_window)
+	# print(f" ... Heterozygous SNPs from {phased_vcf} extracted. Extracting allele counts for heterozygous SNPs ...")
+	#----
+	# 5. Extract allele counts for hetSNPs
+	#----
+	allele_counts_final = process_allele_counts(out_path_interim, het_snps, ac_window)
	#----
-	# 4. Get results and write out
+	# 6. Get results and write out
	#----
	out_path = f"{outdir}/{sample}_allele_counts_hetSNPs.bed"
	outfile = open(out_path, "w")
-	for r in allele_counts:
-		Line = '\t'.join(r) + '\n'
+	for row in allele_counts_final:
+		Line = '\t'.join(row) + '\n'
		outfile.write(Line)
	outfile.close()
+	# remove interim result bed file
+	os.remove(out_path_interim)
	return out_path
diff --git a/savana/breakpoints.py b/savana/breakpoints.py
index fadb92c..6775f4b 100755
--- a/savana/breakpoints.py
+++ b/savana/breakpoints.py
@@ -472,16 +472,16 @@ def get_potential_breakpoints(aln_filename, is_cram, ref, length, mapq, label, c
		try:
			# record start/end in read incrementer
			start_bin = floor((read.reference_start)/coverage_binsize)
-			end_bin = floor((read.reference_end+1)/coverage_binsize)
+			end_bin = floor((read.reference_end)/coverage_binsize)
			# swap if we need to
			start_bin, end_bin = (end_bin, start_bin) if start_bin > end_bin else (start_bin, end_bin)
-			for i in range(start_bin, end_bin+1):
+			for i in range(start_bin, end_bin):
				contig_coverage_array[haplotype][i] += 1
		except IndexError as _:
			print(f'Unable to update coverage for contig {contig}')
-			print(f'Attempting to update bins {floor((read.reference_start-1)/coverage_binsize)} and {floor((read.reference_end-1)/coverage_binsize)}')
-			print(f'Length of array {len(contig_coverage_array)}, Read {read.reference_start} to {read.reference_end}')
+			print(f'Attempting to update bins {start_bin} and {end_bin}')
+			print(f'Length of array {len(contig_coverage_array[haplotype])}, Read {read.reference_start} to {read.reference_end}')
		if read.mapping_quality < mapq:
			continue # discard if mapping quality lower than threshold
diff --git a/savana/cn_functions.py b/savana/cn_functions.py
index 4e652ce..8da4845 100644
--- a/savana/cn_functions.py
+++ b/savana/cn_functions.py
@@ -258,7 +258,7 @@ def reduce_grid(fits,distance_filter_scale_factor = 
1.25): return reduced_grid -def is_acceptable_fit(purity, ploidy, relative_CN, weights, max_proportion_zero = 0.1, min_proportion_close_to_whole_number = 0.5, max_distance_from_whole_number = 0.25): +def is_acceptable_fit(purity, ploidy, relative_CN, weights, max_proportion_zero = 0.1, min_proportion_close_to_whole_number = 0.5, max_distance_from_whole_number = 0.25, main_cn_step_change = 1): ''' Test if given fit is acceptable using parameters looking at the proportion of zero or negative segments, proportion of genome fitted close to an integer, and distance between most common CN state peaks. ''' @@ -282,14 +282,14 @@ def is_acceptable_fit(purity, ploidy, relative_CN, weights, max_proportion_zero # Remove fits which result in overstretchin/overfitting of copy number states (i.e. copy number state skipping) normally caused by oversegmentation most_common = statistics.mode(acn_int) second_most_common = statistics.mode([x for x in acn_int if x != most_common]) - if abs(most_common - second_most_common) >= 2: + if abs(most_common - second_most_common) > main_cn_step_change: print(f"Fit of purity={purity} and ploidy={ploidy} is NOT an acceptable solution, as main CN change step = {abs(most_common - second_most_common)}." ) return False else: return True -def viable_solutions(fits_r, relative_CN, weights, max_proportion_zero = 0.1, min_proportion_close_to_whole_number = 0.5, max_distance_from_whole_number = 0.25): +def viable_solutions(fits_r, relative_CN, weights, max_proportion_zero = 0.1, min_proportion_close_to_whole_number = 0.5, max_distance_from_whole_number = 0.25, main_cn_step_change = 1): ''' Return acceptable solution candidates. ''' @@ -299,7 +299,7 @@ def viable_solutions(fits_r, relative_CN, weights, max_proportion_zero = 0.1, mi if is_acceptable_fit(purity, ploidy, relative_CN, weights, max_proportion_zero = max_proportion_zero, min_proportion_close_to_whole_number = min_proportion_close_to_whole_number, - max_distance_from_whole_number = max_distance_from_whole_number) == True: + max_distance_from_whole_number = max_distance_from_whole_number, main_cn_step_change = main_cn_step_change) == True: solutions.append([sol[0],sol[1],sol[-1]]) # sort solutions by distance function return solutions @@ -329,9 +329,10 @@ def relative_to_absolute_minor_total_CN(chrom, rel_copy_number_segments, allele_ if len(afs) < 1: print(f' BAF and minor allele copy number cannot be estimated for segment {sid}... 
Total absolute copy number estimated only.')
			minorCN = ''
+			baf_mean = ''
			totalCN = round(relative_to_absolute_CN(rcn, fitted_purity, fitted_ploidy),4)
			x[-1] = totalCN if totalCN > 0 else 0
-			x.append(minorCN)
+			# x.append(minorCN)
		elif len(afs) >= 1:
			baf_mean = 0.5 + statistics.mean(afs)
			CN_a = (fitted_purity - 1 + (rcn*(1-baf_mean)*(2*(1-fitted_purity)+fitted_purity*fitted_ploidy))) / fitted_purity
@@ -339,7 +340,7 @@
			minorCN = round(min(CN_a,CN_b),4) if round(min(CN_a,CN_b),4) > 0 else 0
			totalCN = round((CN_a + CN_b),4) if round((CN_a + CN_b),4) > 0 else 0
			x[-1] = totalCN
-			x.append(minorCN)
+			x.extend((minorCN,baf_mean,len(afs)))
		acn_minor_major.append(x)
	return acn_minor_major
diff --git a/savana/fit_absolute.py b/savana/fit_absolute.py
index 8aeac4a..d1ff45c 100644
--- a/savana/fit_absolute.py
+++ b/savana/fit_absolute.py
@@ -10,11 +10,7 @@ import numpy as np
from scipy.stats import gaussian_kde
import statistics
-import math
import copy
-import argparse
-import timeit
-import os
import sys
@@ -91,6 +87,7 @@ def estimate_cellularity(phasesets_dict, ps_summary):
def process_log2r_input(log2r_cn_path):
	rel_copy_number = []
	with open(log2r_cn_path, "r") as file:
+		next(file)
		for line in file:
			fields = line.strip().split("\t")
			chrom,start,end,seg_id = fields[1],int(fields[2]),int(fields[3]),fields[-2]
@@ -118,9 +115,9 @@
	return rel_copy_number_segments

def fit_absolute_cn(outdir, log2r_cn_path, allele_counts_bed_path, sample,
-	min_ploidy, max_ploidy, ploidy_step, min_cellularity, max_cellularity, cellularity_step, cellularity_buffer,
+	min_ploidy, max_ploidy, ploidy_step, min_cellularity, max_cellularity, cellularity_step, cellularity_buffer, overrule_cellularity,
	distance_function, distance_filter_scale_factor, distance_precision,
-	max_proportion_zero, min_proportion_close_to_whole_number, max_distance_from_whole_number,
+	max_proportion_zero, min_proportion_close_to_whole_number, max_distance_from_whole_number, main_cn_step_change,
	min_ps_size, min_ps_length, threads):
	'''
	# 3. Estimate purity using phased heterozygous SNPs
@@ -148,6 +145,9 @@
		cellularity = estimate_cellularity(phasesets_dict, ps_summary)
		digs = len(str(cellularity_step))-2 if isinstance(cellularity_step,int) != True else 1
		print(f"	estimated cellularity using hetSNPs = {round(cellularity,digs)}.")
+		if overrule_cellularity != None:
+			cellularity = float(overrule_cellularity)
+			print(f"	cellularity overruled by user with cellularity = {cellularity}.")
		min_cellularity = round(max(0,cellularity - cellularity_buffer),digs)
		max_cellularity = round(min(1,cellularity + cellularity_buffer),digs)
@@ -168,7 +168,7 @@
	solutions = cnfitter.viable_solutions(fits_r, relative_CN, weights,
						max_proportion_zero = max_proportion_zero,
						min_proportion_close_to_whole_number = min_proportion_close_to_whole_number,
-						max_distance_from_whole_number = max_distance_from_whole_number)
+						max_distance_from_whole_number = max_distance_from_whole_number, main_cn_step_change = main_cn_step_change)
	# check if viable solutions were found. If not, terminate script and write out error message and arguments to file for inspection and adjustment
	if len(solutions) == 0:
@@ -252,7 +252,7 @@
		if allele_counts_bed_path == None:
			header=['chromosome','start','end','segment_id', 'bin_count', 'sum_of_bin_lengths', 'weight', 'copyNumber']
		elif allele_counts_bed_path != None:
-			header=['chromosome','start','end','segment_id', 'bin_count', 'sum_of_bin_lengths', 'weight', 'copyNumber', 'minorAlleleCopyNumber']
+			header=['chromosome','start','end','segment_id', 'bin_count', 'sum_of_bin_lengths', 'weight', 'copyNumber', 'minorAlleleCopyNumber', 'meanBAF', 'no_hetSNPs']
		outfile3.write('\t'.join(header)+'\n')
		for r in abs_copy_number_segments:
			Line = '\t'.join(str(e) for e in r) + '\n'
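(A quick note on the overrule block in `fit_absolute_cn` above: `--overrule_cellularity` is parsed with `type=float` and purity is a fraction in [0, 1], so the override must be kept as a float; coercing it with `int()` would truncate, e.g., 0.65 to 0. A minimal sketch of the intended behaviour, with an illustrative function name:)

```python
def apply_cellularity_overrule(estimated_cellularity, overrule_cellularity=None):
    # Keep the hetSNP-based estimate unless the user supplied a known purity.
    if overrule_cellularity is not None:
        return float(overrule_cellularity)  # int() would truncate 0.65 to 0
    return estimated_cellularity

assert apply_cellularity_overrule(0.42) == 0.42
assert apply_cellularity_overrule(0.42, 0.65) == 0.65
```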
diff --git a/savana/helper.py b/savana/helper.py
index 940edd2..688c082 100755
--- a/savana/helper.py
+++ b/savana/helper.py
@@ -15,7 +15,7 @@ from datetime import datetime
import argparse

-__version__ = "1.2.2"
+__version__ = "1.2.3"

samflag_desc_to_number = {
	"BAM_CMATCH": 0, # M
@@ -336,12 +336,15 @@ def time_function(desc, checkpoints, time_str, final=False):
	print(formatted_time)
	return

-def check_outdir(args_outdir, illegal=None):
+def check_outdir(args_outdir, args_overwrite, illegal=None):
	# create output dir if it doesn't exist
	outdir = os.path.join(os.getcwd(), args_outdir)
	if not os.path.exists(outdir):
		print(f'Creating directory {outdir} to store results')
		os.mkdir(outdir)
+	if args_overwrite:
+		# don't check for files, overwrite them
+		return outdir
	if not illegal:
		# throw error if ANY files present
		if os.listdir(outdir):
@@ -354,6 +357,33 @@
	return outdir

+def check_tmpdir(args_tmpdir, outdir, args_overwrite, illegal=None):
+	# create temp dir if it doesn't exist
+	tmpdir = os.path.join(outdir, args_tmpdir)
+	if not os.path.exists(tmpdir):
+		print(f'Creating directory {tmpdir} to store temp files during hetSNP allele counting')
+		os.mkdir(tmpdir)
+	if args_overwrite:
+		# don't check for files, overwrite them
+		return tmpdir
+	if not illegal:
+		# throw error if ANY files present
+		if os.listdir(tmpdir):
+			sys.exit(f'Temp directory "{tmpdir}" already exists and contains files. Please remove the files or supply a different directory name.')
+	else:
+		# check if illegal files exist in tmpdir
+		for f in os.listdir(tmpdir):
+			if f.endswith(illegal):
+				sys.exit(f'Temp directory "{tmpdir}" already exists and contains {illegal} files which may be overwritten. 
Please remove the files or supply a different directory name.') + + return tmpdir + +def clean_tmpdir(args_tmpdir, outdir): + tmpdir = os.path.join(outdir, args_tmpdir) + # remove tmpdir if empty + if not os.listdir(tmpdir): + os.rmdir(tmpdir) + # credit to stackoverflow: https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse def float_range(mini,maxi): """Return function handle of an argument type function for ArgumentParser checking a float range: mini <= arg <= maxi diff --git a/savana/read_counter.py b/savana/read_counter.py index 35cea14..2e31bcd 100644 --- a/savana/read_counter.py +++ b/savana/read_counter.py @@ -31,9 +31,6 @@ def count_reads_in_curr_bin(bam, chrom, start, end, readcount_mapq): def binned_read_counting(curr_chunk, bed_chunk, alns, nmode, blacklisting, bl_threshold, bases_filter, bases_threshold, readcount_mapq): print(f" Read counting {curr_chunk} ...") - # bam_T = alns['tumour'] - # bam_N = alns['normal'] if nmode == "mnorm" and len(alns) == 2 else None - # Open the BAM file using pysam bam_T = pysam.AlignmentFile(alns['tumour'], "rb") bam_N = pysam.AlignmentFile(alns['normal'], "rb") if nmode == "mnorm" and len(alns) == 2 else None @@ -75,7 +72,10 @@ def binned_read_counting(curr_chunk, bed_chunk, alns, nmode, blacklisting, bl_th # chr_read_counts.append([bin_name, chrom, str(start), str(end), str(bases), str(use), str(chunk_read_count_T), str(chunk_read_count_N)]) # elif nmode != "mnorm" and len(aln_files) == 1: # chr_read_counts.append([bin_name, chrom, str(start), str(end), str(bases), str(use), str(chunk_read_count_T)]) - chr_read_counts.append([bin_name, chrom, str(start), str(end), str(bases), str(use), str(chunk_read_count_T), str(chunk_read_count_N)]) + if blacklisting == True: + chr_read_counts.append([bin_name, chrom, str(start), str(end), str(bases), str(blacklist), str(use), str(chunk_read_count_T), str(chunk_read_count_N)]) + else: + chr_read_counts.append([bin_name, chrom, str(start), str(end), str(bases), str(use), str(chunk_read_count_T), str(chunk_read_count_N)]) # Close BAM file and return out bam_T.close() @@ -94,8 +94,8 @@ def filter_and_normalise(nmode, countData): for r in normalised_counts: r[-2] = str(math.log2(int(r[-2])/med_self)) # median normalise readcounts del r[-1] - elif nmode == "pon": - print("This function is yet to be implemented...") + # elif nmode == "pon": + # print("This function is yet to be implemented...") elif nmode == "mnorm": filtered_counts = [x for x in countData if x[-3] == 'True' and int(x[-2]) != 0 and int(x[-1]) != 0] cov_scaler = statistics.median([math.log2(int(x[-2])/int(x[-1])) for x in filtered_counts]) @@ -122,7 +122,7 @@ def chunkify_bed(bed_file, chunk_size): chunks.append(pybedtools.BedTool(current_chunk)) return chunks -def count_reads(outdir, tumour, normal, panel_of_normals, sample, bin_annotations_path, readcount_mapq, blacklisting, bl_threshold, bases_filter, bases_threshold, threads): +def count_reads(outdir, tumour, normal, sample, bin_annotations_path, readcount_mapq, blacklisting, bl_threshold, bases_filter, bases_threshold, threads): """ Perform binned read counting on bam file/files per chromosome """ # check and define threads @@ -136,12 +136,12 @@ def count_reads(outdir, tumour, normal, panel_of_normals, sample, bin_annotation 'tumour': tumour, 'normal': normal } - elif panel_of_normals is not None: - nmode = "pon" #panel of normals will be used for logR normalisation - aln_files = { - 'tumour': tumour - } - elif normal is None and 
panel_of_normals is None: + # elif panel_of_normals is not None: + # nmode = "pon" #panel of normals will be used for logR normalisation + # aln_files = { + # 'tumour': tumour + # } + elif normal is None: #and panel_of_normals is None: nmode = "self" aln_files = { 'tumour': tumour @@ -180,20 +180,32 @@ def count_reads(outdir, tumour, normal, panel_of_normals, sample, bin_annotation #---- # 4. Get results and write out #---- - outfile = open(f"{outdir}/{sample}_read_counts.tsv", "w") + outfile = open(f"{outdir}/{sample}_raw_read_counts.tsv", "w") + if blacklisting == True: + header=['bin', 'chromosome','start','end','known_bases', 'overlap_blacklist', 'use_bin', 'tumour_read_count', 'normal_read_count'] + else: + header=['bin', 'chromosome','start','end','perc_known_bases', 'use_bin', 'tumour_read_count', 'normal_read_count'] + outfile.write('\t'.join(header)+'\n') for r in countData: Line = '\t'.join(r) + '\n' outfile.write(Line) outfile.close() - outfile2 = open(f"{outdir}/{sample}_read_counts_filtered.tsv", "w") - for r in filtered_counts: - Line = '\t'.join(r) + '\n' - outfile2.write(Line) - outfile2.close() + + + # outfile2 = open(f"{outdir}/{sample}_read_counts_filtered.tsv", "w") + # for r in filtered_counts: + # Line = '\t'.join(r) + '\n' + # outfile2.write(Line) + # outfile2.close() log2_ratio_readcounts_path = f"{outdir}/{sample}_read_counts_{nmode}_log2r.tsv" outfile3 = open(log2_ratio_readcounts_path, "w") + if blacklisting == True: + header=['bin', 'chromosome','start','end','known_bases', 'overlap_blacklist', 'use_bin', 'log2r_copynumber'] + else: + header=['bin', 'chromosome','start','end','perc_known_bases', 'use_bin', 'log2r_copynumber'] + outfile3.write('\t'.join(header)+'\n') for r in normalised_counts: Line = '\t'.join(r) + '\n' outfile3.write(Line) diff --git a/savana/savana.py b/savana/savana.py index 851008c..bac0142 100755 --- a/savana/savana.py +++ b/savana/savana.py @@ -41,10 +41,12 @@ def savana_run(args): # set sample name to default if req. 
args.sample = os.path.splitext(os.path.basename(args.tumour))[0] print(f'Running as sample {args.sample}') - outdir = helper.check_outdir(args.outdir, illegal='.vcf') + outdir = helper.check_outdir(args.outdir, args.overwrite, illegal='.vcf') # set number of threads to cpu count if none set if not args.threads: args.threads = cpu_count() + if not args.cna_threads: + args.cna_threads = cpu_count() # check if files are bam or cram (must have indices) if args.tumour.endswith('bam') and args.normal.endswith('bam'): args.is_cram = False @@ -162,7 +164,7 @@ def savana_evaluate(args): def savana_train(args): """ main function for savana train """ - outdir = helper.check_outdir(args.outdir, illegal='.pkl') + outdir = helper.check_outdir(args.outdir, args.overwrite, illegal='.pkl') data_matrix = None if args.vcfs: # read in and create matrix from VCF files @@ -182,41 +184,46 @@ def savana_cna(args, as_workflow=False): if not args.sample: # set sample name to default args.sample = os.path.splitext(os.path.basename(args.tumour))[0] - outdir = helper.check_outdir(args.outdir) + outdir = helper.check_outdir(args.outdir, args.overwrite) else: outdir = args.outdir + # define tmpdir + tmpdir = helper.check_tmpdir(args.tmpdir, outdir, args.overwrite) # initialize timing checkpoints = [time()] time_str = [] # cna threads - if args.cna_threads: - args.threads = args.cna_threads + if not args.cna_threads: + args.cna_threads = cpu_count() + # if args.cna_threads: + # args.threads = args.cna_threads # first do allele counting if not args.allele_counts_het_snps: - allele_counts_bed_path = allele_counter.perform_allele_counting(outdir, args.sample, args.phased_vcf, args.tumour, args.allele_mapq, args.allele_min_reads, args.threads) + allele_counts_bed_path = allele_counter.perform_allele_counting(outdir, args.sample, args.chromosomes, args.ref, args.phased_vcf, args.tumour, args.ac_window, args.allele_mapq, args.allele_min_reads, tmpdir, args.cna_threads) helper.time_function("Counted alleles of phased SNPs", checkpoints, time_str) else: allele_counts_bed_path = args.allele_counts_het_snps # now generate bins - bin_annotations_path = bin_generator.generate_bins(outdir, args.sample, args.ref, args.chromosomes, args.cn_binsize, args.blacklist, args.breakpoints, args.threads) + bin_annotations_path = bin_generator.generate_bins(outdir, args.sample, args.ref, args.chromosomes, args.cn_binsize, args.blacklist, args.breakpoints, args.cna_threads) helper.time_function("Binned reference genome", checkpoints, time_str) # perform the read counting - read_counts_path = read_counter.count_reads(outdir, args.tumour, args.normal, args.panel_of_normals, args.sample, bin_annotations_path, args.readcount_mapq, args.blacklisting, args.bl_threshold, args.bases_filter, args.bases_threshold, args.threads) + read_counts_path = read_counter.count_reads(outdir, args.tumour, args.normal, args.sample, bin_annotations_path, args.readcount_mapq, args.blacklisting, args.bl_threshold, args.bases_filter, args.bases_threshold, args.cna_threads) helper.time_function("Performed read counting", checkpoints, time_str) # smooth the copy number data smoothened_cn_path = smooth.smooth_copy_number(outdir, read_counts_path, args.smoothing_level, args.trim) helper.time_function("Performed smoothing", checkpoints, time_str) # segment the copy number data - log2r_cn_path = segment.segment_copy_number(outdir, smoothened_cn_path, args.min_segment_size, args.shuffles, args.p_seg, args.p_val, args.quantile, args.threads) + log2r_cn_path = 
segment.segment_copy_number(outdir, smoothened_cn_path, args.min_segment_size, args.shuffles, args.p_seg, args.p_val, args.quantile, args.cna_threads) helper.time_function("Performed CBS", checkpoints, time_str) # fit absolute copy number fit_absolute.fit_absolute_cn(outdir, log2r_cn_path, allele_counts_bed_path, args.sample, - args.min_ploidy, args.max_ploidy, args.ploidy_step, args.min_cellularity, args.max_cellularity, args.cellularity_step, args.cellularity_buffer, + args.min_ploidy, args.max_ploidy, args.ploidy_step, args.min_cellularity, args.max_cellularity, args.cellularity_step, args.cellularity_buffer, args.overrule_cellularity, args.distance_function, args.distance_filter_scale_factor, args.distance_precision, - args.max_proportion_zero, args.min_proportion_close_to_whole_number, args.max_distance_from_whole_number, - args.min_ps_size, args.min_ps_length, args.threads) + args.max_proportion_zero, args.min_proportion_close_to_whole_number, args.max_distance_from_whole_number, args.main_cn_step_change, + args.min_ps_size, args.min_ps_length, args.cna_threads) helper.time_function("Fit absolute copy number", checkpoints, time_str) - + # cleanup tmpdir + helper.clean_tmpdir(args.tmpdir, outdir) helper.time_function("Total time to perform copy number calling", checkpoints, time_str, final=True) def savana_main(args): @@ -278,6 +285,7 @@ def parse_args(args): run_parser.add_argument('--end_buffer', nargs='?', type=int, default=50, help='Buffer when clustering alternate edge of potential breakpoints, excepting insertions (default=50)') run_parser.add_argument('--threads', nargs='?', type=int, const=0, help='Number of threads to use (default=max)') run_parser.add_argument('--outdir', nargs='?', required=True, help='Output directory (can exist but must be empty)') + run_parser.add_argument('--overwrite', action='store_true', help='Use this flag to write to output directory even if files are present') run_parser.add_argument('-s','--sample', nargs='?', type=str, help="Name to prepend to output files (default=tumour BAM filename without extension)") run_parser.add_argument('--single_bnd', action='store_true', help='Report single breakend variants in addition to standard types (default=False)') run_parser.add_argument('--single_bnd_min_length', nargs='?', type=int, default=1000, help='Minimum length of single breakend to consider (default=100)') @@ -337,29 +345,31 @@ def parse_args(args): train_parser.add_argument('--test_split', nargs='?', type=float, default=0.2, help='Fraction of data to use for test (default=0.2)') train_parser.add_argument('--germline_class', action='store_true', help='Train the model to predict germline and somatic variants (GERMLINE label must be present)') train_parser.add_argument('--outdir', nargs='?', required=True, help='Output directory (can exist but must be empty)') + train_parser.add_argument('--overwrite', action='store_true', help='Use this flag to write to output directory even if files are present') train_parser.add_argument('--threads', nargs='?', type=int, default=16, const=0, help='Number of threads to use') train_parser.set_defaults(func=savana_train) # savana cna cna_parser = subparsers.add_parser("cna", help="run copy number") cna_parser.add_argument('-t','--tumour', nargs='?', type=str, required=True, help='Tumour BAM file (must have index)') - group = cna_parser.add_mutually_exclusive_group() - group.add_argument('-n', '--normal', nargs='?', type=str, required=False, help='Normal BAM file (must have index)') - group.add_argument('-pon', 
'--panel_of_normals', nargs='?', type=str, required=False, help='Path to panel-of-normals (PoN) file') + cna_parser.add_argument('-n', '--normal', nargs='?', type=str, required=False, help='Normal BAM file (must have index)') cna_parser.add_argument('--ref', nargs='?', type=str, required=True, help='Full path to reference genome') cna_parser.add_argument('--sample', nargs='?', type=str, help="Name to prepend to output files (default=tumour BAM filename without extension)") - cna_parser.add_argument('--threads', type=int, default=48, help='number of threads to be used for multiprocessing of chromosomes. Use threads = 1 to avoid multiprocessing.', required=False) + cna_parser.add_argument('--cna_threads', nargs='?', type=int, const=0, help='Number of threads to use for CNA (default=max)') cna_parser.add_argument('--outdir', nargs='?', required=True, help='Output directory (can exist but must be empty)') + cna_parser.add_argument('--overwrite', action='store_true', help='Use this flag to write to output directory even if files are present') + cna_parser.add_argument('--tmpdir', nargs='?', required=False, default='tmp', help='Temp directory for allele counting temp files (defaults to outdir)') allele_group = cna_parser.add_mutually_exclusive_group() allele_group.add_argument('-v', '--phased_vcf', type=str, help='Path to phased vcf file to extract heterozygous SNPs for allele counting.', required=False) allele_group.add_argument('-ac', '--allele_counts_het_snps', type=str, help='Path to allele counts of heterozygous SNPs', required=False) - cna_parser.add_argument('-q', '--allele_mapq', type=int, default=0, help='Mapping quality threshold for reads to be included in the allele counting (default = 0)', required=False) + cna_parser.add_argument('-q', '--allele_mapq', type=int, default=5, help='Mapping quality threshold for reads to be included in the allele counting (default = 5)', required=False) cna_parser.add_argument('-mr', '--allele_min_reads', type=int, default=10, help='Minimum number of reads required per het SNP site for allele counting (default = 10)', required=False) + cna_parser.add_argument('-acw','--ac_window', type=int, default=1000000, help='Window size for allele counting to parallelise (default = 1000000)', required=False) cna_parser.add_argument('-w', '--cn_binsize', type=int, default=10, help='Bin window size in kbp', required=False) cna_parser.add_argument('-b', '--blacklist', type=str, help='Path to the blacklist file', required=False) cna_parser.add_argument('-bp', '--breakpoints', type=str, help='Path to SAVANA VCF file to incorporate savana breakpoints into copy number analysis', required=False) cna_parser.add_argument('-c', '--chromosomes', nargs='+', default='all', help='Chromosomes to analyse. To run on all chromosomes, leave unspecified (default). To run on a subset of chromosomes only, specify the chromosome numbers separated by spaces. For x and y chromosomes, use 23 and 24, respectively. E.g. 
use "-c 1 4 23 24" to run chromosomes 1, 4, X and Y', required=False) - cna_parser.add_argument('-rq', '--readcount_mapq', type=int, default=0, help='Mapping quality threshold for reads to be included in the read counting (default = 5)', required=False) + cna_parser.add_argument('-rq', '--readcount_mapq', type=int, default=5, help='Mapping quality threshold for reads to be included in the read counting (default = 5)', required=False) cna_parser.add_argument('--no_blacklist', dest='blacklisting', action='store_false') cna_parser.set_defaults(blacklisting=True) cna_parser.add_argument('-blt', '--bl_threshold', type=int, default='5', help='Percentage overlap between bin and blacklist threshold to tolerate for read counting (default = 5). Please specify percentage threshold as integer, e.g. "-t 5". Set "-t 0" if no overlap with blacklist is to be tolerated', required=False) @@ -380,12 +390,14 @@ def parse_args(args): cna_parser.add_argument('--max_cellularity', type=float, default=1, help='Maximum cellularity to be considered for copy number fitting. If hetSNPs allele counts are provided, this is estimated during copy number fitting. Alternatively, a purity value can be provided if the purity of the sample is already known.', required=False) cna_parser.add_argument('--cellularity_step', type=float, default=0.01, help='Cellularity step size for grid search space used during for copy number fitting.', required=False) cna_parser.add_argument('--cellularity_buffer', type=float, default=0.1, help='Cellularity buffer to define purity grid search space during copy number fitting (default = 0.1).', required=False) + cna_parser.add_argument('--overrule_cellularity', type=float, default=None, help='Set to sample`s purity if known. This value will overrule the cellularity estimated using hetSNP allele counts (not used by default).', required=False) cna_parser.add_argument('--distance_function', type=str, default='RMSD', help='Distance function to be used for copy number fitting.', choices=['RMSD', 'MAD'], required=False) cna_parser.add_argument('--distance_filter_scale_factor', type=float, default=1.25, help='Distance filter scale factor to only include solutions with distances < scale factor * min(distance).', required=False) cna_parser.add_argument('--distance_precision', type=int, default=3, help='Number of digits to round distance functions to', required=False) cna_parser.add_argument('--max_proportion_zero', type=float, default=0.1, help='Maximum proportion of fitted copy numbers to be tolerated in the zero or negative copy number state.', required=False) cna_parser.add_argument('--min_proportion_close_to_whole_number', type=float, default=0.5, help='Minimum proportion of fitted copy numbers sufficiently close to whole number to be tolerated for a given fit.', required=False) cna_parser.add_argument('--max_distance_from_whole_number', type=float, default=0.25, help='Distance from whole number for fitted value to be considered sufficiently close to nearest copy number integer.', required=False) + cna_parser.add_argument('--main_cn_step_change', type=int, default=1, help='Max main copy number step change across genome to be considered for a given solution.', required=False) cna_parser.add_argument('--min_ps_size', type=int, default=10, help='Minimum size (number of SNPs) for phaseset to be considered for purity estimation.', required=False) cna_parser.add_argument('--min_ps_length', type=int, default=500000, help='Minimum length (bps) for phaseset to be considered for purity estimation.', 
required=False) cna_parser.set_defaults(func=savana_cna) @@ -411,6 +423,7 @@ def parse_args(args): global_parser.add_argument('--end_buffer', nargs='?', type=int, default=50, help='Buffer when clustering alternate edge of potential breakpoints, excepting insertions (default=50)') global_parser.add_argument('--threads', nargs='?', type=int, const=0, help='Number of threads to use (default=max)') global_parser.add_argument('--outdir', nargs='?', required=True, help='Output directory (can exist but must be empty)') + global_parser.add_argument('--overwrite', action='store_true', help='Use this flag to write to output directory even if files are present') global_parser.add_argument('--sample', nargs='?', type=str, help='Name to prepend to output files (default=tumour BAM filename without extension)') global_parser.add_argument('--single_bnd', action='store_true', help='Report single breakend variants in addition to standard types (default=False)') global_parser.add_argument('--single_bnd_min_length', nargs='?', type=int, default=1000, help='Minimum length of single breakend to consider (default=100)') @@ -445,17 +458,18 @@ def parse_args(args): evaluate_group.add_argument('--by_distance', action='store_true', default=True, help='Comparison method: tie-break by min. distance (default)') # cna args global_parser.add_argument('--cna_threads', nargs='?', type=int, const=0, help='Number of threads to use for CNA (default=max)') - global_parser.add_argument('-pon', '--panel_of_normals', nargs='?', type=str, required=False, help='Path to panel-of-normals (PoN) file (only used when no phased VCF provided)') + global_parser.add_argument('--tmpdir', nargs='?', required=False, default='tmp', help='Temp directory for allele counting temp files (defaults to outdir)') allele_group = global_parser.add_mutually_exclusive_group() allele_group.add_argument('-v', '--phased_vcf', type=str, help='Path to phased vcf file to extract heterozygous SNPs for allele counting.', required=False) allele_group.add_argument('-ac', '--allele_counts_het_snps', type=str, help='Path to allele counts of heterozygous SNPs', required=False) - global_parser.add_argument('-q', '--allele_mapq', type=int, default=0, help='Mapping quality threshold for reads to be included in the allele counting (default = 0)', required=False) + global_parser.add_argument('-q', '--allele_mapq', type=int, default=5, help='Mapping quality threshold for reads to be included in the allele counting (default = 5)', required=False) global_parser.add_argument('-mr', '--allele_min_reads', type=int, default=10, help='Minimum number of reads required per het SNP site for allele counting (default = 10)', required=False) + global_parser.add_argument('-acw','--ac_window', type=int, default=1000000, help='Window size for allele counting to parallelise (default = 1000000)', required=False) global_parser.add_argument('-w', '--cn_binsize', type=int, default=10, help='Bin window size in kbp', required=False) global_parser.add_argument('-b', '--blacklist', type=str, help='Path to the blacklist file', required=False) global_parser.add_argument('-bp', '--breakpoints', type=str, help='Path to SAVANA VCF file to incorporate savana breakpoints into copy number analysis', required=False) global_parser.add_argument('-c', '--chromosomes', nargs='+', default='all', help='Chromosomes to analyse. To run on all chromosomes, leave unspecified (default). To run on a subset of chromosomes only, specify the chromosome numbers separated by spaces. 
For x and y chromosomes, use 23 and 24, respectively. E.g. use "-c 1 4 23 24" to run chromosomes 1, 4, X and Y', required=False) - global_parser.add_argument('-rq', '--readcount_mapq', type=int, default=0, help='Mapping quality threshold for reads to be included in the read counting (default = 5)', required=False) + global_parser.add_argument('-rq', '--readcount_mapq', type=int, default=5, help='Mapping quality threshold for reads to be included in the read counting (default = 5)', required=False) global_parser.add_argument('--no_blacklist', dest='blacklisting', action='store_false') global_parser.set_defaults(blacklisting=True) global_parser.add_argument('-blt', '--bl_threshold', type=int, default='5', help='Percentage overlap between bin and blacklist threshold to tolerate for read counting (default = 0, i.e. no overlap tolerated). Please specify percentage threshold as integer, e.g. "-t 5" ', required=False) @@ -476,12 +490,14 @@ def parse_args(args): global_parser.add_argument('--max_cellularity', type=float, default=1, help='Maximum cellularity to be considered for copy number fitting. If hetSNPs allele counts are provided, this is estimated during copy number fitting. Alternatively, a purity value can be provided if the purity of the sample is already known.', required=False) global_parser.add_argument('--cellularity_step', type=float, default=0.01, help='Cellularity step size for grid search space used during for copy number fitting.', required=False) global_parser.add_argument('--cellularity_buffer', type=float, default=0.1, help='Cellularity buffer to define purity grid search space during copy number fitting (default = 0.1).', required=False) + global_parser.add_argument('--overrule_cellularity', type=float, default=None, help='Set to sample`s purity if known. 
This value will overrule the cellularity estimated using hetSNP allele counts (not used by default).', required=False) global_parser.add_argument('--distance_function', type=str, default='RMSD', help='Distance function to be used for copy number fitting.', choices=['RMSD', 'MAD'], required=False) global_parser.add_argument('--distance_filter_scale_factor', type=float, default=1.25, help='Distance filter scale factor to only include solutions with distances < scale factor * min(distance).', required=False) global_parser.add_argument('--distance_precision', type=int, default=3, help='Number of digits to round distance functions to', required=False) global_parser.add_argument('--max_proportion_zero', type=float, default=0.1, help='Maximum proportion of fitted copy numbers to be tolerated in the zero or negative copy number state.', required=False) global_parser.add_argument('--min_proportion_close_to_whole_number', type=float, default=0.5, help='Minimum proportion of fitted copy numbers sufficiently close to whole number to be tolerated for a given fit.', required=False) global_parser.add_argument('--max_distance_from_whole_number', type=float, default=0.25, help='Distance from whole number for fitted value to be considered sufficiently close to nearest copy number integer.', required=False) + global_parser.add_argument('--main_cn_step_change', type=int, default=1, help='Max main copy number step change across genome to be considered for a given solution.', required=False) global_parser.add_argument('--min_ps_size', type=int, default=10, help='Minimum size (number of SNPs) for phaseset to be considered for purity estimation.', required=False) global_parser.add_argument('--min_ps_length', type=int, default=500000, help='Minimum length (bps) for phaseset to be considered for purity estimation.', required=False) global_parser.set_defaults(func=savana_main) diff --git a/savana/segment.py b/savana/segment.py index 7396009..f04c4fc 100644 --- a/savana/segment.py +++ b/savana/segment.py @@ -274,6 +274,7 @@ def segment_copy_number(outdir, smoothened_cn_path, min_segment_size, shuffles, in_data = [] with open(smoothened_cn_path, "r") as file: + header_line = next(file) for line in file: fields = line.strip().split("\t") in_data.append(fields) @@ -305,6 +306,8 @@ def segment_copy_number(outdir, smoothened_cn_path, min_segment_size, shuffles, ### WRITE OUT FILE ### segmented_outpath = f"{outdir}/{prefix}_segmented.tsv" outfile = open(segmented_outpath, "w") + header=[header_line.replace('\n',''), 'seg_id', 'seg_log2r_copynumber'] + outfile.write('\t'.join(header)+'\n') for r in segmentedData: Line = '\t'.join(r) + '\n' outfile.write(Line) @@ -354,6 +357,8 @@ def segment_copy_number(outdir, smoothened_cn_path, min_segment_size, shuffles, # # ax.tick_params(axis='x', rotation=45) # ax.get_figure().savefig(f'{outdir}/{prefix}_segmented_CNprofile.png') + os.remove(smoothened_cn_path) + return segmented_outpath if __name__ == '__main__': diff --git a/savana/smooth.py b/savana/smooth.py index 878caf1..4f416fa 100644 --- a/savana/smooth.py +++ b/savana/smooth.py @@ -129,6 +129,7 @@ def smooth_copy_number(outdir, read_counts_path, smoothing_level, trim): # process input data in_data = [] with open(read_counts_path, "r") as file: + header_line=next(file) for line in file: fields = line.strip().split("\t") in_data.append(fields) @@ -147,11 +148,14 @@ def smooth_copy_number(outdir, read_counts_path, smoothing_level, trim): # Get results and write out outfile_path = 
f"{outdir}/{file_name}_smoothened_sl{smoothing_level}_t{trim}.tsv" outfile = open(outfile_path, "w") + outfile.write(header_line) for r in smoothedData: Line = '\t'.join(r) + '\n' outfile.write(Line) outfile.close() + os.remove(read_counts_path) + return outfile_path if __name__ == "__main__":