diff --git a/workflow/Snakefile b/workflow/Snakefile
index afd0195..82803a0 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -19,12 +19,12 @@ report: "report/workflow.rst"
 
 ##### General wirldcard constrains #####
 
 wildcard_constraints:
-    project="[^/\.]+",
-    assignment="[^/\.]+",
-    condition="[^/_\.]+",
-    replicate="[^/_\.]+",
-    type="(DNA)|(RNA)",
-    config="[^/\.]+",
+    project=r"[^/\.]+",
+    assignment=r"[^/\.]+",
+    condition=r"[^/_\.]+",
+    replicate=r"[^/_\.]+",
+    type=r"(DNA)|(RNA)",
+    config=r"[^/\.]+",
 
 ##### localrules #####
diff --git a/workflow/envs/python3.yaml b/workflow/envs/python3.yaml
index 245e62e..1c94f62 100644
--- a/workflow/envs/python3.yaml
+++ b/workflow/envs/python3.yaml
@@ -12,3 +12,4 @@ dependencies:
   - pandas
   - polars
   - python
+  - pysam
diff --git a/workflow/rules/counts/counts_umi.smk b/workflow/rules/counts/counts_umi.smk
index f620545..5a86e6d 100644
--- a/workflow/rules/counts/counts_umi.smk
+++ b/workflow/rules/counts/counts_umi.smk
@@ -14,8 +14,8 @@ rule counts_umi_create_BAM:
         fw_fastq=lambda wc: getFW(wc.project, wc.condition, wc.replicate, wc.type),
         rev_fastq=lambda wc: getRev(wc.project, wc.condition, wc.replicate, wc.type),
         umi_fastq=lambda wc: getUMI(wc.project, wc.condition, wc.replicate, wc.type),
-        script_FastQ2doubleIndexBAM=getScript("count/FastQ2doubleIndexBAM.py"),
-        script_MergeTrimReadsBAM=getScript("count/MergeTrimReadsBAM.py"),
+        script_FastQ2doubleIndexBAM=getScript("count/FastQ2doubleIndexBAM_python3.py"),
+        script_MergeTrimReadsBAM=getScript("count/MergeTrimReadsBAM_python3.py"),
     output:
         "results/experiments/{project}/counts/useUMI.{condition}_{replicate}_{type}.bam",
     params:
@@ -23,7 +23,7 @@ rule counts_umi_create_BAM:
         umi_length=lambda wc: config["experiments"][wc.project]["umi_length"],
         datasetID="{condition}_{replicate}_{type}",
     conda:
-        "../../envs/python27.yaml"
+        "../../envs/python3.yaml"
     log:
         temp(
             "results/logs/counts/umi/create_BAM.{project}.{condition}.{replicate}.{type}.log"
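The wildcard_constraints hunk above only adds r prefixes; the patterns themselves are unchanged. Plain strings like "[^/\.]+" contain the invalid escape sequence \., which recent Python versions warn about (a DeprecationWarning that becomes a SyntaxWarning from Python 3.12). A minimal sketch of the constraint behaviour, with hypothetical wildcard values:

    import re

    # r"[^/\.]+" and "[^/\.]+" compile to the same regex; the r prefix
    # only avoids the invalid-escape warning on newer Pythons.
    pattern = r"[^/\.]+"
    assert re.fullmatch(pattern, "rep1")            # no '/' or '.' -> accepted
    assert re.fullmatch(pattern, "rep.1") is None   # '.' is rejected
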
diff --git a/workflow/scripts/count/FastQ2doubleIndexBAM_python3.py b/workflow/scripts/count/FastQ2doubleIndexBAM_python3.py
new file mode 100755
index 0000000..4eabf93
--- /dev/null
+++ b/workflow/scripts/count/FastQ2doubleIndexBAM_python3.py
@@ -0,0 +1,311 @@
#!/usr/bin/env python
# -*- coding: ASCII -*-

"""
Extract the index sequence from the middle and end of an Illumina run.
Separates reads for Paired End runs.

:Author: Martin Kircher
:Contact: mkircher@uw.edu
:Date: *06.12.2019
:Type: tool
:Input: fasta, fastq
:Output: BAM

Python 3 updated version
:Updated By: Ali Hassan
:Contact: alihassan1697@gmail.com
:Date: 16.01.2024
"""

import sys
import os

import pysam
from collections import defaultdict
from library_python3 import read_fastq
from optparse import OptionParser, OptionGroup

table = str.maketrans('.', 'N')

ireadlength1 = None
ireadlength2 = None

INF_QUALITY = 200


def read_sequence_file(infile, sec_read_start=None):
    # Yields (seqid, oindseq1, oindqual1, oindseq2, oindqual2, r1, q1, r2, q2) per entry.
    global options
    global ireadlength1, ireadlength2
    ireadl2 = -ireadlength2 if ireadlength2 != 0 else None

    def cut(qual, start, stop):
        # Slice helper that tolerates a missing quality string (FastA input).
        return qual[start:stop] if qual is not None else None

    for seqid, seq, qual in read_fastq(infile):
        seq = seq.translate(table)
        if qual is not None and options.qualityoffset != 33:
            qual = "".join([chr(ord(x) - options.qualityoffset + 33) for x in qual])

        if options.nextera and ireadl2 is not None:
            # Nextera order: both index reads sit at the end of the sequence.
            if sec_read_start is None:
                yield (seqid,
                       seq[-(ireadlength1 + ireadlength2):-ireadlength2],
                       cut(qual, -(ireadlength1 + ireadlength2), -ireadlength2),
                       seq[-ireadlength2:], cut(qual, -ireadlength2, None),
                       seq[:-(ireadlength1 + ireadlength2)],
                       cut(qual, None, -(ireadlength1 + ireadlength2)),
                       None, None)
            else:
                ind1 = sec_read_start - ireadlength1 - ireadlength2
                ind2 = sec_read_start - ireadlength2
                yield (seqid,
                       seq[ind1:ind2], cut(qual, ind1, ind2),
                       seq[ind2:sec_read_start], cut(qual, ind2, sec_read_start),
                       seq[:ind1], cut(qual, None, ind1),
                       seq[sec_read_start:], cut(qual, sec_read_start, None))
        else:
            if sec_read_start is None:
                yield (seqid,
                       seq[-ireadlength1:], cut(qual, -ireadlength1, None),
                       "", None,
                       seq[:-ireadlength1], cut(qual, None, -ireadlength1),
                       None, None)
            else:
                ind1 = sec_read_start - ireadlength1
                if ireadl2 is None:
                    yield (seqid,
                           seq[ind1:sec_read_start], cut(qual, ind1, sec_read_start),
                           "", None,
                           seq[:ind1], cut(qual, None, ind1),
                           seq[sec_read_start:], cut(qual, sec_read_start, None))
                else:
                    # A second index read occupies the end of the sequence.
                    yield (seqid,
                           seq[ind1:sec_read_start], cut(qual, ind1, sec_read_start),
                           seq[ireadl2:], cut(qual, ireadl2, None),
                           seq[:ind1], cut(qual, None, ind1),
                           seq[sec_read_start:ireadl2], cut(qual, sec_read_start, ireadl2))

parser = OptionParser("%prog [options] seq_files")
parser.add_option("-p", "--PIPE", dest="pipe", help="Read from and write to PIPE", default=False, action="store_true")
parser.add_option("-s", "--start", dest="start", help="First base of the second read (default None)", type='int')
parser.add_option("-l", "--ireadlength", dest="ireadlength1",
                  help="Length of index read (in case it is longer than the indexes in the index file provided)",
                  type="int", default=0)
parser.add_option("-m", "--2nd_ireadlength", dest="ireadlength2",
                  help="Length of a second index read (in case it is longer than the indexes in the index file provided)",
                  type="int", default=0)
parser.add_option("--qualityoffset", dest="qualityoffset",
                  help="Offset of quality scores in FastQ input file (default 33)", type="int", default=33)
parser.add_option("--nextera", dest="nextera", help="Order of reads like Nextera", default=False, action="store_true")
parser.add_option("-r", "--RG", dest="readgroup", help="Set read group for all reads", default="")

group = OptionGroup(parser, "Quality filtering of index read")
group.add_option("-q", "--quality", dest="quality",
                 help="Minimum quality score of bases in both indexes (default 0:off)", type='int', default=0)
group.add_option("--qualityN", dest="qualityN",
                 help="Consider subset of bases for quality score filter (e.g. NyyyyyN removes first and last base)",
                 default="")
group.add_option("--2nd_qualityN", dest="qualityN_2nd",
                 help="Consider subset of bases for quality score filter (e.g. NyyyyyN removes first and last base)",
                 default="")
parser.add_option_group(group)

group = OptionGroup(parser, "Output options")
group.add_option("-o", "--outdir", dest="outdir", help="Create output files in another directory.")
group.add_option("", "--outprefix", dest="outprefix", help="Prefix for output files (default 'demultiplex').",
                 default="demultiplex")
group.add_option("", "--SAM", dest="SAM", help="Output SAM not BAM.", default=False, action="store_true")
group.add_option("--summary", dest="summary", help="Show summary for reads written to separate files", default=False,
                 action="store_true")
group.add_option("-v", "--verbose", dest="verbose", help="Turn all messages on", default=False, action="store_true")
parser.add_option_group(group)

(options, args) = parser.parse_args()

if options.outprefix == "":
    sys.stderr.write("Outprefix cannot be empty!\n")
    sys.exit()

if options.outdir != None and not os.path.isdir(options.outdir):
    sys.stderr.write("Output folder does not exist!\n")
    sys.exit()
elif options.outdir == None:
    options.outdir = ""
else:
    options.outdir = options.outdir.rstrip('/') + '/'

if ((options.start != None) and (options.start > 0)):
    options.start -= 1
else:
    options.start = None

ireadlength1 = options.ireadlength1
ireadlength2 = options.ireadlength2

options.qualityN = options.qualityN.upper()
options.qualityN_2nd = options.qualityN_2nd.upper()


def get_min_qual(qualstring, pattern):
    # Minimum quality of an index read (scores are offset-33 at this point);
    # positions marked 'N' in the upper-cased --qualityN pattern are skipped.
    if qualstring is None:
        return INF_QUALITY
    res = INF_QUALITY
    for pos in range(len(qualstring)):
        if pos < len(pattern) and pattern[pos] == 'N':
            continue
        res = min(res, ord(qualstring[pos]) - 33)
    return res


## CREATE OUTPUT FILE(S)/STREAM
fileflags = 'wb'
if options.SAM: fileflags = 'w'
SAMheader = {'HD': {'VN': '1.4', 'SO': 'queryname'}, 'SQ': [{'LN': 0, 'SN': '*'}]}
if options.readgroup != "":
    SAMheader['RG'] = [{'ID': options.readgroup, "PL": "Illumina", "LB": options.readgroup, "SM": options.readgroup}]

if options.verbose: sys.stderr.write("Creating output file/stream...\n")
outfiles = {}

if options.pipe:
    if options.verbose: sys.stderr.write("BAM/SAM output on stdout...\n")
    outfiles[None] = [pysam.Samfile("-", fileflags, header=SAMheader), 0, None]
else:
    outfilename = options.outdir + options.outprefix + ".bam"
    if options.verbose: sys.stderr.write("Creating: %s\n" % outfilename)
    outfiles[None] = [pysam.Samfile(outfilename, fileflags, header=SAMheader), 0, outfilename]

files = args
if options.pipe: files = [None]

for filename in files:
    if filename == None:
        infile = sys.stdin
    else:
        infile = open(filename)

    for seqid, oindseq1, oindqual1, oindseq2, oindqual2, r1, q1, r2, q2 in read_sequence_file(infile,
                                                                                              sec_read_start=options.start):
        is_qcfail = False
        cind = None
        tags = []

        minqual1 = INF_QUALITY
        if options.quality > 0 and ireadlength1 > 0: minqual1 = get_min_qual(oindqual1, options.qualityN)
        minqual2 = INF_QUALITY
        if options.quality > 0 and ireadlength2 > 0: minqual2 = get_min_qual(oindqual2, options.qualityN_2nd)
        if ireadlength2 == 0: indseq2 = None

        if (ireadlength1 > 0) and (ireadlength2 > 0):
            if (minqual1 < options.quality) or (minqual2 < options.quality): is_qcfail = True
        elif (ireadlength1 > 0):
            if (minqual1 < options.quality): is_qcfail = True
        elif (ireadlength2 > 0):
            if (minqual2 < options.quality): is_qcfail = True

        if r2 != None and len(r2) == 0:
            r2 = None
            q2 = None

        if options.readgroup != "": tags.append(("RG", options.readgroup))

        if (len(oindseq1) != 0):
            tags.append(("XI", oindseq1))
            tags.append(("YI", oindqual1))
        if (len(oindseq2) != 0):
            tags.append(("XJ", oindseq2))
            tags.append(("YJ", oindqual2))

        if is_qcfail: tags.append(("ZQ", "I"))
        forward = pysam.AlignedRead()
        forward.qname = seqid
        forward.seq = r1
        if q1 != None:
            forward.qual = q1
        else:
            forward.qual = "*"
        forward.is_unmapped = True
        forward.is_qcfail = is_qcfail
        forward.tags = tags
        forward.pos = -1
        forward.mpos = -1
        if r2 != None:
            forward.is_read1 = True
            forward.is_paired = True
            reverse = pysam.AlignedRead()
            reverse.qname = seqid
            reverse.is_read2 = True
            reverse.is_paired = True
            reverse.seq = r2
            if q2 != None:
                reverse.qual = q2
            else:
                reverse.qual = "*"
            reverse.pos = -1
            reverse.mpos = -1
            reverse.is_qcfail = is_qcfail
            reverse.is_unmapped = True
            forward.mate_is_unmapped = True
            reverse.mate_is_unmapped = True
            reverse.tags = tags
            outfiles[None][1] += 1
            outfiles[None][0].write(forward)
            outfiles[None][0].write(reverse)
        else:
            outfiles[None][1] += 1
            outfiles[None][0].write(forward)

    if filename != None:
        infile.close()

# CLOSE OUTPUT FILE(S), REMOVE EMPTY FILES AND PRINT SUMMARY
for tag, value in outfiles.items():
    value[0].close()
    if value[1] == 0 and value[2] != None:
        try:
            if options.verbose: sys.stderr.write("Removing: %s\n" % (value[2]))
            os.remove(value[2])
        except OSError:
            sys.stderr.write(f"Error: Removing output file {value[2]}\n")
    elif options.summary:
        sys.stderr.write("Summary: Wrote %10d sequences\n" % value[1])
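For orientation when reviewing the loop above: read_sequence_file yields 9-tuples (seqid, oindseq1, oindqual1, oindseq2, oindqual2, r1, q1, r2, q2), with the index read(s) sliced from the end of each sequence. A toy illustration of the default single-read, single-index layout (sequence and lengths are made up):

    # 10 bp insert followed by a 6 bp index read, i.e. ireadlength1 = 6
    seq = "ACGTACGTAC" + "GGATCC"
    ireadlength1 = 6
    index_read, insert = seq[-ireadlength1:], seq[:-ireadlength1]
    assert index_read == "GGATCC"
    assert insert == "ACGTACGTAC"
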
diff --git a/workflow/scripts/count/MergeTrimReadsBAM_python3.py b/workflow/scripts/count/MergeTrimReadsBAM_python3.py
new file mode 100644
index 0000000..fd86182
--- /dev/null
+++ b/workflow/scripts/count/MergeTrimReadsBAM_python3.py
@@ -0,0 +1,406 @@
#!/usr/bin/env python
# -*- coding: ASCII -*-

"""
Merge/adapter-trim reads stored in BAM.

:Author: Martin Kircher
:Contact: martin.kircher@bihealth.de
:Date: *19.04.2018
:Type: tool
:Input: BAM
:Output: BAM

Python 3 updated version
:Updated By: Ali Hassan
:Contact: alihassan1697@gmail.com
:Date: 16.01.2024
"""

import sys, os
import math
import random

import pysam
from optparse import OptionParser, OptionGroup
import string

from MergeTrimReads_python3 import set_adapter_sequences, set_options, set_keys, process_SR, process_PE

table = str.maketrans('TGCA', 'ACGT')  # COMPLEMENT DNA

maxadapter_comp = 30
min_length = 5
notified = set()


def revcompl(seq):
    global table
    seq = seq.translate(table)
    return seq[::-1]


# ADD QUALITY FILTER FLAG TO THE ZQ READ TAG
def add_quality_flag(tags, qtype):
    qc_tag_helper = []
    foundzq = False
    if tags != None:
        for tag, value in tags:
            if tag == "ZQ":
                foundzq = True
                qc_tag = list(set(list(value + qtype)))
                qc_tag.sort()
                qc_tag_helper.append((tag, "".join(qc_tag)))
            else:
                qc_tag_helper.append((tag, value))
    if not foundzq: qc_tag_helper.append(("ZQ", qtype))
    return qc_tag_helper


# CREATE NEW BAM READ FROM ORIGINAL READS AND MERGED SEQUENCE/QUALITY
def create_new_read(new_seq, new_qual, read1, read2):
    global notified
    new = pysam.AlignedRead()
    if not read1.qname.startswith("M_"):
        new.qname = "M_" + read1.qname
    else:
        new.qname = read1.qname
    new.seq = new_seq
    new.qual = new_qual
    new.is_unmapped = True
    new.pos = -1
    new.mpos = -1

    new.is_qcfail = read1.is_qcfail and read2.is_qcfail
    if read1.tags != None:
        htags = dict(read1.tags)
    else:
        htags = {}
    if (len(new_seq) < min_length):
        new.is_qcfail = True
        if "ZQ" in 
htags: htags["ZQ"]+="L" + else: htags["ZQ"]="L" + stags = set() + new_tags = [] + if read2.tags != None: + for tag,value in read2.tags: + stags.add(tag) + if tag == "NM" or tag == "MD": continue + elif tag in htags and value != htags[tag]: # NEW TAG DIFF VALUE + if tag == "ZQ": + qc_tag = list(set(list(value+htags[tag]))) + qc_tag.sort() + new_tags.append((tag,"".join(qc_tag))) + else: + if tag not in notified: + sys.stderr.write("Do not know how to combine %s BAM tags. Information of one of the reads will get lost during merging.\n"%tag) + notified.add(tag) + elif tag in htags and value == htags[tag]: # SAME TAG AND VALUE + new_tags.append((tag,value)) + else: # NEW TAG + new_tags.append((tag,value)) + for tag,value in htags.items(): + if tag not in stags: new_tags.append((tag,value)) + new.tags = new_tags + return new + + +options_adapter_F="AGATCGGAAGAGCACACGTCTGAACTCCAGTCACIIIIIIIATCTCGTATGCCGTCTTCTGCTTG" +options_adapter_S="AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTT" +options_adapter_chimera="ACACTCTTTCCCTACACGTCTGAACTCCAG,ACACTCTTTCCCACACGTCTGAACTCCAGT,ACACTCTTTCCCTACACACGTCTGAACTCC,CTCTTTCCCTACACGTCTGAACTCCAGTCA,GAAGAGCACACGTCTGAACTCCAGTCACII,GAGCACACGTCTGAACTCCAGTCACIIIII,GATCGGAAGAGCACACGTCTGAACTCCAGT,AGATCGGAAGAGCACACGTCTGAACTCCAG,AGAGCACACGTCTGAACTCCAGTCACIIII,ACACGTCTGAACTCCAGTCACIIIIIIIAT,GTGCACACGTCTGAACTCCAGTCACIIIII,AGCACACGTCTGAACTCCAGTCACIIIIII,CGTATGCCGTCTTCTGCTTGAAAAAAAAAA" + +parser = OptionParser("%prog [options] BAMfile") +parser.add_option("-p","--PIPE",dest="pipe",help="Read BAM from and write it to PIPE",default=False,action="store_true") +parser.add_option("-o", "--outdir", dest="outdir", help="Create output files in another directory.") +parser.add_option("", "--outprefix", dest="outprefix", help="Prefix for output files (default 'MergeTrim').",default="MergeTrim") +parser.add_option("", "--SAM", dest="SAM", help="Output SAM not BAM.",default=False,action="store_true") +parser.add_option("-v", "--verbose", dest="verbose", help="Turn all messages on",default=False,action="store_true") + +group = OptionGroup(parser, "Paired End merging/Single Read trimming options") +group.add_option("--mergeoverlap",dest="mergeoverlap",help="Merge PE reads of molecules longer than read length that show a minimum overlap (default False)",default=False,action="store_true") +group.add_option("--onlyoverlap",dest="onlyoverlap",help="Only sequence overlapping between two reads (default False)",default=False,action="store_true") +group.add_option("-f", "--adapterFirstRead", dest="adapter_F", help="Adapter that is observed after the forward read (def. Multiplex: %s)"%options_adapter_F[:maxadapter_comp],default=options_adapter_F) +group.add_option("-s", "--adapterSecondRead", dest="adapter_S", help="Adapter that is observed after the reverse read (def. Multiplex: %s)"%options_adapter_S[:maxadapter_comp],default=options_adapter_S) +group.add_option("-c", "--FirstReadChimeraFilter", dest="adapter_chimera", help="If the forward read looks like this sequence, the cluster is filtered out. Provide several sequences separated by comma.(def. Multiplex: %s)"%(",".join([x[:maxadapter_comp] for x in options_adapter_chimera.split(',')])),default=options_adapter_chimera) +group.add_option("-k","--key",dest="key",help="Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (def. '')",default="") +group.add_option("-i","--allowMissing",dest="allowMissing",help="Allow one base in one key to be missing or wrong. (def. 
False)",default=False,action="store_true") +group.add_option("-t","--trimCutoff",dest="trimCutoff",help="Lowest number of adapter bases to be observed for Single Read trimming (default 1)",default=1,type="int") +group.add_option("--insertOne",dest="insertOne",help="For reads failing regular merging (not overlapping read ends!), check whether they can be merged by inserting 1 base (default off)",default=False,action="store_true") +group.add_option("--noSecondTrim",dest="noSecondTrim",help="Do not trim single end reads starting in T_ (def. False)",default=False,action="store_true") +group.add_option("--minoverlap",dest="minoverlap",help="Minimum overlap of merged PE reads (default 11)",default=11,type="int") +parser.add_option_group(group) +(options, args) = parser.parse_args() + +if options.onlyoverlap: options.mergeoverlap = True + +set_adapter_sequences(options.adapter_F, options.adapter_S, options.adapter_chimera, maxadapter_comp) +set_options(options.trimCutoff, options.allowMissing, options.mergeoverlap, options.onlyoverlap, options.minoverlap-1) +if not set_keys(options.key): + sys.exit() + +if options.outprefix == "": + sys.stderr.write("Outprefix can not be empty!\n") + sys.exit() + +if options.outdir != None and not os.path.isdir(options.outdir): + sys.stderr.write("Output folder does not exist!\n") + sys.exit() +elif options.outdir == None: + options.outdir = "" +else: + options.outdir = options.outdir.rstrip('/')+'/' + +################################ +## PROCESS FILES +################################ + +## CREATE OUTPUT FILE(S)/STREAM +fileflags = 'wb' +if options.SAM: fileflags = 'w' + +files = args +if options.pipe: files = [None] + +outfile = None +count_all = 0 +count_fkey = 0 +count_merged = 0 +count_merged_overlap = 0 +count_trimmed = 0 +count_nothing = 0 +count_chimera = 0 + +for filename in files: + if filename == None: + infile = pysam.Samfile( "-", 'rb' ) + else: + infile = pysam.Samfile( filename, 'rb' ) + + cheader = infile.header + if ('HD' in cheader) and ('SO' in cheader['HD']) and (cheader['HD']['SO'] != 'queryname'): + sys.stderr.write("Input BAM has wrong sorting order. 
Skipping input...\n")
        continue

    if outfile == None:
        if options.verbose: sys.stderr.write("Creating output files/streams...\n")

        if options.pipe:
            if len(cheader) == 0:
                cheader['HD'] = {'VN': '1.0'}
            outfile = pysam.Samfile( "-", fileflags, header = cheader)
            if options.verbose: sys.stderr.write("BAM/SAM output on stdout...\n")
        else:
            outfilename = options.outdir+options.outprefix+".bam"
            if options.verbose: sys.stderr.write("Creating: %s\n"%outfilename)
            outfile = pysam.Samfile( outfilename, fileflags, header = cheader)

    bread = pysam.AlignedRead()
    for read in infile:
        if read.is_paired and bread.qname == "":
            bread = read
            continue
        elif read.is_paired and bread.qname != "":
            if read.qname == bread.qname:
                count_all += 1
                # SAVE SEQUENCES IN MERGING VARIABLES
                if bread.is_read1:
                    if bread.is_reverse:
                        read1 = revcompl(bread.seq)
                        qual1 = bread.qual[::-1]
                    else:
                        read1 = bread.seq
                        qual1 = bread.qual
                    if read.is_reverse:
                        read2 = revcompl(read.seq)
                        qual2 = read.qual[::-1]
                    else:
                        read2 = read.seq
                        qual2 = read.qual
                else:
                    if read.is_reverse:
                        read1 = revcompl(read.seq)
                        qual1 = read.qual[::-1]
                    else:
                        read1 = read.seq
                        qual1 = read.qual
                    if bread.is_reverse:
                        read2 = revcompl(bread.seq)
                        qual2 = bread.qual[::-1]
                    else:
                        read2 = bread.seq
                        qual2 = bread.qual

                # IF WE HAVE NO QUALITY SCORES, ASSUME EQUAL QUALITY SCORES OF 15
                if qual1 == "*": qual1 = len(read1)*"0"
                if qual2 == "*": qual2 = len(read2)*"0"

                flag,newseq,newqual = process_PE(read1,qual1,read2,qual2)

                if flag != '':
                    bread.is_qcfail = True
                    bread.tags = add_quality_flag(bread.tags,flag)
                    read.is_qcfail = True
                    read.tags = add_quality_flag(read.tags,flag)
                    if flag == 'K': count_fkey += 1
                    elif flag == 'D': count_chimera += 1

                if newseq != '':
                    if flag == '':
                        if len(newseq) > max(len(read1),len(read2)): count_merged_overlap += 1
                        else: count_merged += 1
                    new = create_new_read(newseq,newqual,bread,read)
                    outfile.write(new)
                else:
                    if options.insertOne and flag == '':
                        success = False

                        # SET OPTIONS TO DEACTIVATE OVERLAP MERGING
                        set_options(options.trimCutoff, options.allowMissing, False, False, options.minoverlap-1)

                        # CHECK WHETHER IT ACTUALLY COULD BE AN INDEL PROBLEM
                        # (integer division: cutpoint is used as a slice index)
                        segment = False
                        if len(read1) <= len(read2):
                            cutpoint = len(read1)//2
                            nread1 = read1[cutpoint:]
                            nqual1 = qual1[cutpoint:]
                            flag_,newseq_,newqual_ = process_PE(nread1,nqual1,read2,qual2)
                            if newseq_ != "": segment = True
                            else:
                                nread1 = read1[:cutpoint]
                                nqual1 = qual1[:cutpoint]
                                flag_,newseq_,newqual_ = process_PE(nread1,nqual1,read2,qual2)
                                if newseq_ != "": segment = True
                        else:
                            cutpoint = len(read2)//2
                            nread2 = read2[cutpoint:]
                            nqual2 = qual2[cutpoint:]
                            flag_,newseq_,newqual_ = process_PE(read1,qual1,nread2,nqual2)
                            if newseq_ != "": segment = True
                            else:
                                nread2 = read2[:cutpoint]
                                nqual2 = qual2[:cutpoint]
                                flag_,newseq_,newqual_ = 
process_PE(read1,qual1,nread2,nqual2) + if newseq_ != "": segment = True + + if segment: + for i in range(len(read1)): # CHECK INSERTION IN FORWARD READ + nread1 = read1[:i]+"I"+read1[i:] + nqual1 = qual1[:i]+"!"+qual1[i:] + flag_,newseq_,newqual_ = process_PE(nread1,nqual1,read2,qual2) + + if flag_ != '': + bread.is_qcfail = True + bread.tags = add_quality_flag(bread.tags,flag_) + read.is_qcfail = True + read.tags = add_quality_flag(read.tags,flag_) + if flag_ == 'K': count_fkey += 1 + elif flag_ == 'D': count_chimera += 1 + outfile.write(bread) + outfile.write(read) + success = True + break + elif newseq_ != '': + #sys.stderr.write("Index of insert in forward read: %d of %d\n"%(i,len(read1))) + if flag_ == '': + if len(newseq_) > max(len(read1)+1,len(read2)): count_merged_overlap += 1 + else: count_merged += 1 + new = create_new_read(newseq_,newqual_,bread,read) + outfile.write(new) + success = True + break + + if not success: + for i in range(len(read2)): # CHECK INSERTION IN REVERSE READ + nread2 = read2[:i]+"I"+read2[i:] + nqual2 = qual2[:i]+"!"+qual2[i:] + flag_,newseq_,newqual_ = process_PE(read1,qual1,nread2,nqual2) + + if flag_ != '': + bread.is_qcfail = True + bread.tags = add_quality_flag(bread.tags,flag_) + read.is_qcfail = True + read.tags = add_quality_flag(read.tags,flag_) + if flag_ == 'K': count_fkey += 1 + elif flag_ == 'D': count_chimera += 1 + outfile.write(bread) + outfile.write(read) + success = True + break + elif newseq_ != '': + #sys.stderr.write("Index of insert in reverse read: %d of %d\n"%(i,len(read2))) + if flag_ == '': + if len(newseq_) > max(len(read1),len(read2)+1): count_merged_overlap += 1 + else: count_merged += 1 + new = create_new_read(newseq_,newqual_,bread,read) + outfile.write(new) + success = True + break + + if not success: + if flag == '': count_nothing += 1 + outfile.write(bread) + outfile.write(read) + # RESET OPTIONS + set_options(options.trimCutoff, options.allowMissing, options.mergeoverlap, options.onlyoverlap, options.minoverlap-1) + else: + if flag == '': count_nothing += 1 + outfile.write(bread) + outfile.write(read) + bread = pysam.AlignedRead() + else: + if options.verbose: + sys.stderr.write('Warning: Skipping read from incomplete pair\n') + bread = read + else: # Single End read + count_all += 1 + if options.noSecondTrim and read.qname.startswith("T_"): + count_nothing += 1 + outfile.write(read) + continue + + read1 = read.seq + qual1 = read.qual + if qual1 == "*": qual1 = len(read1)*"0" + flag,newseq,newqual = process_SR(read1,qual1) + + if flag != '': + read.is_qcfail = True + read.tags = add_quality_flag(read.tags,flag) + if flag == 'K': count_fkey += 1 + elif flag == 'D': count_chimera += 1 + + if newseq != '': + if not read.qname.startswith("T_"): read.qname = "T_"+read.qname + read.seq = newseq + read.qual = newqual + read.is_unmapped = True + read.pos = -1 + read.mpos = -1 + read.cigar = None + read.mapq = 0 + read.tid = 0 + read.mrnm = 0 + new_tags = [] + for tag,value in read.tags: + if tag != "NM" and tag != "MD": new_tags.append((tag,value)) + read.tags = new_tags + if flag == '': count_trimmed += 1 + outfile.write(read) + else: + if flag == '': count_nothing += 1 + outfile.write(read) + + if options.verbose and (count_all % 10000 == 0) and (count_all > 0): + sys.stderr.write("<==> Total %d; Merged (trimming) %d; Merged (overlap) %d; Kept PE/SR %d; Trimmed SR %d; Adapter dimers/chimeras %d; Failed Key %d\n"%(count_all,count_merged,count_merged_overlap,count_nothing,count_trimmed,count_chimera,count_fkey)) + + if not 
options.pipe: + infile.close() + +sys.stderr.write("Total %d; Merged (trimming) %d; Merged (overlap) %d; Kept PE/SR %d; Trimmed SR %d; Adapter dimers/chimeras %d; Failed Key %d\n"%(count_all,count_merged,count_merged_overlap,count_nothing,count_trimmed,count_chimera,count_fkey)) +if outfile != None: + outfile.close() diff --git a/workflow/scripts/count/MergeTrimReads_python3.py b/workflow/scripts/count/MergeTrimReads_python3.py new file mode 100644 index 0000000..e5b55a9 --- /dev/null +++ b/workflow/scripts/count/MergeTrimReads_python3.py @@ -0,0 +1,574 @@ +#!/usr/bin/env python +# -*- coding: ASCII -*- + +""" + +Merge/Adapter trim reads stored in BAM + +:Author: Martin Kircher +:Contact: Martin.Kircher@eva.mpg.de +:Date: *02.01.2012 +:Type: module for C cross-compilation + + +Python3 Updated version +:Updated By: Ali Hassan +:Contact: alihassan1697@gmail.com +:Date: 16.01.2024 + +""" + +import sys, os +import math +import random +import string + +table = str.maketrans('TGCA','ACGT') # COMPLEMENT DNA + + +####################################### +# ARBITRARY PARAMETERS AND DEFAULTS +####################################### + +offset = 33 +cutoff_merge_trim = 0.80 +cutoff_merge_seqs_early = 0.95 +cutoff_merge_seqs = 0.90 +options_min_overlap_seqs = 10 # corresponds n+1 bases overlap +max_prob_same_channel = 0.5 +max_prob_N = 0.25 +maxadapter_comp = 30 +min_length = 5 + +options_adapter_F=["AGATCGGAAGAGCACACGTCTGAACTCCAGTCACIIIIIIIATCTCGTATGCCGTCTTCTGCTTG"] +options_adapter_S=["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTT"] +options_adapter_chimera="ACACTCTTTCCCTACACGTCTGAACTCCAG,ACACTCTTTCCCACACGTCTGAACTCCAGT,ACACTCTTTCCCTACACACGTCTGAACTCC,CTCTTTCCCTACACGTCTGAACTCCAGTCA,GAAGAGCACACGTCTGAACTCCAGTCACII,GAGCACACGTCTGAACTCCAGTCACIIIII,GATCGGAAGAGCACACGTCTGAACTCCAGT,AGATCGGAAGAGCACACGTCTGAACTCCAG,AGAGCACACGTCTGAACTCCAGTCACIIII,ACACGTCTGAACTCCAGTCACIIIIIIIAT,GTGCACACGTCTGAACTCCAGTCACIIIII,AGCACACGTCTGAACTCCAGTCACIIIIII,CGTATGCCGTCTTCTGCTTGAAAAAAAAAA" +adapter_chimeras = (options_adapter_chimera).split(",") +options_trimCutoff = 1 +keys = ('','') +len_key1 = len(keys[0]) +len_key2 = len(keys[1]) +handle_key = (len_key1 > 0) or (len_key2 > 0) +options_allowMissing = False +options_mergeoverlap = False +options_onlyoverlap = False + +####################################### +# HELPER FUNCTIONS +####################################### + +# CALCULATE THE EDIT DISTANCE BETWEEN TWO READS +def edits(seq1,seq2): + lmin = min(len(seq1),len(seq2)) + lmax = max(len(seq1),len(seq2)) + dist = lmax-lmin + for pos in range(lmin): + if (seq1[pos] != seq2[pos]): dist+=1 + return dist + + +##CALCULATE QUALITY SCORE CORRECTED IDENTITY BETWEEN TWO READS +##FIRST READ DEFINES MAX LENGTH OF COMPARISION +def quality_ident(seq1,qual1,seq2,qual2=[],maxcomp=-1): + global max_prob_same_channel + global max_prob_N + ident = 0.0 + res = 0.0 + + if len(qual2) != 0 and len(seq2) < len(seq1): + hseq1 = seq1 + hqual1 = qual1 + qual1 = qual2 + seq1 = seq2 + qual2 = hqual1 + seq2 = hseq1 + + if (maxcomp < 0) or (maxcomp > len(seq1)): + maxcomp=len(seq1) + + if (len(seq1) >= maxcomp) and (len(seq2) >= maxcomp) and (len(qual1) >= maxcomp): + if len(qual2) == 0: # IN CASE WE DO NOT HAVE A SECOND QUALITY SCORE + for pos in range(maxcomp): + if (seq1[pos] != seq2[pos]) and (seq1[pos] != "I") and (seq2[pos] != "I"): + ident += 1.0-qual1[pos] + elif (seq1[pos] == "I") or (seq2[pos] == "I"): + maxcomp-=1 + else: + ident += 1.0 + #for pos in range(maxcomp): + #if (seq1[pos] != seq2[pos]): + #if (seq1[pos] == "A" and seq2[pos] == 
"C") or (seq1[pos] == "C" and seq2[pos] == "A") or (seq1[pos] == "G" and seq2[pos] == "T") or (seq1[pos] == "T" and seq2[pos] == "G"): + #ident += 1.0-min(max_prob_same_channel,qual1[pos]) + #elif (seq1[pos] == "I") or (seq2[pos] == "I"): + #ident += 1.0 + #elif (seq1[pos] == "N") or (seq2[pos] == "N"): + #ident += 1.0-min(max_prob_N,qual1[pos]) + #else: + #ident += 1.0-qual1[pos] + #else: + #ident += 1.0 + else: + for pos in range(maxcomp): + if (seq1[pos] != seq2[pos]) and (seq1[pos] != "I") and (seq2[pos] != "I"): + val = qual1[pos] if qual1[pos] < qual2[pos] else qual2[pos] + ident += 1.0-val + elif (seq1[pos] == "I") or (seq2[pos] == "I"): + maxcomp-=1 + else: + ident += 1.0 + else: + sys.stderr.write("Quality_ident: Call with invalid arguments!\n") + print((len(seq1) >= maxcomp),(len(seq2) >= maxcomp),(len(qual1) >= maxcomp),len(seq2) , maxcomp) + sys.exit() + return 0.0 + + if maxcomp > 0: + res = ident/float(maxcomp) + return res + +# CONVERTE QUALITY STRING TO LOG SCORES (INTS) +def convert_quality_logprob(qualstring): + global offset + return [ord(x)-offset for x in qualstring] + +# CONVERTE LOG SCORES TO QUALITY STRING +def convert_logprob_quality(probs): + global offset + return "".join([chr(max(33,min(126,x+offset))) for x in probs]) + +# CONVERTE LOG SCORES TO PROBABLILITIES FOR IDENTITY CALCULATION +def convert_logprob_prob(lprobs): + res = [] + for prob in lprobs: + val = 1.0-math.pow(10.0,prob/-10.0) + if val > max_prob_N: res.append(val) + else: res.append(max_prob_N) + return res + +# REVERSE COMPLEMENT A DNA SEQUENCE +def revcompl(seq): + global table + return "".join(seq.translate(table)[::-1]) + +# WARNING: NUMERIC INSTABLE CALCULATION WITH REAL NUMBERS +def cons_base_prob(base1,base2,prob1,prob2): + aprob1 = math.log10(1.0-prob1)-math.log10(3.0) + lprob1 = math.log10(prob1) + lprob2 = math.log10(prob2) + aprob2 = math.log10(1.0-prob2)-math.log10(3.0) + bases = [base1,base2] + if base1 == base2: bases = [base1] + + total_prob = 0.0 + for char_elem in 'ACGT': + help = 0.0 + if base1 == char_elem: help+=lprob1 + else: help+=aprob1 + if base2 == char_elem: help+=lprob2 + else: help+=aprob2 + total_prob+=(10**help) + total_prob = math.log10(total_prob) + call_base = 'N' + call_quality = -1 + for char_call in bases: + thelp = 0.0 + for char_elem in 'ACGT': + if char_elem != char_call: + help = 0.0 + if base1 == char_elem: help+=lprob1 + else: help+=aprob1 + if base2 == char_elem: help+=lprob2 + else: help+=aprob2 + thelp+=(10**help) + thelp = math.log10(thelp) + val = int(round(-10.0*(thelp-total_prob))) + hqual = 60 if val > 60 else val + #sys.stderr.write("\t".join([base1,base2,prob1,prob2,lprob1,lprob2,aprob1,aprob2,help,total_prob,hqual])+"\n") + if (hqual > call_quality) or ((hqual == call_quality) and (random.random() >=0.5)): + call_base = char_call + call_quality = hqual + #sys.stderr.write("\t".join([call_base,call_quality])+"\n") + return call_base,call_quality + + +# OUT SOURCED TEST FOR MERGING TRIMMED READS, ROBUST TO DIFFERENT LENGTH +def check_merge(read1,qual1,pqual1,read2,qual2,pqual2): + global cutoff_merge_seqs + global options_onlyoverlap + new_seq = "" + new_qual = [] + lread1 = len(read1) + lread2 = len(read2) + if (lread1 > 0) and (lread2 > 0): + oident = quality_ident(read2,qual2,read1,qual1) + if oident > cutoff_merge_seqs: + spos = -1 + new_lseq = list(read1) + new_qual = pqual1 + for pos in range(lread1,lread2): + new_lseq.append(read2[pos]) + new_qual.append(pqual2[pos]) + for pos in range(lread2): + if spos == -1 and read2[pos] != "I" and 
read1[pos] != "I": spos = pos + if (new_lseq[pos] == "I") or ((new_lseq[pos] == "N") and (read2[pos] != "N") and (read2[pos] != "I")): + new_lseq[pos] = read2[pos] + new_qual[pos] = pqual2[pos] + elif (pos < lread1) and (read1[pos] != "N") and (read2[pos] != "N") and (read1[pos] != "I") and (read2[pos] != "I"): + nbase,nqual = cons_base_prob(read1[pos],read2[pos],qual1[pos],qual2[pos]) + new_qual[pos] = nqual + new_lseq[pos] = nbase + if spos == -1 : spos = 0 + if options_onlyoverlap: + new_seq = "".join(new_lseq[spos:lread1+1]) + new_qual = new_qual[spos:lread1+1] + #if read2.startswith("I") or read1.startswith("I"): + #sys.stderr.write("OnlyOverlap mode %d %d!\n"%(spos,lread1+1)) + else: + #if read2.startswith("I") or read1.startswith("I"): + #sys.stderr.write("Not in OnlyOverlap mode %d %d!\n"%(spos,lread1+1)) + new_seq = "".join(new_lseq) + return new_seq,convert_logprob_quality(new_qual) + + +####################################### +# 'INTERFACE' FUNCTIONS +####################################### + +def set_options(trimcutoff=1,allowMissing=False,mergeoverlap=False,onlyoverlap=False, min_overlap_seqs=10): + global options_trimCutoff + global options_allowMissing + global options_mergeoverlap + global options_onlyoverlap + global options_min_overlap_seqs + options_trimCutoff = trimcutoff + options_allowMissing = allowMissing + options_mergeoverlap = mergeoverlap + options_onlyoverlap = onlyoverlap + options_min_overlap_seqs = min_overlap_seqs + +def set_adapter_sequences(forward='',reverse='',chimera='',max_comp=30): + global options_adapter_F, options_adapter_S + global options_adapter_chimera + global adapter_chimeras + global maxadapter_comp + options_adapter_F = (forward.upper()).split(',') + options_adapter_S = reverse.upper().split(',') + options_adapter_chimera = chimera.upper() + adapter_chimeras = (options_adapter_chimera).split(",") + for adapter_F in options_adapter_F: + if adapter_F not in adapter_chimeras: adapter_chimeras.append(adapter_F) + helpremove = [] + for ind in range(len(adapter_chimeras)): + adapter_chimeras[ind]=adapter_chimeras[ind].strip() + if (len(adapter_chimeras[ind]) > 0) and (len(adapter_chimeras[ind]) < maxadapter_comp+1): + #print "Extending Chimera sequence by Is (sequence shorter than maxadapter_comp)" + while (len(adapter_chimeras[ind]) < (maxadapter_comp+1)): + adapter_chimeras[ind] += "I" + elif (len(adapter_chimeras[ind]) == 0): + helpremove.append(ind) + for ind in helpremove[::-1]: + adapter_chimeras.pop(ind) + + for i in range(len(options_adapter_F)): + if (len(options_adapter_F[i]) < maxadapter_comp): + #print "Extending first adapter by Is (Adapter shorter than read)" + while (len(options_adapter_F[i]) < maxadapter_comp): + options_adapter_F[i] += "I" + + for i in range(len(options_adapter_S)): + if (len(options_adapter_S[i]) < maxadapter_comp): + #print "Extending second adapter by Is (Adapter shorter than read)" + while (len(options_adapter_S[i]) < maxadapter_comp): + options_adapter_S[i] += "I" + +def set_keys(key_text): + global keys + global len_key1 + global len_key2 + global handle_key + key_text = key_text.upper() + if key_text.count(",") == 1: + keys = (key_text.split(',')[0],key_text.split(',')[1]) + elif key_text.count(",") == 0: + keys = (key_text,key_text) + else: + sys.stderr.write("Unexpected number of keys specified.\n") + return False + len_key1 = len(keys[0]) + len_key2 = len(keys[1]) + handle_key = (len_key1 > 0) or (len_key2 > 0) + return True + + +def process_PE(read1,qual1,read2,qual2): + if len(read1) != len(qual1) or 
len(read2) != len(qual2): + sys.stderr.write("Error: Length of quality score and read strings do not match!\n") + return '','','' + + # CHECK KEY, IF KEY SEQUENCE SPECIFIED AND MATCHES REMOVE KEY FROM BOTH ENDS + if handle_key and len(read1) > 0: + if ((read1[:len_key1] == keys[0]) and (read2[:len_key2] == keys[1])) or (options_allowMissing and (edits(read1[:len_key1],keys[0]) == 1) and (read2[:len_key2] == keys[1])) or (options_allowMissing and (read1[:len_key1] == keys[0]) and (edits(read2[:len_key2],keys[1]) == 1)): + read1 = read1[len_key1:] + qual1 = qual1[len_key1:] + read2 = read2[len_key2:] + qual2 = qual2[len_key2:] + elif options_allowMissing and (read1[:len_key1-1] == keys[0][1:]) and (read2[:len_key2] == keys[1]): + read1 = read1[len_key1-1:] + qual1 = qual1[len_key1-1:] + read2 = read2[len_key2:] + qual2 = qual2[len_key2:] + elif options_allowMissing and (read1[:len_key1] == keys[0]) and (read2[:len_key2-1] == keys[1][1:]): + read1 = read1[len_key1:] + qual1 = qual1[len_key1:] + read2 = read2[len_key2-1:] + qual2 = qual2[len_key2-1:] + else: + return 'K','','' + + # CHECK ADAPTER CHIMERAS + if len(adapter_chimeras) > 0: + lqual1 = convert_quality_logprob(qual1) + pqual1 = convert_logprob_prob(lqual1) + lread1 = len(read1) + for chimera in adapter_chimeras: + if quality_ident(read1,pqual1,chimera,maxcomp=maxadapter_comp) > cutoff_merge_trim: + read1 = "" + read2 = "" + break + elif (quality_ident(read1,pqual1,chimera[1:],maxcomp=maxadapter_comp) > cutoff_merge_trim): # CONSIDER LOSING FIRST BASE... + read1 = "" + read2 = "" + break + if read1 == "" or read2 == "": + return 'D','','' + + lread1 = len(read1) + lread2 = len(read2) + ## CONVERTE QUALITY STRINGS TO PROBABILIITIES... + lqual1 = convert_quality_logprob(qual1) + pqual1 = convert_logprob_prob(lqual1) + lqual2 = convert_quality_logprob(qual2) + pqual2 = convert_logprob_prob(lqual2) + + rread2 = revcompl(read2) + rqual2 = list(pqual2) + rqual2.reverse() + rlqual2 = list(lqual2) + rlqual2.reverse() + mlength = min(lread1,lread2) + clength = max(lread1,lread2) + + ## CHECK FOR MERGING WITH ADAPTERS ... 
+ have_merged = False + for start in range(mlength+1)[::-1]: ## COMMON OVERLAP + cval1 = 0.0 + for i in range(len(options_adapter_F)): + hcval = quality_ident(read1[start:],pqual1[start:],options_adapter_F[i],maxcomp=maxadapter_comp) + if hcval > cval1: cval1 = hcval + cval2 = 0.0 + for i in range(len(options_adapter_S)): + hcval = quality_ident(read2[start:],pqual2[start:],options_adapter_S[i],maxcomp=maxadapter_comp) + if hcval > cval2: cval2 = hcval + + new_seq,new_qual = check_merge(read1[:start],pqual1[:start],lqual1[:start],rread2[max(0,lread2-start):],rqual2[max(0,lread2-start):],rlqual2[max(0,lread2-start):]) + if (new_seq != "") and (cval1 > cutoff_merge_trim or cval2 > cutoff_merge_trim): + have_merged = True + return '',new_seq,new_qual + + if not have_merged: + for start in range(mlength+1,clength+1): ## SEQUENCE LEFT FOR ASYMMETRIC READ LENGTH + cval1 = 0.0 + for i in range(len(options_adapter_F)): + hcval = quality_ident(read1[start:],pqual1[start:],options_adapter_F[i],maxcomp=maxadapter_comp) + if hcval > cval1: cval1 = hcval + cval2 = 0.0 + for i in range(len(options_adapter_S)): + hcval = quality_ident(read2[start:],pqual2[start:],options_adapter_S[i],maxcomp=maxadapter_comp) + if hcval > cval2: cval2 = hcval + + help_rread2 = "" + help_lqual2 = [] + help_qual2 = [] + for ipos in range(max(0,start-lread2)): + help_rread2 += "I" + help_qual2.append(0.0) + help_lqual2.append(0) + new_seq,new_qual = check_merge(read1[:start],pqual1[:start],lqual1[:start],help_rread2+rread2[max(0,lread2-start):],help_qual2+rqual2[max(0,lread2-start):],help_lqual2+rlqual2[max(0,lread2-start):]) + if (new_seq != "") and ((cval1 > cutoff_merge_trim or cval2 > cutoff_merge_trim) or ((read1[start:]==read2[start:]) and (len(read1[start:]) == 0))): + have_merged = True + if len(new_seq) < min_length: + return 'D','','' + else: + return '',new_seq,new_qual + break + + if not have_merged and options_mergeoverlap: + # SO FAR NOTHING. INSERT MIGHT BE LONGER THAN READ LENGTH. TRY TO MERGE SEQUENCES WITH OVERLAP + max_value1,max_pos1 = -1,-1 + for start in range(lread1-options_min_overlap_seqs): + if lread1-start <= lread2: + cval = quality_ident(read1[start:],pqual1[start:],rread2,rqual2) + if cval > cutoff_merge_seqs_early: + max_value1,max_pos1=cval,start + break + elif (max_value1 == -1) or (cval > max_value1): + max_value1,max_pos1=cval,start + + # ABOVE CUTOFF? + if (max_value1 > cutoff_merge_seqs): + if options_onlyoverlap: + new_lseq = list(read1[max_pos1:mlength]) + new_lqual = lqual1[max_pos1:mlength] + for pos in range(mlength-max_pos1): + new_lseq[pos],new_lqual[pos] = cons_base_prob(new_lseq[pos],read1[max_pos1+pos],rqual2[pos],pqual1[max_pos1+pos]) + else: + new_lseq = list(read1[:max_pos1]+rread2) + new_lqual = lqual1[:max_pos1]+rlqual2 + for pos in range(max_pos1,mlength): + #new_lseq[pos],new_lqual[pos] = cons_base_prob(new_lseq[pos],read1[pos],rqual2[pos],pqual1[pos]) + new_lseq[pos],new_lqual[pos] = cons_base_prob(new_lseq[pos],read1[pos],rqual2[pos-max_pos1],pqual1[pos]) + return '',"".join(new_lseq),convert_logprob_quality(new_lqual) + return '','','' + + +def overlap_reads(read1,qual1,read2,qual2): + if len(read1) != len(qual1) or len(read2) != len(qual2): + sys.stderr.write("Error: Length of quality score and read strings do not match!\n") + return '','','' + + lread1 = len(read1) + lread2 = len(read2) + ## CONVERTE QUALITY STRINGS TO PROBABILIITIES... 
+ lqual1 = convert_quality_logprob(qual1) + pqual1 = convert_logprob_prob(lqual1) + lqual2 = convert_quality_logprob(qual2) + pqual2 = convert_logprob_prob(lqual2) + + rread2 = revcompl(read2) + rqual2 = list(pqual2) + rqual2.reverse() + rlqual2 = list(lqual2) + rlqual2.reverse() + mlength = min(lread1,lread2) + + max_value1,max_pos1 = -1,-1 + for start in range(mlength-options_min_overlap_seqs): + if lread2-start <= lread1: + cval = quality_ident(read1,pqual1,rread2[start:],rqual2[start:]) + if cval > cutoff_merge_seqs_early: + max_value1,max_pos1=cval,start + break + elif (max_value1 == -1) or (cval > max_value1): + max_value1,max_pos1=cval,start + + # ABOVE CUTOFF? + if (max_value1 > cutoff_merge_seqs): + new_lseq = list(rread2[max_pos1:mlength]) + new_lqual = rlqual2[max_pos1:mlength] + for pos in range(mlength-max_pos1): + new_lseq[pos],new_lqual[pos] = cons_base_prob(new_lseq[pos],read1[pos],rqual2[max_pos1+pos],pqual1[pos]) + return '',"".join(new_lseq),convert_logprob_quality(new_lqual) + else: + return '','','' + + +def consensus_reads(read1,qual1,read2,qual2): + if len(read1) != len(qual1) or len(read2) != len(qual2) or len(read1) != len(read2): + sys.stderr.write("Error: Length of quality score and read strings do not match!\n") + return '','','' + + ## CONVERTE QUALITY STRINGS TO PROBABILIITIES... + lqual1 = convert_quality_logprob(qual1) + pqual1 = convert_logprob_prob(lqual1) + lqual2 = convert_quality_logprob(qual2) + pqual2 = convert_logprob_prob(lqual2) + + cval = quality_ident(read1,pqual1,read2,pqual2) + if cval > 0.75: + new_lseq = list(read2) + new_lqual = lqual2 + for pos in range(len(read1)): + new_lseq[pos],new_lqual[pos] = cons_base_prob(new_lseq[pos],read1[pos],pqual2[pos],pqual1[pos]) + return '',"".join(new_lseq),convert_logprob_quality(new_lqual) + else: + return '','','' + + +def process_SR(read1,qual1): + if len(read1) != len(qual1): + sys.stderr.write("Error: Length of quality score and read strings do not match!\n") + return '','','' + + if handle_key: + key_handled = False + if (read1[:len_key1] == keys[0]) or (options_allowMissing and (edits(read1[:len_key1],keys[0]) == 1)): + read1 = read1[len_key1:] + qual1 = qual1[len_key1:] + key_handled = True + elif options_allowMissing and (read1[:len_key1-1] == keys[0][1:]): + read1 = read1[len_key1-1:] + qual1 = qual1[len_key1-1:] + key_handled = True + if not key_handled: + return 'K','','' + + # CONVERTE QUALITY STRING TO PROBABILIITIES... + lqual1 = convert_quality_logprob(qual1) + pqual1 = convert_logprob_prob(lqual1) + lread1 = len(read1) + + # CHECK WHETHER WE (STILL) HAVE A SEQUENCE + if len(adapter_chimeras) > 0: + for chimera in adapter_chimeras: + if (quality_ident(read1,pqual1,chimera,maxcomp=maxadapter_comp) > cutoff_merge_trim): + read1 = "" + break + elif (quality_ident(read1,pqual1,chimera[1:],maxcomp=maxadapter_comp-1) > cutoff_merge_trim): # CONSIDER LOSING FIRST BASE... + read1 = "" + break + if read1 == "": + return 'D','','' + + adapter_pos = -2 + # CHECK FOR ADAPTER OF READ11... 
+ max_value,max_pos = -1,-1 + for start in range(lread1): + cval = 0.0 + for i in range(len(options_adapter_F)): + hcval = quality_ident(read1[start:],pqual1[start:],options_adapter_F[i],maxcomp=maxadapter_comp) + if hcval > cval: cval = hcval + + if (cval > cutoff_merge_seqs_early): + max_value,max_pos=cval,start + break + elif ((cval > cutoff_merge_trim) and (cval > max_value) and (lread1-start > min_length)) or ((lread1-start <= min_length) and (max_value == -1)): + max_value,max_pos=cval,start + + if (max_value > cutoff_merge_trim) and ((lread1-max_pos) >= options_trimCutoff): # ADAPTER FOUND + adapter_pos = max_pos + read1 = read1[:adapter_pos] + if (len(read1) < min_length): + return 'D','','' + else: + return '',read1,qual1[:adapter_pos] + return '','','' + +if __name__ == '__main__': + set_adapter_sequences(",".join(options_adapter_F),",".join(options_adapter_S),options_adapter_chimera,maxadapter_comp) + set_options(options_trimCutoff,options_allowMissing,True,False) + set_keys(',') + print("Read cosensus",consensus_reads('ACGTACGTACGT','000000000000','ACGTACGTCCGT','000000000000')) + print("SR",process_SR('ACGTACGTACGT','000000000000')) + print("PE fail",process_PE('ACGTACGTACGT','000000000000','TGCATGCATGCA','000000000000')) + print("PE no adapter",process_PE('ACGTACGTACGT','000000000000','ACGTACGTACGT','000000000000')) + print("PE adapter",process_PE('ATAAACATATGGCAAACATGGTTCTAGATCGGAAGAGCACACGT','00000000000000000000000000000000000000000000','AGAACCATGTTTGCCATATGTTTATAGATCGGAAGAGCGTCGTG','00000000000000000000000000000000000000000000')) + print("PE overlap consensus", process_PE('ATAAACATATGGCAAACATGGTTCTAAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAATCACTAACACATTCATTTTCCAGACACCCTTCACACTACCGTCGGATCGTGCGTGTACCTCTGAATCTCGTATGCCGTCTTCT', '55???BBBDDDDDDDDFFFFF>EEHBF?EFFFFG@FHHHHHHHHHHHH@GHHHHHHHHHHHHHHF?FFHEGDG=EHG?FFGCFGBGHHHFGCFCFHHHHHBD.?C--@=<+EF@GGHHHHFHFHHHFFHHHHHHFFHHHHHGHHHHHHHHHHHHHHH-A-CFF,C//AECFD?EEFDFGH/CAFHF/ACFHDFFEEEHDHEEHBF?EFFFFG@FHHHHHHHHHHHH@GHHHHHHHHHHHHHHF?FFHEGDG=EHG?FFGCFGBGHHHFGCFCFHHHHHBD.?C--@=<+EF@GGHHHHFHFHHHFFHHHHHHFFHHHHHGHHHHHHHHHHHHHHH-A-CFF,C//AECFD?EEFDFGH/CAFHF/ACFHDFFEEEHDHCCEEE>CCCCCEEFFGFFFFFFDFFEFEEEECCDDCE+ACEDD5CEDEEECDE@D=9E+CDD@;@DE##113D9####00###21*28@@27C7>EEFGH=CDEGGHHHDE##################################################################################")) + print() + print(process_PE("TTTACGGCTCATTGCGTGAACCGAAGCTCATCAAGATCTGGCCTCGGCGGCCAAGCTTAGTCGCCTATACGGTGATGGGTGCATNNTTCATNNNNATNNNCTCCCCGTTTAATCCCATATCTCGTATGCCGTCTTCTGCTTGAAAAAAAA","=======++5<5@9@@>CCEEE>CCCCCEEFFGFFFFFFDFFEFEEEECCDDCE+ACEDD5CEDEEECDE@D=9E+CDD@;@DE##113D9####00###21*28@@27C7>EEFGH=CDEGGHHHDE##################################################################################")) diff --git a/workflow/scripts/count/library_python3.py b/workflow/scripts/count/library_python3.py new file mode 100644 index 0000000..4cdc63a --- /dev/null +++ b/workflow/scripts/count/library_python3.py @@ -0,0 +1,245 @@ +import sys +import os +import math +import subprocess + +""" +Just some frequently used functions stripped from the scripts + +:Author: Michael Siebauer +:Contact: michael_siebauer@eva.mpg.de +:Date: 02.07.2010 +:Type: library + +Python3 Updated version +:Updated By: Ali Hassan +:Contact: alihassan1697@gmail.com +:Date: 16.01.2024 + +""" + +def is_fastq(filehandle): + line = filehandle.readline() + filehandle.seek(0) + return line.startswith("@") + + +def get_filter_IDS(options): + if (options.filter != None) and os.path.exists(options.filter): + filterIDs = set() + infile = 
open(options.filter) + for line in infile: + filterIDs.add(line.strip()) + infile.close() + if len(filterIDs) > 0: + sys.stderr.write("Read %i IDs for filter\n"%(len(filterIDs))) + return filterIDs + return None + +def move_partially(sourcef,targetf, is_mock=False): + print("Starting step-by-step moving because "+targetf+" already exists.") + folders_to_iterate = [(sourcef,targetf)] + while len(folders_to_iterate) > 0: + csource,ctarget = folders_to_iterate.pop() + files_to_move = [] + folders_to_move = [] + for elem in os.listdir(csource): + if os.path.exists(csource+elem): + files_to_move.append(csource+elem) + elif os.path.isdir(csource+elem) and not(os.path.isdir(ctarget+elem)): + folders_to_move.append(csource+elem) + elif os.path.isdir(csource+elem) and os.path.isdir(ctarget+elem): + folders_to_iterate.append((csource+elem+"/",ctarget+elem+"/")) + if len(files_to_move)>1: + print("Moving",len(files_to_move),"files to",ctarget) + if is_mock: + print("mv "+" ".join(files_to_move)+" "+ctarget) + else: + move = subprocess.Popen("mv "+" ".join(files_to_move)+" "+ctarget,shell=True) + move.wait() + if len(folders_to_move)>1: + print("Moving",len(folders_to_move),"folders to",ctarget) + if is_mock: + print("mv "+" ".join(folders_to_move)+" "+"/".join(ctarget.split("/")[:-1])+"/") + else: + move = subprocess.Popen("mv "+" ".join(folders_to_move)+" "+"/".join(ctarget.split("/")[:-1])+"/",shell=True) + move.wait() + if len(os.listdir(csource)) == 0 and not(is_mock): + move = subprocess.Popen("rmdir "+csource,shell=True) + move.wait() + + + +def make_output_filename(inputname, options, suffix): + if suffix: + if "." in inputname: + inputname = ".".join(inputname.split(".")[:-1])+suffix + else: + inputname = inputname + suffix + + if ("outdir" in vars(options) and options.outdir and os.path.isdir(options.outdir)): + inputname = inputname.split("/")[-1] + inputname = options.outdir.rstrip("/")+"/"+ inputname + return inputname + + +def read_fastq(filehandle): + """ Reads fastq and fasta entries from filehandle + supports multiline fasta and fastq + """ + count = 0 + seq = "" + id = "" + qual = None + is_fasta = False + for line in filehandle: + line = line.rstrip("\n\r") + if ( (count == 0) and (line.startswith("@") or line.startswith(">"))): # Read identifier + id = line[1:] + count+=1 + is_fasta = line.startswith(">") + + elif count == 1: # read sequence + seq = line + count+=1 + + elif count == 2: # multiple case: a) quality identifier (fastq), b) more sequence (fastq,fasta), c) next sequence identifier (fasta) + if line.startswith("+"): # case a) + id2 = line[1:] + if ( (len(id2) > 0) and (id2 != id)): # optional sanity check + sys.stderr.write("[NOTE] sequences identifier does not match quality identifier: " + id + " " + id2 + "\n") + count+=1 + + elif is_fasta and ( line.startswith(">") or line.startswith("@")) : # case c) + yield id, seq, None + id, seq, qual = None, None, None + count = 1 + id = line[1:] + is_fasta = line.startswith(">") + + else: # case b) + seq = seq + line + + elif count == 3: + if qual == None: + qual = line + else: + qual = qual + line + + if (len(qual) > len(seq)): # Another sanity check + sys.stderr.write("[NOTE] sequence and quality line differ in length\n") + if (len(qual) >= len(seq)): + count = 0 + yield id, seq, qual + id, seq, qual = None, None, None + else: + sys.stderr.write("Unexpected line:" + str(line.strip()) + "\n") + count = 0 + + if id and seq: + yield id, seq, qual + + # raise StopIteration + +def write_fastq(filehandle, id, seq, qual, 
compress=False): + if (not id.startswith("@")): + filehandle.write("@") + filehandle.write(id.rstrip("\n")+"\n") + filehandle.write(seq.rstrip("\n") + "\n") + filehandle.write("+") + if (not compress): + if id.startswith("@"): + filehandle.write(id[1:].rstrip("\n")) + else: + filehandle.write(id.rstrip("\n")) + filehandle.write("\n") + filehandle.write(qual.rstrip("\n")+"\n") + +def write_fasta(filehandle, id, seq): + if (not id.startswith(">")): + filehandle.write(">") + filehandle.write(id.rstrip("\n")+"\n") + max_size = 80 + seq = "".join(seq.split('\n')) + pos = 0 + while pos < len(seq): + filehandle.write(seq[pos:pos+max_size]+"\n") + pos += max_size + +def write_out(filehandle, id, seq, qual): + if qual: + write_fastq(filehandle, id, seq, qual) + else: + write_fasta(filehandle, id, seq) + + + +def parse_range_string(options, silent=False, write_to=sys.stdout): + start = 0 + end = None + try: + fields = options.range.strip().split("-") + start = max(int(fields[0])-1,0) + + if fields[1].upper() != "MAX": + end = int(fields[1]) + if end < start: end = None + if (not silent): + write_to.write("Set range parameter to:\n") + if end != None: + write_to.write("Start: " + str(start+1) + " End: " + str(end) + "\n") + else: + write_to.write("Start: " + str(start+1) + " End: MAX\n") + return (start, end) + except: + if (not silent): + write_to.write("Couldn't evaluate range parameter, falling back to default: 1-MAX\n") + start = 0 + end = None + return (start, end) + +def parse_rangestr(rangestr): + res = [] + fields = rangestr.split(',') + for elem in fields: + if "-" in elem: + se = elem.split('-') + if len(se) == 2: + try: + start = int(se[0]) + end = int(se[1]) + res.extend(list(range(start,end+1))) + except: return None + else: return None + else: + try: + pos = int(elem) + res.append(pos) + except: return None + if len(res) == 0: return None + else: + res = list(set(res)) + res.sort() + return res + + +def is_complex_comp(seq,cutoff): + counts = [] + for x in 'ACGT': + counts.append(seq.count(x)) + total = float(sum(counts)) + return (max(counts) <= cutoff*total) + +def is_complex_entropy(seq,cutoff): + return (entropy(seq) >= cutoff) + +def entropy(seq): + counts = [] + for x in 'ACGT': + counts.append(seq.count(x)) + total = float(sum(counts)) + entropy = 0 + for elem in counts: + if (total > 0) and (elem/total != 0): + entropy -= elem/total*math.log(elem/total,2) + return entropy
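
As a closing usage sketch for the library helpers above (the file name and the round-trip itself are hypothetical, not part of the workflow):

    import sys
    from library_python3 import read_fastq, write_fastq

    # Stream records from disk and re-emit them as FASTQ on stdout;
    # FastA entries come back with qual == None and are skipped here.
    with open("reads.fastq") as fh:
        for seqid, seq, qual in read_fastq(fh):
            if qual is not None:
                write_fastq(sys.stdout, seqid, seq, qual)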