Skip to content

Commit

Permalink
feature: fastq-join implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Max Schubach committed Oct 26, 2022
1 parent 0c1f1fc commit aaf5315
Show file tree
Hide file tree
Showing 4 changed files with 313 additions and 31 deletions.
11 changes: 11 additions & 0 deletions workflow/envs/fastq_join.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- fastq-join=1.3.1
- python
- click

- htslib
45 changes: 14 additions & 31 deletions workflow/rules/assignment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,47 +66,31 @@ rule assignment_fastq_split:
rule assignment_merge:
"""
Merge the FW,REV and BC fastq files into one.
Extract the index sequence from the middle and end of an Illumina run.
Separates reads for Paired End runs. Merge/Adapter trim reads stored in BAM.
Extract the index sequence and add it to the header.
"""
conda:
"../envs/python27.yaml"
"../envs/fastq_join.yaml"
input:
R1="results/assignment/{assignment}/fastq/splits/R1.split{split}.fastq.gz",
R2="results/assignment/{assignment}/fastq/splits/R2.split{split}.fastq.gz",
R3="results/assignment/{assignment}/fastq/splits/R3.split{split}.fastq.gz",
script_FastQ2doubleIndexBAM=getScript("count/FastQ2doubleIndexBAM.py"),
script_MergeTrimReadsBAM=getScript("count/MergeTrimReadsBAM.py"),
script=getScript("attachBCToFastQ.py"),
output:
bam=temp("results/assignment/{assignment}/bam/merge_split{split}.bam"),
un1=temp("results/assignment/{assignment}/fastq/merge_split{split}.un1.fastq.gz"),
un2=temp("results/assignment/{assignment}/fastq/merge_split{split}.un2.fastq.gz"),
join=temp("results/assignment/{assignment}/fastq/merge_split{split}.join.fastq.gz"),
params:
bc_length=lambda wc: config["assignments"][wc.assignment]["bc_length"],
log:
temp("results/logs/assignment/merge.{assignment}.{split}.log"),
shell:
"""
set +o pipefail;
fwd_length=`zcat {input.R1} | head -2 | tail -1 | wc -c`;
fwd_length=$(expr $(($fwd_length-1)));
rev_start=$(expr $(($fwd_length+1+{params.bc_length})));
paste <( zcat {input.R1} ) <( zcat {input.R2} ) <( zcat {input.R3} ) | \
awk '{{
count+=1;
if ((count == 1) || (count == 3)) {{
print $1
}} else {{
print $1$2$3
}};
if (count == 4) {{
count=0
}}
}}' | \
python {input.script_FastQ2doubleIndexBAM} -p -s $rev_start -l {params.bc_length} -m 0 | \
python {input.script_MergeTrimReadsBAM} -c '' -f CATTGCGTGAACCGACAATTCGTCGAGGGACCTAATAAC -s AGTTGATCCGGTCCTAGGTCTAGAGCGGGCCCTGGCAGA --mergeoverlap -p > {output} 2> {log}
fastq-join \
<( python {input.script} -r {input.R1} -b {input.R2} ) \
<( python {input.script} -r {input.R3} -b {input.R2} ) \
-o >(gzip -c > {output.un1}) \
-o >(gzip -c > {output.un2}) \
-o >(gzip -c > {output.join}) &> {log}
"""


Expand Down Expand Up @@ -144,7 +128,7 @@ rule assignment_mapping:
Map the reads to the reference and sort.
"""
input:
bams="results/assignment/{assignment}/bam/merge_split{split}.bam",
reads="results/assignment/{assignment}/fastq/merge_split{split}.join.fastq.gz",
reference="results/assignment/{assignment}/reference/reference.fa",
bwa_index=expand(
"results/assignment/{{assignment}}/reference/reference.fa.{ext}",
Expand All @@ -160,8 +144,7 @@ rule assignment_mapping:
shell:
"""
bwa mem -t {threads} -L 80 -M -C {input.reference} <(
samtools view -F 514 {input.bams} | \
awk 'BEGIN{{ OFS="\\n"; FS="\\t" }}{{ print "@"$1" "$12","$13,$10,"+",$11 }}';
gzip -dc {input.reads}
) | samtools sort -l 0 -@ {threads} > {output} 2> {log}
"""

Expand Down
36 changes: 36 additions & 0 deletions workflow/scripts/attachBCToFastQ.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import click
from common import read_fastq
import gzip


def read_sequence_files(read_file, bc_file):
for read, bc in zip(read_fastq(read_file), read_fastq(bc_file)):
seqid_read, seq_read, qual_read = read
seqid_read = seqid_read.split(" ")[0]
seqid_bc, seq_bc, qual_bc = bc
seqid_bc = seqid_bc.split(" ")[0]
if seqid_read != seqid_bc:
raise Exception('Sequence IDs do not match: %s != %s' % (seqid_read, seqid_bc))
seqid = "%s XI:Z:%s,YI:Z:%s" % (seqid_read, seq_bc, qual_bc)
yield seqid, seq_read, qual_read
return


@click.command()
@click.option('--reads', '-r',
"read_file",
type=click.Path(exists=True, readable=True),
required=True)
@click.option('--barcodes', '-b',
"barcode_file",
type=click.Path(exists=True, readable=True),
required=True)
def cli(read_file, barcode_file):

with gzip.open(read_file, 'rt') as r_file, gzip.open(barcode_file, 'rt') as bc_file:
for seqid, seq, qual in read_sequence_files(r_file, bc_file):
click.echo("@%s\n%s\n+\n%s" % (seqid, seq, qual))


if __name__ == '__main__':
cli()
252 changes: 252 additions & 0 deletions workflow/scripts/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
import sys
import os
import math
import subprocess

"""
Just some frequently used functions stripped from the scripts
:Author: Michael Siebauer
:Contact: michael_siebauer@eva.mpg.de
:Date: 02.07.2010
:Type: library
"""


def is_fastq(filehandle):
line = filehandle.readline()
filehandle.seek(0)
return line.startswith("@")


def get_filter_IDS(options):
if (options.filter != None) and os.path.exists(options.filter):
filterIDs = set()
infile = open(options.filter)
for line in infile:
filterIDs.add(line.strip())
infile.close()
if len(filterIDs) > 0:
sys.stderr.write("Read %i IDs for filter\n" % (len(filterIDs)))
return filterIDs
return None


def move_partially(sourcef, targetf, is_mock=False):
print("Starting step-by-step moving because "+targetf+" already exists.")
folders_to_iterate = [(sourcef, targetf)]
while len(folders_to_iterate) > 0:
csource, ctarget = folders_to_iterate.pop()
files_to_move = []
folders_to_move = []
for elem in os.listdir(csource):
if os.path.exists(csource+elem):
files_to_move.append(csource+elem)
elif os.path.isdir(csource+elem) and not (os.path.isdir(ctarget+elem)):
folders_to_move.append(csource+elem)
elif os.path.isdir(csource+elem) and os.path.isdir(ctarget+elem):
folders_to_iterate.append((csource+elem+"/", ctarget+elem+"/"))
if len(files_to_move) > 1:
print("Moving", len(files_to_move), "files to", ctarget)
if is_mock:
print("mv "+" ".join(files_to_move)+" "+ctarget)
else:
move = subprocess.Popen("mv "+" ".join(files_to_move)+" "+ctarget, shell=True)
move.wait()
if len(folders_to_move) > 1:
print("Moving", len(folders_to_move), "folders to", ctarget)
if is_mock:
print("mv "+" ".join(folders_to_move)+" "+"/".join(ctarget.split("/")[:-1])+"/")
else:
move = subprocess.Popen("mv "+" ".join(folders_to_move)+" "+"/".join(ctarget.split("/")[:-1])+"/", shell=True)
move.wait()
if len(os.listdir(csource)) == 0 and not (is_mock):
move = subprocess.Popen("rmdir "+csource, shell=True)
move.wait()


def make_output_filename(inputname, options, suffix):
if suffix:
if "." in inputname:
inputname = ".".join(inputname.split(".")[:-1])+suffix
else:
inputname = inputname + suffix

if ("outdir" in vars(options) and options.outdir and os.path.isdir(options.outdir)):
inputname = inputname.split("/")[-1]
inputname = options.outdir.rstrip("/")+"/" + inputname
return inputname


def read_fastq(filehandle):
""" Reads fastq and fasta entries from filehandle
supports multiline fasta and fastq
"""
count = 0
seq = ""
id = ""
qual = None
is_fasta = False
for line in filehandle:
line = line.rstrip("\n\r")
if ((count == 0) and (line.startswith("@") or line.startswith(">"))): # Read identifier
id = line[1:]
count += 1
is_fasta = line.startswith(">")

elif count == 1: # read sequence
seq = line
count += 1

# multiple case: a) quality identifier (fastq), b) more sequence (fastq,fasta), c) next sequence identifier (fasta)
elif count == 2:
if line.startswith("+"): # case a)
id2 = line[1:]
if ((len(id2) > 0) and (id2 != id)): # optional sanity check
sys.stderr.write("[NOTE] sequences identifier does not match quality identifier: " + id + " " + id2 + "\n")
count += 1

elif is_fasta and (line.startswith(">") or line.startswith("@")): # case c)
yield id, seq, None
id, seq, qual = None, None, None
count = 1
id = line[1:]
is_fasta = line.startswith(">")

else: # case b)
seq = seq + line

elif count == 3:
if qual == None:
qual = line
else:
qual = qual + line

if (len(qual) > len(seq)): # Another sanity check
sys.stderr.write("[NOTE] sequence and quality line differ in length\n")
if (len(qual) >= len(seq)):
count = 0
yield id, seq, qual
id, seq, qual = None, None, None
else:
sys.stderr.write("Unexpected line:" + str(line.strip()) + "\n")
count = 0

if id and seq:
yield id, seq, qual

return


def write_fastq(filehandle, id, seq, qual, compress=False):
if (not id.startswith("@")):
filehandle.write("@")
filehandle.write(id.rstrip("\n")+"\n")
filehandle.write(seq.rstrip("\n") + "\n")
filehandle.write("+")
if (not compress):
if id.startswith("@"):
filehandle.write(id[1:].rstrip("\n"))
else:
filehandle.write(id.rstrip("\n"))
filehandle.write("\n")
filehandle.write(qual.rstrip("\n")+"\n")


def write_fasta(filehandle, id, seq):
if (not id.startswith(">")):
filehandle.write(">")
filehandle.write(id.rstrip("\n")+"\n")
max_size = 80
seq = "".join(seq.split('\n'))
pos = 0
while pos < len(seq):
filehandle.write(seq[pos:pos+max_size]+"\n")
pos += max_size


def write_out(filehandle, id, seq, qual):
if qual:
write_fastq(filehandle, id, seq, qual)
else:
write_fasta(filehandle, id, seq)


def parse_range_string(options, silent=False, write_to=sys.stdout):
start = 0
end = None
try:
fields = options.range.strip().split("-")
start = max(int(fields[0])-1, 0)

if fields[1].upper() != "MAX":
end = int(fields[1])
if end < start:
end = None
if (not silent):
write_to.write("Set range parameter to:\n")
if end != None:
write_to.write("Start: " + str(start+1) + " End: " + str(end) + "\n")
else:
write_to.write("Start: " + str(start+1) + " End: MAX\n")
return (start, end)
except:
if (not silent):
write_to.write("Couldn't evaluate range parameter, falling back to default: 1-MAX\n")
start = 0
end = None
return (start, end)


def parse_rangestr(rangestr):
res = []
fields = rangestr.split(',')
for elem in fields:
if "-" in elem:
se = elem.split('-')
if len(se) == 2:
try:
start = int(se[0])
end = int(se[1])
res.extend(range(start, end+1))
except:
return None
else:
return None
else:
try:
pos = int(elem)
res.append(pos)
except:
return None
if len(res) == 0:
return None
else:
res = list(set(res))
res.sort()
return res


def is_complex_comp(seq, cutoff):
counts = []
for x in 'ACGT':
counts.append(seq.count(x))
total = float(sum(counts))
return (max(counts) <= cutoff*total)


def is_complex_entropy(seq, cutoff):
return (entropy(seq) >= cutoff)


def entropy(seq):
counts = []
for x in 'ACGT':
counts.append(seq.count(x))
total = float(sum(counts))
entropy = 0
for elem in counts:
if (total > 0) and (elem/total != 0):
entropy -= elem/total*math.log(elem/total, 2)
return entropy

0 comments on commit aaf5315

Please sign in to comment.