Added merging of paired-end reads before gene calling

Also several fixes in the Summary Report: consider "Samples" when filling the rows of "Names", and properly count the number of annotated sequences.
iquasere committed Jan 29, 2024
1 parent 762d2d1 commit 0a94e03
Showing 10 changed files with 98 additions and 48 deletions.
8 changes: 8 additions & 0 deletions resources/minimum_envs/merge_reads_min.yaml
@@ -0,0 +1,8 @@
name: merge_reads
channels:
- conda-forge
- bioconda
- anaconda
- defaults
dependencies:
- pear
16 changes: 16 additions & 0 deletions workflow/envs/merge_reads.yaml
@@ -0,0 +1,16 @@
name: merge_reads
channels:
- conda-forge
- bioconda
- anaconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- bzip2=1.0.8=hd590300_5
- libgcc-ng=13.2.0=h807b86a_4
- libgomp=13.2.0=h807b86a_4
- libzlib=1.2.13=hd590300_5
- pear=0.9.6=h9d449c0_10
- zlib=1.2.13=hd590300_5
prefix: /opt/conda/envs/merge_reads
11 changes: 11 additions & 0 deletions workflow/rules/common.smk
@@ -50,3 +50,14 @@ def sample_to_reads(wildcards):
df = mg_exps[mg_exps['Sample'] == wildcards.sample].reset_index()
return [f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]["Name"]}{fr}.fq' for row in range(len(df))
for fr in (['_forward_paired', '_reverse_paired'] if ',' in df.iloc[row]["Files"] else [''])]


def gene_calling_input(wildcards):
df = mg_exps[mg_exps['Sample'] == wildcards.sample].reset_index()
result = []
for row in range(len(df)):
if ',' in df.iloc[row]["Files"]:
result.append(f'{OUTPUT}/Preprocess/Trimmomatic/{df.iloc[row]["Name"]}.assembled.fastq')
else:
result.append(f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]["Name"]}.fq')
return result
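For reference, a minimal sketch of how the new gene_calling_input helper resolves its inputs, using a mock experiments table. The column names follow the diff above; the sample, library, and file names are hypothetical.

import pandas as pd

OUTPUT = "output"  # stand-in for the pipeline's output directory
mg_exps = pd.DataFrame({
    "Sample": ["S1", "S1"],
    "Name": ["lib_paired", "lib_single"],
    "Files": ["r1.fq,r2.fq", "r.fq"],  # a comma marks a paired-end library
})

def gene_calling_input_sketch(sample):
    df = mg_exps[mg_exps["Sample"] == sample].reset_index()
    result = []
    for row in range(len(df)):
        if "," in df.iloc[row]["Files"]:
            # paired-end: gene calling reads the PEAR-merged file
            result.append(f"{OUTPUT}/Preprocess/Trimmomatic/{df.iloc[row]['Name']}.assembled.fastq")
        else:
            # single-end: gene calling reads the quality-trimmed file directly
            result.append(f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]['Name']}.fq")
    return result

print(gene_calling_input_sketch("S1"))
# ['output/Preprocess/Trimmomatic/lib_paired.assembled.fastq',
#  'output/Preprocess/Trimmomatic/quality_trimmed_lib_single.fq']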
18 changes: 17 additions & 1 deletion workflow/rules/gene_calling.smk
@@ -1,6 +1,22 @@
rule merge_reads:
input:
forward_reads=f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{{name}}_forward_paired.fq",
reverse_reads=f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{{name}}_reverse_paired.fq"
output:
expand("{output}/Preprocess/Trimmomatic/{{name}}.assembled.fastq", output=OUTPUT)
threads:
config["threads"]
params:
memory = config["max_memory"]
conda:
"../envs/merge_reads.yaml"
shell:
"pear -f {input.forward_reads} -r {input.reverse_reads} -o {OUTPUT}/Preprocess/Trimmomatic/{wildcards.name} "
"-j {threads} -y {params.memory}G"

rule fastq2fasta:
input:
sample_to_reads
gene_calling_input
output:
expand("{output}/Preprocess/piled_{{sample}}.fasta", output=OUTPUT)
threads:
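As a rough illustration of what the merge_reads rule above runs, here is the PEAR command it should expand to, with hypothetical values standing in for Snakemake's wildcards and config (a sketch, not the rendered rule itself).

OUTPUT = "output"    # assumed output directory
name = "lib_paired"  # the {name} wildcard
threads = 4          # config["threads"]
memory = 16          # config["max_memory"], in gigabytes

cmd = (
    f"pear -f {OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{name}_forward_paired.fq "
    f"-r {OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{name}_reverse_paired.fq "
    f"-o {OUTPUT}/Preprocess/Trimmomatic/{name} "
    f"-j {threads} -y {memory}G"
)
print(cmd)
# PEAR appends ".assembled.fastq" to the -o prefix, which is exactly the file
# gene_calling_input expects for paired-end libraries.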
5 changes: 3 additions & 2 deletions workflow/rules/general_report.smk
@@ -3,9 +3,9 @@ rule general_report:
expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),
expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])) if len(mg_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 or not config['assembly']) else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 and config['do_assembly']) else [],
expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])) if len(mt_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 or not config['assembly']) else [],
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 and config['do_assembly']) else [],
expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample']))
output:
expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
@@ -19,6 +19,7 @@ rule general_report:
params:
output = OUTPUT,
exps = f"{OUTPUT}/exps.tsv",
did_assembly = config['do_assembly']
conda:
"../envs/reports.yaml"
script:
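The condition fix above means normalized count tables are only requested when experiments of that omics type exist and an assembly was produced (the old "or not config['assembly']" could request them for runs without an assembly). A minimal sketch of the corrected logic, with a hypothetical norm_targets helper:

def norm_targets(output, samples, did_assembly):
    # normalized per-contig tables only exist when an assembly was produced
    if samples and did_assembly:
        return [f"{output}/Quantification/{s}_mg_norm.tsv" for s in sorted(samples)]
    return []

print(norm_targets("output", {"S1"}, did_assembly=True))   # ['output/Quantification/S1_mg_norm.tsv']
print(norm_targets("output", {"S1"}, did_assembly=False))  # []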
4 changes: 2 additions & 2 deletions workflow/rules/quantification.smk
@@ -7,9 +7,9 @@ rule quantification:
output:
expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])),
expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])) if len(mg_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 or not config['assembly']) else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 and config['do_assembly']) else [],
expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])) if len(mt_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 or not config['assembly']) else []
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 and config['do_assembly']) else []
threads:
config["threads"]
params:
23 changes: 14 additions & 9 deletions workflow/scripts/general_report.py
@@ -14,7 +14,7 @@
'General functional category', 'Functional category', 'Protein description', 'COG ID', 'EC number (reCOGnizer)']


def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input):
def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input, did_assembly=True):
timed_message(f'Joining data for sample: {sample}.')
with open(f'{out}/Annotation/{sample}/fgs.faa') as f:
lines = f.readlines()
@@ -32,18 +32,23 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
report = report.rename(columns={
**{f'{col}_x': f'{col} (UPIMAPI)' for col in rename_cols},
**{f'{col}_y': f'{col} (reCOGnizer)' for col in rename_cols}})
report['Contig'] = report['qseqid'].apply(lambda x: x.split('_')[1])
if did_assembly:
report['Contig'] = report['qseqid'].apply(lambda x: x.split('_')[1])
mg_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'dna')]['Name'].tolist()
mt_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'mrna')]['Name'].tolist()
mp_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'protein')]['Name'].tolist()
if len(mg_names) > 0:
mg_counts = pd.read_csv(f'{out}/Quantification/{sample}_mg_norm.tsv', sep='\t')
mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1])
report = pd.merge(report, mg_counts, on='Contig', how='left')
mg_counts = pd.read_csv((f'{out}/Quantification/{sample}_mg_norm.tsv' if did_assembly else
f'{out}/Quantification/{sample}_mg.readcounts'), sep='\t', names=[
'qseqid'] + mg_names, skiprows=1)
if did_assembly:
mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1])
report = pd.merge(report, mg_counts, on=('Contig' if did_assembly else 'qseqid'), how='left')
mg_preport = pd.merge(mg_preport, report[['Entry'] + mg_names], on='Entry', how='outer')
if len(mt_names) > 0:
report = pd.merge(
report, pd.read_csv(f'{out}/Quantification/{sample}_mt_norm.tsv', sep='\t', names=[
report, pd.read_csv((f'{out}/Quantification/{sample}_mt_norm.tsv' if did_assembly else
f'{out}/Quantification/{sample}_mt.readcounts'), sep='\t', names=[
'qseqid'] + mt_names), on='qseqid', how='left')
mt_preport = pd.merge(mt_preport, report[['Entry'] + mt_names], on='Entry', how='outer')
# MT readcounts come normalized by gene size, and therefore not fit for DE (as they are floats).
@@ -62,12 +67,12 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
return report, mg_preport, mt_preport, mp_preport, de_input


def make_general_reports(out, exps, max_lines=1000000):
def make_general_reports(out, exps, max_lines=1000000, did_assembly=True):
mg_report = mt_report = mp_report = de_input = pd.DataFrame(columns=['Entry'])
writer = pd.ExcelWriter(f'{out}/MOSCA_General_Report.xlsx', engine='xlsxwriter')
for sample in set(exps['Sample']):
report, mg_report, mt_report, mp_report, de_input = make_general_report(
out, exps, sample, mg_report, mt_report, mp_report, de_input)
out, exps, sample, mg_report, mt_report, mp_report, de_input, did_assembly=did_assembly)
timed_message(f'Writing General Report for sample: {sample}.')
if len(report) < max_lines:
report.to_excel(writer, sheet_name=sample, index=False)
@@ -103,7 +108,7 @@ def make_general_reports(out, exps, max_lines=1000000):

def run():
exps = pd.read_csv(snakemake.params.exps, sep='\t')
make_general_reports(snakemake.params.output, exps)
make_general_reports(snakemake.params.output, exps, did_assembly=snakemake.params.did_assembly)


if __name__ == '__main__':
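A toy illustration of why the merge key changes with did_assembly in make_general_report: with an assembly, metagenomics counts are per contig and merged on 'Contig'; without one, raw readcounts are per predicted gene and merged on 'qseqid'. The IDs and numbers below are made up.

import pandas as pd

report = pd.DataFrame({"qseqid": ["k141_10_1", "k141_11_2"], "Contig": ["10", "11"]})
mg_norm = pd.DataFrame({"Contig": ["10", "11"], "mg_lib": [120.0, 45.0]})                 # with assembly
mg_readcounts = pd.DataFrame({"qseqid": ["k141_10_1", "k141_11_2"], "mg_lib": [80, 30]})  # without assembly

with_assembly = pd.merge(report, mg_norm, on="Contig", how="left")
without_assembly = pd.merge(report, mg_readcounts, on="qseqid", how="left")
print(with_assembly)
print(without_assembly)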
2 changes: 1 addition & 1 deletion workflow/scripts/mosca_tools.py
@@ -143,7 +143,7 @@ def perform_alignment(reference, reads, basename, threads=1):
print(f'{basename}.log was found!')
run_pipe_command(
f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c | \
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')


def fastq2fasta(fastq, output):
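For context on the pipeline above: samtools view -F 260 drops unmapped and secondary alignments (flags 4 + 256), cut -f 3 keeps the reference name, sort | uniq -c tallies reads per reference, and awk rewrites that as a tab-separated reference/count file. A small sketch of reading that output (the file path is hypothetical):

import pandas as pd

# two columns, no header: the aligned-to reference and the number of mapped reads
readcounts = pd.read_csv("sample_mg.readcounts", sep="\t", names=["Reference", "Reads"])
print(readcounts.head())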
36 changes: 12 additions & 24 deletions workflow/scripts/quantification.py
@@ -11,18 +11,19 @@
from mosca_tools import perform_alignment, normalize_counts_by_size


def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
def quantification(exps: pd.DataFrame, output: str, sample: str, did_assembly: bool = True) -> tuple:
"""
Perform quantification of reads with contigs as reference
:param exps: DataFrame with the experiments
:param output: Output directory
:param sample: Sample name
:param did_assembly: Whether assembly was performed or not
"""
mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
pexps = exps[(exps['Sample'] == sample)]
for i in pexps.index:
if pexps.loc[i]['Data type'] == 'mrna':
if pexps.loc[i]['Data type'] == 'mrna' or not did_assembly:
reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn"
elif pexps.loc[i]['Data type'] == 'dna':
reference = f"{output}/Assembly/{pexps.loc[i]['Sample']}/contigs.fasta"
@@ -53,38 +54,25 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -
return mg_result, mg_result_norm, mt_result, mt_result_norm


def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
pexps = exps[(exps['Sample'] == sample)]
for i in pexps.index:
if pexps.loc[i]['Data type'] in ['mrna', 'dna']:
reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn"
else:
continue
return mg_result, mg_result_norm, mt_result, mt_result_norm


def run():
exps = pd.read_csv(snakemake.params.exps, sep='\t')
did_assembly=snakemake.params.did_assembly

for sample in set(exps['Sample']):
if snakemake.params.did_assembly:
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_with_assembly(
exps, snakemake.params.output, sample)
else:
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_without_assembly(
exps, snakemake.params.output, sample)
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification(
exps, snakemake.params.output, sample, did_assembly=did_assembly)
if len(mg_result) > 0:
mg_result.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
mg_result_norm.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if did_assembly:
mg_result_norm.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if len(mt_result) > 0:
mt_result.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)
if did_assembly:
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)


if __name__ == '__main__':
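The merged quantification function keeps one rule for choosing the alignment reference. A minimal sketch of that decision, with a hypothetical pick_reference helper:

def pick_reference(output, sample, data_type, did_assembly):
    # mRNA reads, and any reads when no assembly exists, are aligned to predicted genes;
    # DNA reads are aligned to contigs only when an assembly was produced
    if data_type == "mrna" or not did_assembly:
        return f"{output}/Annotation/{sample}/fgs.ffn"
    if data_type == "dna":
        return f"{output}/Assembly/{sample}/contigs.fasta"
    raise ValueError(f"unexpected data type: {data_type}")

print(pick_reference("output", "S1", "dna", did_assembly=False))  # output/Annotation/S1/fgs.ffn
print(pick_reference("output", "S1", "dna", did_assembly=True))   # output/Assembly/S1/contigs.fasta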
23 changes: 14 additions & 9 deletions workflow/scripts/summary_report.py
@@ -48,6 +48,7 @@ def write_versions_report(output):
class Reporter:
def __init__(self):
self.report = pd.DataFrame()
self.sample2name = {}

def info_from_preprocessing(self, out_dir):
timed_message('Processing results of: preprocessing.')
@@ -74,7 +75,7 @@ def info_from_assembly(self, out_dir):
sample = file.split('/')[-3]
self.report = pd.concat([self.report, pd.Series(name=sample, dtype='object')])
data = pd.read_csv(file, sep='\t', index_col='Assembly')
self.report.loc[sample, ['# contigs', 'N50', 'Reads aligned (%)']] = (
self.report.loc[self.sample2name[sample], ['# contigs', 'N50', 'Reads aligned (%)']] = (
int(data.loc['# contigs', 'contigs']),
int(data.loc['N50', 'contigs']),
data.loc['Reads aligned (%)', 'contigs'])
@@ -86,7 +87,7 @@ def info_from_binning(self, out_dir):
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
data = pd.read_csv(file, sep='\t')
self.report.loc[sample, ['# high-qual MAGs', '# medium-qual MAGs', '# low-qual MAGs']] = (
self.report.loc[self.sample2name[sample], ['# high-qual MAGs', '# medium-qual MAGs', '# low-qual MAGs']] = (
((data['Completeness'] >= 90) & (data['Contamination'] <= 5)).sum(),
((data['Completeness'] >= 50) & (data['Completeness'] < 90) & (data['Contamination'] <= 10)).sum(),
((data['Completeness'] < 50) & (data['Contamination'] <= 10)).sum())
@@ -99,17 +100,17 @@ def info_from_annotation(self, out_dir):
for file in fastas:
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
self.report.loc[sample, '# genes'] = count_on_file('>', file)
self.report.loc[self.sample2name[sample], '# genes'] = count_on_file('>', file)
for file in upimapi_res:
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
self.report.loc[sample, '# annotations (UPIMAPI)'] = pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique().sum()
self.report.loc[self.sample2name[sample], '# annotations (UPIMAPI)'] = len(pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique())
for file in recognizer_res:
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
self.report.loc[sample, '# annotations (reCOGnizer)'] = pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique().sum()
self.report.loc[self.sample2name[sample], '# annotations (reCOGnizer)'] = len(pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique())

def info_from_quantification(self, out_dir):
timed_message('Processing results of: MT quantification.')
@@ -149,15 +150,19 @@ def zip_outputs(self, out_dir):
def run(self):
timed_message('Writting final reports.')
write_versions_report(f'{snakemake.params.output}/MOSCA_Versions_Report.xlsx')
exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t')
for sample in exps['Sample'].unique():
self.sample2name = {sample: exps[exps['Sample'] == sample]['Name'].tolist()}
self.info_from_preprocessing(snakemake.params.output)
self.info_from_assembly(snakemake.params.output)
self.info_from_binning(snakemake.params.output)
self.info_from_annotation(snakemake.params.output)
self.info_from_quantification(snakemake.params.output)
exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t')
self.info_from_differential_expression(
snakemake.params.output, cutoff=snakemake.params.cutoff, mp='protein' in exps['Data type'].tolist())
self.report.to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t')
self.report[['Initial reads', 'Qual trim params', 'Final reads', '# genes', '# annotations (UPIMAPI)',
'# annotations (reCOGnizer)', 'Reads aligned (%)', '# differentially expressed'
]].to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t')
self.zip_outputs(snakemake.params.output)


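A minimal sketch of the two Summary Report fixes mentioned in the commit message: per-sample results are written to the rows of the names belonging to that sample, and annotations are counted as the number of distinct query IDs rather than summed. The data below is made up.

import pandas as pd

exps = pd.DataFrame({"Sample": ["S1", "S1"], "Name": ["mg_lib", "mt_lib"]})
sample2name = {s: exps[exps["Sample"] == s]["Name"].tolist() for s in exps["Sample"].unique()}
print(sample2name)  # {'S1': ['mg_lib', 'mt_lib']}

upimapi = pd.DataFrame({"qseqid": ["gene_1", "gene_1", "gene_2"]})
print(len(upimapi["qseqid"].unique()))  # 2 distinct annotated queries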