diff --git a/resources/minimum_envs/merge_reads_min.yaml b/resources/minimum_envs/merge_reads_min.yaml new file mode 100644 index 0000000..716cefc --- /dev/null +++ b/resources/minimum_envs/merge_reads_min.yaml @@ -0,0 +1,8 @@ +name: merge_reads +channels: + - conda-forge + - bioconda + - anaconda + - defaults +dependencies: + - pear diff --git a/workflow/envs/merge_reads.yaml b/workflow/envs/merge_reads.yaml new file mode 100644 index 0000000..9556c5e --- /dev/null +++ b/workflow/envs/merge_reads.yaml @@ -0,0 +1,16 @@ +name: merge_reads +channels: + - conda-forge + - bioconda + - anaconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - bzip2=1.0.8=hd590300_5 + - libgcc-ng=13.2.0=h807b86a_4 + - libgomp=13.2.0=h807b86a_4 + - libzlib=1.2.13=hd590300_5 + - pear=0.9.6=h9d449c0_10 + - zlib=1.2.13=hd590300_5 +prefix: /opt/conda/envs/merge_reads \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 40cf6cf..99cfbf4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -50,3 +50,14 @@ def sample_to_reads(wildcards): df = mg_exps[mg_exps['Sample'] == wildcards.sample].reset_index() return [f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]["Name"]}{fr}.fq' for row in range(len(df)) for fr in (['_forward_paired', '_reverse_paired'] if ',' in df.iloc[row]["Files"] else [''])] + + +def gene_calling_input(wildcards): + df = mg_exps[mg_exps['Sample'] == wildcards.sample].reset_index() + result = [] + for row in range(len(df)): + if ',' in df.iloc[row]["Files"]: + result.append(f'{OUTPUT}/Preprocess/Trimmomatic/{df.iloc[row]["Name"]}.assembled.fastq') + else: + result.append(f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]["Name"]}.fq') + return result \ No newline at end of file diff --git a/workflow/rules/gene_calling.smk b/workflow/rules/gene_calling.smk index 38961f7..fdc027b 100644 --- a/workflow/rules/gene_calling.smk +++ b/workflow/rules/gene_calling.smk @@ -1,6 +1,22 @@ +rule merge_reads: + input: + forward_reads=f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{{name}}_forward_paired.fq", + reverse_reads=f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{{name}}_reverse_paired.fq" + output: + expand("{output}/Preprocess/Trimmomatic/{{name}}.assembled.fastq", output=OUTPUT) + threads: + config["threads"] + params: + memory = config["max_memory"] + conda: + "../envs/merge_reads.yaml" + shell: + "pear -f {input.forward_reads} -r {input.reverse_reads} -o {OUTPUT}/Preprocess/Trimmomatic/{wildcards.name} " + "-j {threads} -y {params.memory}G" + rule fastq2fasta: input: - sample_to_reads + gene_calling_input output: expand("{output}/Preprocess/piled_{{sample}}.fasta", output=OUTPUT) threads: diff --git a/workflow/rules/general_report.smk b/workflow/rules/general_report.smk index 9500681..4fa7b33 100644 --- a/workflow/rules/general_report.smk +++ b/workflow/rules/general_report.smk @@ -3,9 +3,9 @@ rule general_report: expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])), expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])), expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])) if len(mg_exps) > 0 else [], - expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 or not config['assembly']) else [], + expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 and config['do_assembly']) else [], expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])) if len(mt_exps) > 0 else [], - expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 or not config['assembly']) else [], + expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 and config['do_assembly']) else [], expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample'])) output: expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), @@ -19,6 +19,7 @@ rule general_report: params: output = OUTPUT, exps = f"{OUTPUT}/exps.tsv", + did_assembly = config['do_assembly'] conda: "../envs/reports.yaml" script: diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 7fcb979..1e68307 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -7,9 +7,9 @@ rule quantification: output: expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])), expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])) if len(mg_exps) > 0 else [], - expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 or not config['assembly']) else [], + expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 and config['do_assembly']) else [], expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])) if len(mt_exps) > 0 else [], - expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 or not config['assembly']) else [] + expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 and config['do_assembly']) else [] threads: config["threads"] params: diff --git a/workflow/scripts/general_report.py b/workflow/scripts/general_report.py index 2d53666..6378f0a 100644 --- a/workflow/scripts/general_report.py +++ b/workflow/scripts/general_report.py @@ -14,7 +14,7 @@ 'General functional category', 'Functional category', 'Protein description', 'COG ID', 'EC number (reCOGnizer)'] -def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input): +def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input, did_assembly=True): timed_message(f'Joining data for sample: {sample}.') with open(f'{out}/Annotation/{sample}/fgs.faa') as f: lines = f.readlines() @@ -32,18 +32,23 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d report = report.rename(columns={ **{f'{col}_x': f'{col} (UPIMAPI)' for col in rename_cols}, **{f'{col}_y': f'{col} (reCOGnizer)' for col in rename_cols}}) - report['Contig'] = report['qseqid'].apply(lambda x: x.split('_')[1]) + if did_assembly: + report['Contig'] = report['qseqid'].apply(lambda x: x.split('_')[1]) mg_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'dna')]['Name'].tolist() mt_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'mrna')]['Name'].tolist() mp_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'protein')]['Name'].tolist() if len(mg_names) > 0: - mg_counts = pd.read_csv(f'{out}/Quantification/{sample}_mg_norm.tsv', sep='\t') - mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1]) - report = pd.merge(report, mg_counts, on='Contig', how='left') + mg_counts = pd.read_csv((f'{out}/Quantification/{sample}_mg_norm.tsv' if did_assembly else + f'{out}/Quantification/{sample}_mg.readcounts'), sep='\t', names=[ + 'qseqid'] + mg_names, skiprows=1) + if did_assembly: + mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1]) + report = pd.merge(report, mg_counts, on=('Contig' if did_assembly else 'qseqid'), how='left') mg_preport = pd.merge(mg_preport, report[['Entry'] + mg_names], on='Entry', how='outer') if len(mt_names) > 0: report = pd.merge( - report, pd.read_csv(f'{out}/Quantification/{sample}_mt_norm.tsv', sep='\t', names=[ + report, pd.read_csv((f'{out}/Quantification/{sample}_mt_norm.tsv' if did_assembly else + f'{out}/Quantification/{sample}_mt.readcounts'), sep='\t', names=[ 'qseqid'] + mt_names), on='qseqid', how='left') mt_preport = pd.merge(mt_preport, report[['Entry'] + mt_names], on='Entry', how='outer') # MT readcounts come normalized by gene size, and therefore not fit for DE (as they are floats). @@ -62,12 +67,12 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d return report, mg_preport, mt_preport, mp_preport, de_input -def make_general_reports(out, exps, max_lines=1000000): +def make_general_reports(out, exps, max_lines=1000000, did_assembly=True): mg_report = mt_report = mp_report = de_input = pd.DataFrame(columns=['Entry']) writer = pd.ExcelWriter(f'{out}/MOSCA_General_Report.xlsx', engine='xlsxwriter') for sample in set(exps['Sample']): report, mg_report, mt_report, mp_report, de_input = make_general_report( - out, exps, sample, mg_report, mt_report, mp_report, de_input) + out, exps, sample, mg_report, mt_report, mp_report, de_input, did_assembly=did_assembly) timed_message(f'Writing General Report for sample: {sample}.') if len(report) < max_lines: report.to_excel(writer, sheet_name=sample, index=False) @@ -103,7 +108,7 @@ def make_general_reports(out, exps, max_lines=1000000): def run(): exps = pd.read_csv(snakemake.params.exps, sep='\t') - make_general_reports(snakemake.params.output, exps) + make_general_reports(snakemake.params.output, exps, did_assembly=snakemake.params.did_assembly) if __name__ == '__main__': diff --git a/workflow/scripts/mosca_tools.py b/workflow/scripts/mosca_tools.py index a63700c..ddfcc1e 100644 --- a/workflow/scripts/mosca_tools.py +++ b/workflow/scripts/mosca_tools.py @@ -143,7 +143,7 @@ def perform_alignment(reference, reads, basename, threads=1): print(f'{basename}.log was found!') run_pipe_command( f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c | \ -awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts') + awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts') def fastq2fasta(fastq, output): diff --git a/workflow/scripts/quantification.py b/workflow/scripts/quantification.py index f68d3fb..a181c7f 100644 --- a/workflow/scripts/quantification.py +++ b/workflow/scripts/quantification.py @@ -11,18 +11,19 @@ from mosca_tools import perform_alignment, normalize_counts_by_size -def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple: +def quantification(exps: pd.DataFrame, output: str, sample: str, did_assembly: bool = True) -> tuple: """ Perform quantification of reads with contigs as reference :param exps: DataFrame with the experiments :param output: Output directory :param sample: Sample name + :param did_assembly: Whether assembly was performed or not """ mg_result = mg_result_norm = pd.DataFrame(columns=['Contig']) mt_result = mt_result_norm = pd.DataFrame(columns=['Gene']) pexps = exps[(exps['Sample'] == sample)] for i in pexps.index: - if pexps.loc[i]['Data type'] == 'mrna': + if pexps.loc[i]['Data type'] == 'mrna' or not did_assembly: reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn" elif pexps.loc[i]['Data type'] == 'dna': reference = f"{output}/Assembly/{pexps.loc[i]['Sample']}/contigs.fasta" @@ -53,38 +54,25 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) - return mg_result, mg_result_norm, mt_result, mt_result_norm -def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple: - mg_result = mg_result_norm = pd.DataFrame(columns=['Contig']) - mt_result = mt_result_norm = pd.DataFrame(columns=['Gene']) - pexps = exps[(exps['Sample'] == sample)] - for i in pexps.index: - if pexps.loc[i]['Data type'] in ['mrna', 'dna']: - reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn" - else: - continue - return mg_result, mg_result_norm, mt_result, mt_result_norm - - def run(): exps = pd.read_csv(snakemake.params.exps, sep='\t') + did_assembly=snakemake.params.did_assembly for sample in set(exps['Sample']): - if snakemake.params.did_assembly: - mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_with_assembly( - exps, snakemake.params.output, sample) - else: - mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_without_assembly( - exps, snakemake.params.output, sample) + mg_result, mg_result_norm, mt_result, mt_result_norm = quantification( + exps, snakemake.params.output, sample, did_assembly=did_assembly) if len(mg_result) > 0: mg_result.to_csv( f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False) - mg_result_norm.to_csv( - f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False) + if did_assembly: + mg_result_norm.to_csv( + f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False) if len(mt_result) > 0: mt_result.to_csv( f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False) - mt_result_norm.astype(int, errors='ignore').to_csv( - f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False) + if did_assembly: + mt_result_norm.astype(int, errors='ignore').to_csv( + f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False) if __name__ == '__main__': diff --git a/workflow/scripts/summary_report.py b/workflow/scripts/summary_report.py index 2892627..920bf6e 100644 --- a/workflow/scripts/summary_report.py +++ b/workflow/scripts/summary_report.py @@ -48,6 +48,7 @@ def write_versions_report(output): class Reporter: def __init__(self): self.report = pd.DataFrame() + self.sample2name = {} def info_from_preprocessing(self, out_dir): timed_message('Processing results of: preprocessing.') @@ -74,7 +75,7 @@ def info_from_assembly(self, out_dir): sample = file.split('/')[-3] self.report = pd.concat([self.report, pd.Series(name=sample, dtype='object')]) data = pd.read_csv(file, sep='\t', index_col='Assembly') - self.report.loc[sample, ['# contigs', 'N50', 'Reads aligned (%)']] = ( + self.report.loc[self.sample2name[sample], ['# contigs', 'N50', 'Reads aligned (%)']] = ( int(data.loc['# contigs', 'contigs']), int(data.loc['N50', 'contigs']), data.loc['Reads aligned (%)', 'contigs']) @@ -86,7 +87,7 @@ def info_from_binning(self, out_dir): timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}') sample = file.split('/')[-2] data = pd.read_csv(file, sep='\t') - self.report.loc[sample, ['# high-qual MAGs', '# medium-qual MAGs', '# low-qual MAGs']] = ( + self.report.loc[self.sample2name[sample], ['# high-qual MAGs', '# medium-qual MAGs', '# low-qual MAGs']] = ( ((data['Completeness'] >= 90) & (data['Contamination'] <= 5)).sum(), ((data['Completeness'] >= 50) & (data['Completeness'] < 90) & (data['Contamination'] <= 10)).sum(), ((data['Completeness'] < 50) & (data['Contamination'] <= 10)).sum()) @@ -99,17 +100,17 @@ def info_from_annotation(self, out_dir): for file in fastas: timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}') sample = file.split('/')[-2] - self.report.loc[sample, '# genes'] = count_on_file('>', file) + self.report.loc[self.sample2name[sample], '# genes'] = count_on_file('>', file) for file in upimapi_res: timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}') sample = file.split('/')[-2] - self.report.loc[sample, '# annotations (UPIMAPI)'] = pd.read_csv( - file, sep='\t', low_memory=False)['qseqid'].unique().sum() + self.report.loc[self.sample2name[sample], '# annotations (UPIMAPI)'] = len(pd.read_csv( + file, sep='\t', low_memory=False)['qseqid'].unique()) for file in recognizer_res: timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}') sample = file.split('/')[-2] - self.report.loc[sample, '# annotations (reCOGnizer)'] = pd.read_csv( - file, sep='\t', low_memory=False)['qseqid'].unique().sum() + self.report.loc[self.sample2name[sample], '# annotations (reCOGnizer)'] = len(pd.read_csv( + file, sep='\t', low_memory=False)['qseqid'].unique()) def info_from_quantification(self, out_dir): timed_message('Processing results of: MT quantification.') @@ -149,15 +150,19 @@ def zip_outputs(self, out_dir): def run(self): timed_message('Writting final reports.') write_versions_report(f'{snakemake.params.output}/MOSCA_Versions_Report.xlsx') + exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t') + for sample in exps['Sample'].unique(): + self.sample2name = {sample: exps[exps['Sample'] == sample]['Name'].tolist()} self.info_from_preprocessing(snakemake.params.output) self.info_from_assembly(snakemake.params.output) self.info_from_binning(snakemake.params.output) self.info_from_annotation(snakemake.params.output) self.info_from_quantification(snakemake.params.output) - exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t') self.info_from_differential_expression( snakemake.params.output, cutoff=snakemake.params.cutoff, mp='protein' in exps['Data type'].tolist()) - self.report.to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t') + self.report[['Initial reads', 'Qual trim params', 'Final reads', '# genes', '# annotations (UPIMAPI)', + '# annotations (reCOGnizer)', 'Reads aligned (%)', '# differentially expressed' + ]].to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t') self.zip_outputs(snakemake.params.output)