From 11b19677a9412f1760f23610e7d44d7a490b9ad1 Mon Sep 17 00:00:00 2001 From: iquasere Date: Thu, 28 Dec 2023 14:49:00 +0000 Subject: [PATCH] Renamed Protein report to General report --- meta.yaml | 2 +- workflow/Snakefile | 9 +++++++-- workflow/mosca.py | 10 ++++++++-- workflow/rules/common.smk | 16 ---------------- workflow/rules/entry_report.smk | 2 +- workflow/rules/gene_calling.smk | 11 +++++++---- .../{protein_report.smk => general_report.smk} | 6 +++--- workflow/rules/summary_report.smk | 8 +++++--- .../{protein_report.py => general_report.py} | 12 ++++++------ workflow/scripts/summary_report.py | 6 +++--- 10 files changed, 41 insertions(+), 41 deletions(-) rename workflow/rules/{protein_report.smk => general_report.smk} (89%) rename workflow/scripts/{protein_report.py => general_report.py} (94%) diff --git a/meta.yaml b/meta.yaml index 305c773..5291ba1 100644 --- a/meta.yaml +++ b/meta.yaml @@ -17,7 +17,7 @@ build: requirements: run: - python >=3.9 - - snakemake + - snakemake <8 test: commands: diff --git a/workflow/Snakefile b/workflow/Snakefile index bfcf9d9..6fe3603 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -20,7 +20,7 @@ include: "rules/metaproteomics.smk" include: "rules/quantification.smk" include: "rules/normalization.smk" include: "rules/de_analysis.smk" -include: "rules/protein_report.smk" +include: "rules/general_report.smk" include: "rules/entry_report.smk" include: "rules/keggcharter.smk" include: "rules/summary_report.smk" @@ -28,4 +28,9 @@ include: "rules/summary_report.smk" ##### target rules ##### rule all: input: - all_input + [f"{OUTPUT}/MOSCA_General_Report.xlsx", + f"{OUTPUT}/MOSCA_Entry_Report.xlsx", + f"{OUTPUT}/MOSCA_Versions_Report.xlsx", + f"{OUTPUT}/MOSCA_Summary_Report.tsv", + f"{OUTPUT}/MOSCA_results.zip", + f"{OUTPUT}/KEGG_maps/KEGGCharter_results.tsv"] diff --git a/workflow/mosca.py b/workflow/mosca.py index 2ff3f28..8f12ae2 100644 --- a/workflow/mosca.py +++ b/workflow/mosca.py @@ -25,6 +25,11 @@ args = 
parser.parse_args() +def validate_config(config_data): + if not config_data['do_assembly'] and config_data['do_binning']: + sys.exit('ERROR: Can only do binning if assembly is performed.') + + def read_config(filename): if filename.split('.')[-1] == 'yaml': with open(filename) as stream: @@ -36,7 +41,7 @@ def read_config(filename): with open(filename) as f: return json.load(f), 'json' else: - exit('Config file must end in either ".json" or ".yaml"') + sys.exit('ERROR: Config file must end in either ".json" or ".yaml"') def save_config(config_data, filename, output_format): @@ -73,8 +78,9 @@ def validate_exps(exps_data): start_time = time() config, config_format = read_config(args.configfile) -pathlib.Path(config["output"]).mkdir(parents=True, exist_ok=True) +validate_config(config) validate_exps(config["experiments"]) +pathlib.Path(config["output"]).mkdir(parents=True, exist_ok=True) save_config(config, f'{config["output"]}/config.json', output_format=config_format) command = ( diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e5374d7..a01ce4b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -56,19 +56,3 @@ def join_reads_input(wildcards): return [f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[i]["Name"]}{fr}.fq' for i in range(len(df)) for fr in (['_forward_paired', '_reverse_paired'] if ',' in df.iloc[i]["Files"] else [''])] - -def fastq2fasta_input(wildcards): - return expand("{output}/Preprocess/Trimmomatic/quality_trimmed_{name}{fr}.fq", output=OUTPUT, - fr=(['_forward_paired', '_reverse_paired'] if EXPS["Files"].str.contains(',').tolist() else ''), - name=wildcards.sample) - -def gene_calling_input(wildcards): - if config['do_assembly']: - return expand("{output}/Assembly/{sample}/scaffolds.fasta", output=OUTPUT, sample=wildcards.sample) - return expand( - "{output}/Preprocess/piled_{name}.fasta", output=OUTPUT, name=wildcards.sample) - -def upimapi_input(wildcards): - if config['do_assembly']: - 
return expand("{output}/Annotation/{sample}/aligned.blast", output=OUTPUT, sample=set(EXPS['Sample'])) - return expand("{output}/Annotation/{name}/aligned.blast", output=OUTPUT, name=set(EXPS['Name'])) diff --git a/workflow/rules/entry_report.smk b/workflow/rules/entry_report.smk index cffb8b0..3d9c0db 100644 --- a/workflow/rules/entry_report.smk +++ b/workflow/rules/entry_report.smk @@ -1,6 +1,6 @@ rule entry_report: input: - p_reports = expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), + p_reports = expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), norm = f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv" output: f"{OUTPUT}/MOSCA_Entry_Report.xlsx", diff --git a/workflow/rules/gene_calling.smk b/workflow/rules/gene_calling.smk index 9024c5e..e70759b 100644 --- a/workflow/rules/gene_calling.smk +++ b/workflow/rules/gene_calling.smk @@ -1,6 +1,8 @@ rule fastq2fasta: input: - fastq2fasta_input + expand("{output}/Preprocess/Trimmomatic/quality_trimmed_{name}{fr}.fq", output=OUTPUT, + fr=(['_forward_paired', '_reverse_paired'] if EXPS["Files"].str.contains(',').tolist() else ''), + name=lambda wildcards: wildcards.sample) output: f"{OUTPUT}/Preprocess/piled_{{sample}}.fasta" threads: @@ -10,10 +12,11 @@ rule fastq2fasta: rule gene_calling: input: - gene_calling_input + (f"{OUTPUT}/Assembly/{{sample}}/scaffolds.fasta" if config['do_assembly'] else + f"{OUTPUT}/Preprocess/piled_{{sample}}.fasta") output: - expand("{output}/Annotation/{{sample}}/fgs.faa", output=OUTPUT), - expand("{output}/Annotation/{{sample}}/fgs.ffn", output=OUTPUT) + f"{OUTPUT}/Annotation/{{sample}}/fgs.faa", + f"{OUTPUT}/Annotation/{{sample}}/fgs.ffn" threads: config["threads"] params: diff --git a/workflow/rules/protein_report.smk b/workflow/rules/general_report.smk similarity index 89% rename from workflow/rules/protein_report.smk 
rename to workflow/rules/general_report.smk index fcda315..c78029b 100644 --- a/workflow/rules/protein_report.smk +++ b/workflow/rules/general_report.smk @@ -8,8 +8,8 @@ rule protein_report: expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample'])) output: - expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), - f"{OUTPUT}/MOSCA_Protein_Report.xlsx", + expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), + f"{OUTPUT}/MOSCA_General_Report.xlsx", f"{OUTPUT}/Quantification/dea_input.tsv", f"{OUTPUT}/Quantification/mg_entry_quant.tsv", f"{OUTPUT}/Quantification/mt_entry_quant.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_entry_quant.tsv" @@ -21,4 +21,4 @@ rule protein_report: conda: "../envs/reports.yaml" script: - "../scripts/protein_report.py" + "../scripts/general_report.py" diff --git a/workflow/rules/summary_report.smk b/workflow/rules/summary_report.smk index df9096c..51092db 100644 --- a/workflow/rules/summary_report.smk +++ b/workflow/rules/summary_report.smk @@ -1,11 +1,13 @@ rule summary_report: input: - expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(EXPS['Sample'])), + expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(EXPS['Sample'])), f"{OUTPUT}/MOSCA_Entry_Report.xlsx", - f"{OUTPUT}/DE_analysis/condition_treated_results.tsv" + f"{OUTPUT}/DE_analysis/condition_treated_results.tsv", + (expand("{output}/Binning/{sample}/checkm.tsv", output=OUTPUT, sample=set(EXPS['Sample'])) + if config['do_binning'] else []) output: f"{OUTPUT}/MOSCA_Versions_Report.xlsx", - f"{OUTPUT}/MOSCA_General_Report.tsv", + f"{OUTPUT}/MOSCA_Summary_Report.tsv", f"{OUTPUT}/MOSCA_results.zip" threads: 1 diff --git a/workflow/scripts/protein_report.py 
b/workflow/scripts/general_report.py similarity index 94% rename from workflow/scripts/protein_report.py rename to workflow/scripts/general_report.py index 842f8d3..6afa31e 100644 --- a/workflow/scripts/protein_report.py +++ b/workflow/scripts/general_report.py @@ -14,7 +14,7 @@ 'General functional category', 'Functional category', 'Protein description', 'COG ID', 'EC number (reCOGnizer)'] -def make_protein_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input): +def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input): timed_message(f'Joining data for sample: {sample}.') with open(f'{out}/Annotation/{sample}/fgs.faa') as f: lines = f.readlines() @@ -58,15 +58,15 @@ def make_protein_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer') report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna( value=0).astype(float).astype(int) - report.to_csv(f'{out}/MOSCA_{sample}_Protein_Report.tsv', sep='\t', index=False) + report.to_csv(f'{out}/MOSCA_{sample}_General_Report.tsv', sep='\t', index=False) return report, mg_preport, mt_preport, mp_preport, de_input -def make_protein_reports(out, exps, max_lines=1000000): +def make_general_reports(out, exps, max_lines=1000000): mg_report = mt_report = mp_report = de_input = pd.DataFrame(columns=['Entry']) - writer = pd.ExcelWriter(f'{out}/MOSCA_Protein_Report.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter(f'{out}/MOSCA_General_Report.xlsx', engine='xlsxwriter') for sample in set(exps['Sample']): - report, mg_report, mt_report, mp_report, de_input = make_protein_report( + report, mg_report, mt_report, mp_report, de_input = make_general_report( out, exps, sample, mg_report, mt_report, mp_report, de_input) timed_message(f'Writing Protein Report for sample: {sample}.') if len(report) < max_lines: @@ -103,7 +103,7 @@ def make_protein_reports(out, exps, 
max_lines=1000000): def run(): exps = pd.read_csv(snakemake.params.exps, sep='\t') - make_protein_reports(snakemake.params.output, exps) + make_general_reports(snakemake.params.output, exps) if __name__ == '__main__': diff --git a/workflow/scripts/summary_report.py b/workflow/scripts/summary_report.py index d5c50a1..abbfdfa 100644 --- a/workflow/scripts/summary_report.py +++ b/workflow/scripts/summary_report.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -General report construction and export +Summary report construction and export By João Sequeira @@ -139,7 +139,7 @@ def zip_outputs(self, out_dir): 'de_plots': glob(f'{out_dir}/DE_analysis/*.jpeg'), 'kegg_maps': glob(f'{out_dir}/KEGG_maps/*.png'), 'main_reports': [f'{out_dir}/{filename}' for filename in [ - 'MOSCA_Protein_Report.xlsx', 'MOSCA_Entry_Report.xlsx', 'MOSCA_General_Report.tsv']]} + 'MOSCA_General_Report.xlsx', 'MOSCA_Entry_Report.xlsx', 'MOSCA_Summary_Report.tsv']]} with ZipFile(f'{out_dir}/MOSCA_results.zip', 'w') as archive: for k, v in files_n_folders.items(): for file in v: @@ -157,7 +157,7 @@ def run(self): exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t') self.info_from_differential_expression( snakemake.params.output, cutoff=snakemake.params.cutoff, mp='protein' in exps['Data type'].tolist()) - self.report.to_csv(f'{snakemake.params.output}/MOSCA_General_Report.tsv', sep='\t') + self.report.to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t') self.zip_outputs(snakemake.params.output)