From 11b19677a9412f1760f23610e7d44d7a490b9ad1 Mon Sep 17 00:00:00 2001
From: iquasere <maildosequeira@gmail.com>
Date: Thu, 28 Dec 2023 14:49:00 +0000
Subject: [PATCH] Renamed Protein report to General report

---
 meta.yaml                                        |  2 +-
 workflow/Snakefile                               |  9 +++++++--
 workflow/mosca.py                                | 10 ++++++++--
 workflow/rules/common.smk                        | 16 ----------------
 workflow/rules/entry_report.smk                  |  2 +-
 workflow/rules/gene_calling.smk                  | 11 +++++++----
 .../{protein_report.smk => general_report.smk}   |  6 +++---
 workflow/rules/summary_report.smk                |  8 +++++---
 .../{protein_report.py => general_report.py}     | 12 ++++++------
 workflow/scripts/summary_report.py               |  6 +++---
 10 files changed, 41 insertions(+), 41 deletions(-)
 rename workflow/rules/{protein_report.smk => general_report.smk} (89%)
 rename workflow/scripts/{protein_report.py => general_report.py} (94%)

diff --git a/meta.yaml b/meta.yaml
index 305c773..5291ba1 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -17,7 +17,7 @@ build:
 requirements:
   run:
     - python >=3.9
-    - snakemake
+    - snakemake <8
 
 test:
   commands:
diff --git a/workflow/Snakefile b/workflow/Snakefile
index bfcf9d9..6fe3603 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -20,7 +20,7 @@ include: "rules/metaproteomics.smk"
 include: "rules/quantification.smk"
 include: "rules/normalization.smk"
 include: "rules/de_analysis.smk"
-include: "rules/protein_report.smk"
+include: "rules/general_report.smk"
 include: "rules/entry_report.smk"
 include: "rules/keggcharter.smk"
 include: "rules/summary_report.smk"
@@ -28,4 +28,9 @@ include: "rules/summary_report.smk"
 ##### target rules #####
 rule all:
     input:
-        all_input
+        [f"{OUTPUT}/MOSCA_General_Report.xlsx",
+        f"{OUTPUT}/MOSCA_Entry_Report.xlsx",
+        f"{OUTPUT}/MOSCA_Versions_Report.xlsx",
+        f"{OUTPUT}/MOSCA_Summary_Report.tsv",
+        f"{OUTPUT}/MOSCA_results.zip",
+        f"{OUTPUT}/KEGG_maps/KEGGCharter_results.tsv"]
diff --git a/workflow/mosca.py b/workflow/mosca.py
index 2ff3f28..8f12ae2 100644
--- a/workflow/mosca.py
+++ b/workflow/mosca.py
@@ -25,6 +25,11 @@
 args = parser.parse_args()
 
 
+def validate_config(config_data):
+    if not config_data['do_assembly'] and config_data['do_binning']:
+        sys.exit('ERROR: Can only do binning if assembly is performed.')
+
+
 def read_config(filename):
     if filename.split('.')[-1] == 'yaml':
         with open(filename) as stream:
@@ -36,7 +41,7 @@ def read_config(filename):
         with open(filename) as f:
             return json.load(f), 'json'
     else:
-        exit('Config file must end in either ".json" or ".yaml"')
+        sys.exit('ERROR: Config file must end in either ".json" or ".yaml"')
 
 
 def save_config(config_data, filename, output_format):
@@ -73,8 +78,9 @@ def validate_exps(exps_data):
 
 start_time = time()
 config, config_format = read_config(args.configfile)
-pathlib.Path(config["output"]).mkdir(parents=True, exist_ok=True)
+validate_config(config)
 validate_exps(config["experiments"])
+pathlib.Path(config["output"]).mkdir(parents=True, exist_ok=True)
 save_config(config, f'{config["output"]}/config.json', output_format=config_format)
 
 command = (
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index e5374d7..a01ce4b 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -56,19 +56,3 @@ def join_reads_input(wildcards):
     return [f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[i]["Name"]}{fr}.fq'
            for i in range(len(df))
            for fr in (['_forward_paired', '_reverse_paired'] if ',' in df.iloc[i]["Files"] else [''])]
-
-def fastq2fasta_input(wildcards):
-    return expand("{output}/Preprocess/Trimmomatic/quality_trimmed_{name}{fr}.fq", output=OUTPUT,
-        fr=(['_forward_paired', '_reverse_paired'] if EXPS["Files"].str.contains(',').tolist() else ''),
-        name=wildcards.sample)
-
-def gene_calling_input(wildcards):
-    if config['do_assembly']:
-        return expand("{output}/Assembly/{sample}/scaffolds.fasta", output=OUTPUT, sample=wildcards.sample)
-    return expand(
-        "{output}/Preprocess/piled_{name}.fasta", output=OUTPUT, name=wildcards.sample)
-
-def upimapi_input(wildcards):
-    if config['do_assembly']:
-        return expand("{output}/Annotation/{sample}/aligned.blast", output=OUTPUT, sample=set(EXPS['Sample']))
-    return expand("{output}/Annotation/{name}/aligned.blast", output=OUTPUT, name=set(EXPS['Name']))
diff --git a/workflow/rules/entry_report.smk b/workflow/rules/entry_report.smk
index cffb8b0..3d9c0db 100644
--- a/workflow/rules/entry_report.smk
+++ b/workflow/rules/entry_report.smk
@@ -1,6 +1,6 @@
 rule entry_report:
     input:
-        p_reports = expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
+        p_reports = expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
         norm = f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv"
     output:
         f"{OUTPUT}/MOSCA_Entry_Report.xlsx",
diff --git a/workflow/rules/gene_calling.smk b/workflow/rules/gene_calling.smk
index 9024c5e..e70759b 100644
--- a/workflow/rules/gene_calling.smk
+++ b/workflow/rules/gene_calling.smk
@@ -1,6 +1,8 @@
 rule fastq2fasta:
     input:
-        fastq2fasta_input
+        expand("{output}/Preprocess/Trimmomatic/quality_trimmed_{{sample}}{fr}.fq",
+            output=OUTPUT,  # {{sample}} stays a rule wildcard; expand() would format a lambda's repr, not call it
+            fr=(['_forward_paired', '_reverse_paired'] if EXPS["Files"].str.contains(',').tolist() else ''))
     output:
         f"{OUTPUT}/Preprocess/piled_{{sample}}.fasta"
     threads:
@@ -10,10 +12,11 @@ rule fastq2fasta:
 
 rule gene_calling:
     input:
-        gene_calling_input
+        (f"{OUTPUT}/Assembly/{{sample}}/scaffolds.fasta" if config['do_assembly'] else
+        f"{OUTPUT}/Preprocess/piled_{{sample}}.fasta")
     output:
-        expand("{output}/Annotation/{{sample}}/fgs.faa", output=OUTPUT),
-        expand("{output}/Annotation/{{sample}}/fgs.ffn", output=OUTPUT)
+        f"{OUTPUT}/Annotation/{{sample}}/fgs.faa",
+        f"{OUTPUT}/Annotation/{{sample}}/fgs.ffn"
     threads:
         config["threads"]
     params:
diff --git a/workflow/rules/protein_report.smk b/workflow/rules/general_report.smk
similarity index 89%
rename from workflow/rules/protein_report.smk
rename to workflow/rules/general_report.smk
index fcda315..c78029b 100644
--- a/workflow/rules/protein_report.smk
+++ b/workflow/rules/general_report.smk
@@ -8,8 +8,8 @@ rule protein_report:
         expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
         expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample']))
     output:
-        expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
-        f"{OUTPUT}/MOSCA_Protein_Report.xlsx",
+        expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
+        f"{OUTPUT}/MOSCA_General_Report.xlsx",
         f"{OUTPUT}/Quantification/dea_input.tsv",
         f"{OUTPUT}/Quantification/mg_entry_quant.tsv",
         f"{OUTPUT}/Quantification/mt_entry_quant.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_entry_quant.tsv"
@@ -21,4 +21,4 @@ rule protein_report:
     conda:
         "../envs/reports.yaml"
     script:
-        "../scripts/protein_report.py"
+        "../scripts/general_report.py"
diff --git a/workflow/rules/summary_report.smk b/workflow/rules/summary_report.smk
index df9096c..51092db 100644
--- a/workflow/rules/summary_report.smk
+++ b/workflow/rules/summary_report.smk
@@ -1,11 +1,13 @@
 rule summary_report:
     input:
-        expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
+        expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
         f"{OUTPUT}/MOSCA_Entry_Report.xlsx",
-        f"{OUTPUT}/DE_analysis/condition_treated_results.tsv"
+        f"{OUTPUT}/DE_analysis/condition_treated_results.tsv",
+        (expand("{output}/Binning/{sample}/checkm.tsv", output=OUTPUT, sample=set(EXPS['Sample']))
+         if config['do_binning'] else [])
     output:
         f"{OUTPUT}/MOSCA_Versions_Report.xlsx",
-        f"{OUTPUT}/MOSCA_General_Report.tsv",
+        f"{OUTPUT}/MOSCA_Summary_Report.tsv",
         f"{OUTPUT}/MOSCA_results.zip"
     threads:
         1
diff --git a/workflow/scripts/protein_report.py b/workflow/scripts/general_report.py
similarity index 94%
rename from workflow/scripts/protein_report.py
rename to workflow/scripts/general_report.py
index 842f8d3..6afa31e 100644
--- a/workflow/scripts/protein_report.py
+++ b/workflow/scripts/general_report.py
@@ -14,7 +14,7 @@
     'General functional category', 'Functional category', 'Protein description', 'COG ID', 'EC number (reCOGnizer)']
 
 
-def make_protein_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input):
+def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input):
     timed_message(f'Joining data for sample: {sample}.')
     with open(f'{out}/Annotation/{sample}/fgs.faa') as f:
         lines = f.readlines()
@@ -58,15 +58,15 @@ def make_protein_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
         mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer')
     report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(
         value=0).astype(float).astype(int)
-    report.to_csv(f'{out}/MOSCA_{sample}_Protein_Report.tsv', sep='\t', index=False)
+    report.to_csv(f'{out}/MOSCA_{sample}_General_Report.tsv', sep='\t', index=False)
     return report, mg_preport, mt_preport, mp_preport, de_input
 
 
-def make_protein_reports(out, exps, max_lines=1000000):
+def make_general_reports(out, exps, max_lines=1000000):
     mg_report = mt_report = mp_report = de_input = pd.DataFrame(columns=['Entry'])
-    writer = pd.ExcelWriter(f'{out}/MOSCA_Protein_Report.xlsx', engine='xlsxwriter')
+    writer = pd.ExcelWriter(f'{out}/MOSCA_General_Report.xlsx', engine='xlsxwriter')
     for sample in set(exps['Sample']):
-        report, mg_report, mt_report, mp_report, de_input = make_protein_report(
+        report, mg_report, mt_report, mp_report, de_input = make_general_report(
             out, exps, sample, mg_report, mt_report, mp_report, de_input)
         timed_message(f'Writing Protein Report for sample: {sample}.')
         if len(report) < max_lines:
@@ -103,7 +103,7 @@ def make_protein_reports(out, exps, max_lines=1000000):
 
 def run():
     exps = pd.read_csv(snakemake.params.exps, sep='\t')
-    make_protein_reports(snakemake.params.output, exps)
+    make_general_reports(snakemake.params.output, exps)
 
 
 if __name__ == '__main__':
diff --git a/workflow/scripts/summary_report.py b/workflow/scripts/summary_report.py
index d5c50a1..abbfdfa 100644
--- a/workflow/scripts/summary_report.py
+++ b/workflow/scripts/summary_report.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-General report construction and export
+Summary report construction and export
 
 By João Sequeira
 
@@ -139,7 +139,7 @@ def zip_outputs(self, out_dir):
             'de_plots': glob(f'{out_dir}/DE_analysis/*.jpeg'),
             'kegg_maps': glob(f'{out_dir}/KEGG_maps/*.png'),
             'main_reports': [f'{out_dir}/{filename}' for filename in [
-                'MOSCA_Protein_Report.xlsx', 'MOSCA_Entry_Report.xlsx', 'MOSCA_General_Report.tsv']]}
+                'MOSCA_General_Report.xlsx', 'MOSCA_Entry_Report.xlsx', 'MOSCA_Summary_Report.tsv']]}
         with ZipFile(f'{out_dir}/MOSCA_results.zip', 'w') as archive:
             for k, v in files_n_folders.items():
                 for file in v:
@@ -157,7 +157,7 @@ def run(self):
         exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t')
         self.info_from_differential_expression(
             snakemake.params.output, cutoff=snakemake.params.cutoff, mp='protein' in exps['Data type'].tolist())
-        self.report.to_csv(f'{snakemake.params.output}/MOSCA_General_Report.tsv', sep='\t')
+        self.report.to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t')
         self.zip_outputs(snakemake.params.output)