Added merging of paired-end reads before gene calling

Also several fixes in the Summary Report: consider "Samples" when filling the rows of "Names", and properly count the number of annotated sequences.
iquasere committed Jan 29, 2024
1 parent 762d2d1 commit 0a94e03
Showing 10 changed files with 98 additions and 48 deletions.
8 changes: 8 additions & 0 deletions resources/minimum_envs/merge_reads_min.yaml
@@ -0,0 +1,8 @@
name: merge_reads
channels:
- conda-forge
- bioconda
- anaconda
- defaults
dependencies:
- pear
16 changes: 16 additions & 0 deletions workflow/envs/merge_reads.yaml
@@ -0,0 +1,16 @@
name: merge_reads
channels:
- conda-forge
- bioconda
- anaconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- bzip2=1.0.8=hd590300_5
- libgcc-ng=13.2.0=h807b86a_4
- libgomp=13.2.0=h807b86a_4
- libzlib=1.2.13=hd590300_5
- pear=0.9.6=h9d449c0_10
- zlib=1.2.13=hd590300_5
prefix: /opt/conda/envs/merge_reads
11 changes: 11 additions & 0 deletions workflow/rules/common.smk
@@ -50,3 +50,14 @@ def sample_to_reads(wildcards):
df = mg_exps[mg_exps['Sample'] == wildcards.sample].reset_index()
return [f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]["Name"]}{fr}.fq' for row in range(len(df))
for fr in (['_forward_paired', '_reverse_paired'] if ',' in df.iloc[row]["Files"] else [''])]


def gene_calling_input(wildcards):
df = mg_exps[mg_exps['Sample'] == wildcards.sample].reset_index()
result = []
for row in range(len(df)):
if ',' in df.iloc[row]["Files"]:
result.append(f'{OUTPUT}/Preprocess/Trimmomatic/{df.iloc[row]["Name"]}.assembled.fastq')
else:
result.append(f'{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]["Name"]}.fq')
return result
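For reference, a minimal sketch of how the new gene_calling_input helper resolves its inputs, using a mock experiments table. The column names follow the diff above; the sample, library, and file names are hypothetical.

import pandas as pd

OUTPUT = "output"  # stand-in for the pipeline's output directory
mg_exps = pd.DataFrame({
    "Sample": ["S1", "S1"],
    "Name": ["lib_paired", "lib_single"],
    "Files": ["r1.fq,r2.fq", "r.fq"],  # a comma marks a paired-end library
})

def gene_calling_input_sketch(sample):
    df = mg_exps[mg_exps["Sample"] == sample].reset_index()
    result = []
    for row in range(len(df)):
        if "," in df.iloc[row]["Files"]:
            # paired-end: gene calling reads the PEAR-merged file
            result.append(f"{OUTPUT}/Preprocess/Trimmomatic/{df.iloc[row]['Name']}.assembled.fastq")
        else:
            # single-end: gene calling reads the quality-trimmed file directly
            result.append(f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{df.iloc[row]['Name']}.fq")
    return result

print(gene_calling_input_sketch("S1"))
# ['output/Preprocess/Trimmomatic/lib_paired.assembled.fastq',
#  'output/Preprocess/Trimmomatic/quality_trimmed_lib_single.fq']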
18 changes: 17 additions & 1 deletion workflow/rules/gene_calling.smk
@@ -1,6 +1,22 @@
rule merge_reads:
input:
forward_reads=f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{{name}}_forward_paired.fq",
reverse_reads=f"{OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{{name}}_reverse_paired.fq"
output:
expand("{output}/Preprocess/Trimmomatic/{{name}}.assembled.fastq", output=OUTPUT)
threads:
config["threads"]
params:
memory = config["max_memory"]
conda:
"../envs/merge_reads.yaml"
shell:
"pear -f {input.forward_reads} -r {input.reverse_reads} -o {OUTPUT}/Preprocess/Trimmomatic/{wildcards.name} "
"-j {threads} -y {params.memory}G"

rule fastq2fasta:
input:
sample_to_reads
gene_calling_input
output:
expand("{output}/Preprocess/piled_{{sample}}.fasta", output=OUTPUT)
threads:
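As a rough illustration of what the merge_reads rule above runs, here is the PEAR command it should expand to, with hypothetical values standing in for Snakemake's wildcards and config (a sketch, not the rendered rule itself).

OUTPUT = "output"    # assumed output directory
name = "lib_paired"  # the {name} wildcard
threads = 4          # config["threads"]
memory = 16          # config["max_memory"], in gigabytes

cmd = (
    f"pear -f {OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{name}_forward_paired.fq "
    f"-r {OUTPUT}/Preprocess/Trimmomatic/quality_trimmed_{name}_reverse_paired.fq "
    f"-o {OUTPUT}/Preprocess/Trimmomatic/{name} "
    f"-j {threads} -y {memory}G"
)
print(cmd)
# PEAR appends ".assembled.fastq" to the -o prefix, which is exactly the file
# gene_calling_input expects for paired-end libraries.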
5 changes: 3 additions & 2 deletions workflow/rules/general_report.smk
@@ -3,9 +3,9 @@ rule general_report:
expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),
expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])) if len(mg_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 or not config['assembly']) else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 and config['do_assembly']) else [],
expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])) if len(mt_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 or not config['assembly']) else [],
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 and config['do_assembly']) else [],
expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample']))
output:
expand("{output}/MOSCA_{sample}_General_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
@@ -19,6 +19,7 @@ rule general_report:
params:
output = OUTPUT,
exps = f"{OUTPUT}/exps.tsv",
did_assembly = config['do_assembly']
conda:
"../envs/reports.yaml"
script:
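The condition fix above means normalized count tables are only requested when experiments of that omics type exist and an assembly was produced (the old "or not config['assembly']" could request them for runs without an assembly). A minimal sketch of the corrected logic, with a hypothetical norm_targets helper:

def norm_targets(output, samples, did_assembly):
    # normalized per-contig tables only exist when an assembly was produced
    if samples and did_assembly:
        return [f"{output}/Quantification/{s}_mg_norm.tsv" for s in sorted(samples)]
    return []

print(norm_targets("output", {"S1"}, did_assembly=True))   # ['output/Quantification/S1_mg_norm.tsv']
print(norm_targets("output", {"S1"}, did_assembly=False))  # []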
4 changes: 2 additions & 2 deletions workflow/rules/quantification.smk
@@ -7,9 +7,9 @@ rule quantification:
output:
expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])),
expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])) if len(mg_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 or not config['assembly']) else [],
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mg_exps) > 0 and config['do_assembly']) else [],
expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])) if len(mt_exps) > 0 else [],
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 or not config['assembly']) else []
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) if (len(mt_exps) > 0 and config['do_assembly']) else []
threads:
config["threads"]
params:
23 changes: 14 additions & 9 deletions workflow/scripts/general_report.py
@@ -14,7 +14,7 @@
'General functional category', 'Functional category', 'Protein description', 'COG ID', 'EC number (reCOGnizer)']


def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input):
def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, de_input, did_assembly=True):
timed_message(f'Joining data for sample: {sample}.')
with open(f'{out}/Annotation/{sample}/fgs.faa') as f:
lines = f.readlines()
@@ -32,18 +32,23 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
report = report.rename(columns={
**{f'{col}_x': f'{col} (UPIMAPI)' for col in rename_cols},
**{f'{col}_y': f'{col} (reCOGnizer)' for col in rename_cols}})
report['Contig'] = report['qseqid'].apply(lambda x: x.split('_')[1])
if did_assembly:
report['Contig'] = report['qseqid'].apply(lambda x: x.split('_')[1])
mg_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'dna')]['Name'].tolist()
mt_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'mrna')]['Name'].tolist()
mp_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'protein')]['Name'].tolist()
if len(mg_names) > 0:
mg_counts = pd.read_csv(f'{out}/Quantification/{sample}_mg_norm.tsv', sep='\t')
mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1])
report = pd.merge(report, mg_counts, on='Contig', how='left')
mg_counts = pd.read_csv((f'{out}/Quantification/{sample}_mg_norm.tsv' if did_assembly else
f'{out}/Quantification/{sample}_mg.readcounts'), sep='\t', names=[
'qseqid'] + mg_names, skiprows=1)
if did_assembly:
mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1])
report = pd.merge(report, mg_counts, on=('Contig' if did_assembly else 'qseqid'), how='left')
mg_preport = pd.merge(mg_preport, report[['Entry'] + mg_names], on='Entry', how='outer')
if len(mt_names) > 0:
report = pd.merge(
report, pd.read_csv(f'{out}/Quantification/{sample}_mt_norm.tsv', sep='\t', names=[
report, pd.read_csv((f'{out}/Quantification/{sample}_mt_norm.tsv' if did_assembly else
f'{out}/Quantification/{sample}_mt.readcounts'), sep='\t', names=[
'qseqid'] + mt_names), on='qseqid', how='left')
mt_preport = pd.merge(mt_preport, report[['Entry'] + mt_names], on='Entry', how='outer')
# MT readcounts come normalized by gene size, and therefore not fit for DE (as they are floats).
@@ -62,12 +67,12 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
return report, mg_preport, mt_preport, mp_preport, de_input


def make_general_reports(out, exps, max_lines=1000000):
def make_general_reports(out, exps, max_lines=1000000, did_assembly=True):
mg_report = mt_report = mp_report = de_input = pd.DataFrame(columns=['Entry'])
writer = pd.ExcelWriter(f'{out}/MOSCA_General_Report.xlsx', engine='xlsxwriter')
for sample in set(exps['Sample']):
report, mg_report, mt_report, mp_report, de_input = make_general_report(
out, exps, sample, mg_report, mt_report, mp_report, de_input)
out, exps, sample, mg_report, mt_report, mp_report, de_input, did_assembly=did_assembly)
timed_message(f'Writing General Report for sample: {sample}.')
if len(report) < max_lines:
report.to_excel(writer, sheet_name=sample, index=False)
@@ -103,7 +108,7 @@ def make_general_reports(out, exps, max_lines=1000000):

def run():
exps = pd.read_csv(snakemake.params.exps, sep='\t')
make_general_reports(snakemake.params.output, exps)
make_general_reports(snakemake.params.output, exps, did_assembly=snakemake.params.did_assembly)


if __name__ == '__main__':
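A toy illustration of why the merge key changes with did_assembly in make_general_report: with an assembly, metagenomics counts are per contig and merged on 'Contig'; without one, raw readcounts are per predicted gene and merged on 'qseqid'. The IDs and numbers below are made up.

import pandas as pd

report = pd.DataFrame({"qseqid": ["k141_10_1", "k141_11_2"], "Contig": ["10", "11"]})
mg_norm = pd.DataFrame({"Contig": ["10", "11"], "mg_lib": [120.0, 45.0]})                 # with assembly
mg_readcounts = pd.DataFrame({"qseqid": ["k141_10_1", "k141_11_2"], "mg_lib": [80, 30]})  # without assembly

with_assembly = pd.merge(report, mg_norm, on="Contig", how="left")
without_assembly = pd.merge(report, mg_readcounts, on="qseqid", how="left")
print(with_assembly)
print(without_assembly)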
2 changes: 1 addition & 1 deletion workflow/scripts/mosca_tools.py
@@ -143,7 +143,7 @@ def perform_alignment(reference, reads, basename, threads=1):
print(f'{basename}.log was found!')
run_pipe_command(
f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c | \
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')


def fastq2fasta(fastq, output):
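For context on the pipeline above: samtools view -F 260 drops unmapped and secondary alignments (flags 4 + 256), cut -f 3 keeps the reference name, sort | uniq -c tallies reads per reference, and awk rewrites that as a tab-separated reference/count file. A small sketch of reading that output (the file path is hypothetical):

import pandas as pd

# two columns, no header: the aligned-to reference and the number of mapped reads
readcounts = pd.read_csv("sample_mg.readcounts", sep="\t", names=["Reference", "Reads"])
print(readcounts.head())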
36 changes: 12 additions & 24 deletions workflow/scripts/quantification.py
@@ -11,18 +11,19 @@
from mosca_tools import perform_alignment, normalize_counts_by_size


def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
def quantification(exps: pd.DataFrame, output: str, sample: str, did_assembly: bool = True) -> tuple:
"""
Perform quantification of reads with contigs as reference
:param exps: DataFrame with the experiments
:param output: Output directory
:param sample: Sample name
:param did_assembly: Whether assembly was performed or not
"""
mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
pexps = exps[(exps['Sample'] == sample)]
for i in pexps.index:
if pexps.loc[i]['Data type'] == 'mrna':
if pexps.loc[i]['Data type'] == 'mrna' or not did_assembly:
reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn"
elif pexps.loc[i]['Data type'] == 'dna':
reference = f"{output}/Assembly/{pexps.loc[i]['Sample']}/contigs.fasta"
@@ -53,38 +54,25 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -
return mg_result, mg_result_norm, mt_result, mt_result_norm


def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
pexps = exps[(exps['Sample'] == sample)]
for i in pexps.index:
if pexps.loc[i]['Data type'] in ['mrna', 'dna']:
reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn"
else:
continue
return mg_result, mg_result_norm, mt_result, mt_result_norm


def run():
exps = pd.read_csv(snakemake.params.exps, sep='\t')
did_assembly=snakemake.params.did_assembly

for sample in set(exps['Sample']):
if snakemake.params.did_assembly:
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_with_assembly(
exps, snakemake.params.output, sample)
else:
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_without_assembly(
exps, snakemake.params.output, sample)
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification(
exps, snakemake.params.output, sample, did_assembly=did_assembly)
if len(mg_result) > 0:
mg_result.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
mg_result_norm.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if did_assembly:
mg_result_norm.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if len(mt_result) > 0:
mt_result.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)
if did_assembly:
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)


if __name__ == '__main__':
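The merged quantification function keeps one rule for choosing the alignment reference. A minimal sketch of that decision, with a hypothetical pick_reference helper:

def pick_reference(output, sample, data_type, did_assembly):
    # mRNA reads, and any reads when no assembly exists, are aligned to predicted genes;
    # DNA reads are aligned to contigs only when an assembly was produced
    if data_type == "mrna" or not did_assembly:
        return f"{output}/Annotation/{sample}/fgs.ffn"
    if data_type == "dna":
        return f"{output}/Assembly/{sample}/contigs.fasta"
    raise ValueError(f"unexpected data type: {data_type}")

print(pick_reference("output", "S1", "dna", did_assembly=False))  # output/Annotation/S1/fgs.ffn
print(pick_reference("output", "S1", "dna", did_assembly=True))   # output/Assembly/S1/contigs.fasta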
23 changes: 14 additions & 9 deletions workflow/scripts/summary_report.py
@@ -48,6 +48,7 @@ def write_versions_report(output):
class Reporter:
def __init__(self):
self.report = pd.DataFrame()
self.sample2name = {}

def info_from_preprocessing(self, out_dir):
timed_message('Processing results of: preprocessing.')
@@ -74,7 +75,7 @@ def info_from_assembly(self, out_dir):
sample = file.split('/')[-3]
self.report = pd.concat([self.report, pd.Series(name=sample, dtype='object')])
data = pd.read_csv(file, sep='\t', index_col='Assembly')
self.report.loc[sample, ['# contigs', 'N50', 'Reads aligned (%)']] = (
self.report.loc[self.sample2name[sample], ['# contigs', 'N50', 'Reads aligned (%)']] = (
int(data.loc['# contigs', 'contigs']),
int(data.loc['N50', 'contigs']),
data.loc['Reads aligned (%)', 'contigs'])
@@ -86,7 +87,7 @@ def info_from_binning(self, out_dir):
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
data = pd.read_csv(file, sep='\t')
self.report.loc[sample, ['# high-qual MAGs', '# medium-qual MAGs', '# low-qual MAGs']] = (
self.report.loc[self.sample2name[sample], ['# high-qual MAGs', '# medium-qual MAGs', '# low-qual MAGs']] = (
((data['Completeness'] >= 90) & (data['Contamination'] <= 5)).sum(),
((data['Completeness'] >= 50) & (data['Completeness'] < 90) & (data['Contamination'] <= 10)).sum(),
((data['Completeness'] < 50) & (data['Contamination'] <= 10)).sum())
@@ -99,17 +100,17 @@ def info_from_annotation(self, out_dir):
for file in fastas:
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
self.report.loc[sample, '# genes'] = count_on_file('>', file)
self.report.loc[self.sample2name[sample], '# genes'] = count_on_file('>', file)
for file in upimapi_res:
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
self.report.loc[sample, '# annotations (UPIMAPI)'] = pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique().sum()
self.report.loc[self.sample2name[sample], '# annotations (UPIMAPI)'] = len(pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique())
for file in recognizer_res:
timed_message(f'Obtaining info from: {"/".join(file.split("/")[-3:])}')
sample = file.split('/')[-2]
self.report.loc[sample, '# annotations (reCOGnizer)'] = pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique().sum()
self.report.loc[self.sample2name[sample], '# annotations (reCOGnizer)'] = len(pd.read_csv(
file, sep='\t', low_memory=False)['qseqid'].unique())

def info_from_quantification(self, out_dir):
timed_message('Processing results of: MT quantification.')
@@ -149,15 +150,19 @@ def zip_outputs(self, out_dir):
def run(self):
timed_message('Writting final reports.')
write_versions_report(f'{snakemake.params.output}/MOSCA_Versions_Report.xlsx')
exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t')
for sample in exps['Sample'].unique():
self.sample2name = {sample: exps[exps['Sample'] == sample]['Name'].tolist()}
self.info_from_preprocessing(snakemake.params.output)
self.info_from_assembly(snakemake.params.output)
self.info_from_binning(snakemake.params.output)
self.info_from_annotation(snakemake.params.output)
self.info_from_quantification(snakemake.params.output)
exps = pd.read_csv(f'{snakemake.params.output}/exps.tsv', sep='\t')
self.info_from_differential_expression(
snakemake.params.output, cutoff=snakemake.params.cutoff, mp='protein' in exps['Data type'].tolist())
self.report.to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t')
self.report[['Initial reads', 'Qual trim params', 'Final reads', '# genes', '# annotations (UPIMAPI)',
'# annotations (reCOGnizer)', 'Reads aligned (%)', '# differentially expressed'
]].to_csv(f'{snakemake.params.output}/MOSCA_Summary_Report.tsv', sep='\t')
self.zip_outputs(snakemake.params.output)


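A minimal sketch of the two Summary Report fixes mentioned in the commit message: per-sample results are written to the rows of the names belonging to that sample, and annotations are counted as the number of distinct query IDs rather than summed. The data below is made up.

import pandas as pd

exps = pd.DataFrame({"Sample": ["S1", "S1"], "Name": ["mg_lib", "mt_lib"]})
sample2name = {s: exps[exps["Sample"] == s]["Name"].tolist() for s in exps["Sample"].unique()}
print(sample2name)  # {'S1': ['mg_lib', 'mt_lib']}

upimapi = pd.DataFrame({"qseqid": ["gene_1", "gene_1", "gene_2"]})
print(len(upimapi["qseqid"].unique()))  # 2 distinct annotated queries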