Fix when converting counts to int

They all became 0s. Renamed protein_report rule to general_report. Added mosca_logo inclusion on the install.bash script. Renamed the min env yamls.
iquasere · Jan 26, 2024 · 762d2d1 · 762d2d1
1 parent 4c707d2
commit 762d2d1
Show file tree

Hide file tree

Showing 22 changed files with 128 additions and 107 deletions.
diff --git a/cicd/install.bash b/cicd/install.bash
@@ -51,7 +51,7 @@ echo "Storing MOSCA's files in the Conda environment at: ${mosca_env}"
 # create folders for storing MOSCA's YAMLs and scripts
 mkdir -p "${mosca_env}/share/MOSCA" "${mosca_env}/bin"
 # copy YAMLs and scripts and default values to the MOSCA Conda environment
-cp -r MOSCA/workflow/* MOSCA/resources/*.json "${mosca_env}/share/MOSCA"
+cp -r -v MOSCA/workflow/* MOSCA/resources/*.json MOSCA/resources/*.txt "${mosca_env}/share/MOSCA"
 # make MOSCA's main script executable
 chmod +x "${mosca_env}/share/MOSCA/mosca.py"
 # create a symbolic link to MOSCA's main script in the bin folder

diff --git a/cicd/meta.yaml b/cicd/meta.yaml
@@ -21,6 +21,7 @@ requirements:
     - python >=3.9, <3.12
     - pandas
     - snakemake <8
+    - pyarrow
 
 test:
   commands:

diff --git a/resources/minimum_envs/.README b/resources/minimum_envs/.README
@@ -3,6 +3,8 @@ This folder contains the minimum tools required for each MOSCA environment.
 When updating a new environment, the following commands should be run from this directory, assigning the correct value for the `ENV_NAME` variable:
 `bash
 ENV_NAME=env_name
-mamba create $ENV_NAME.yml
-mamba env export --from-history -f $ENV_NAME.yml > $ENV_NAME.yml
-`
+mamba env create -f ${ENV_NAME}_min.yaml
+conda activate $ENV_NAME
+mamba env export --from-history > $ENV_NAME.yaml
+`
+The `--from-history` flag produces a cross-platform export, but one that does not pin the full versions of the packages.
diff --git a/resources/minimum_envs/assembly.yaml → resources/minimum_envs/assembly_min.yaml b/resources/minimum_envs/assembly.yaml → resources/minimum_envs/assembly_min.yaml
diff --git a/resources/minimum_envs/binning.yaml → resources/minimum_envs/binning_min.yaml b/resources/minimum_envs/binning.yaml → resources/minimum_envs/binning_min.yaml
diff --git a/resources/minimum_envs/de_analysis.yaml → resources/minimum_envs/de_analysis_min.yaml b/resources/minimum_envs/de_analysis.yaml → resources/minimum_envs/de_analysis_min.yaml
diff --git a/resources/minimum_envs/gene_calling.yaml → resources/minimum_envs/gene_calling_min.yaml b/resources/minimum_envs/gene_calling.yaml → resources/minimum_envs/gene_calling_min.yaml
diff --git a/resources/minimum_envs/keggcharter.yaml → resources/minimum_envs/keggcharter_min.yaml b/resources/minimum_envs/keggcharter.yaml → resources/minimum_envs/keggcharter_min.yaml
diff --git a/resources/minimum_envs/metaproteomics.yaml → ...rces/minimum_envs/metaproteomics_min.yaml b/resources/minimum_envs/metaproteomics.yaml → ...rces/minimum_envs/metaproteomics_min.yaml
diff --git a/resources/minimum_envs/normalization.yaml → ...urces/minimum_envs/normalization_min.yaml b/resources/minimum_envs/normalization.yaml → ...urces/minimum_envs/normalization_min.yaml
diff --git a/resources/minimum_envs/preprocess.yaml → resources/minimum_envs/preprocess_min.yaml b/resources/minimum_envs/preprocess.yaml → resources/minimum_envs/preprocess_min.yaml
diff --git a/resources/minimum_envs/quantification.yaml → ...rces/minimum_envs/quantification_min.yaml b/resources/minimum_envs/quantification.yaml → ...rces/minimum_envs/quantification_min.yaml
diff --git a/resources/minimum_envs/recognizer.yaml → resources/minimum_envs/recognizer_min.yaml b/resources/minimum_envs/recognizer.yaml → resources/minimum_envs/recognizer_min.yaml
diff --git a/resources/minimum_envs/reports.yaml → resources/minimum_envs/reports_min.yaml b/resources/minimum_envs/reports.yaml → resources/minimum_envs/reports_min.yaml
diff --git a/resources/minimum_envs/seqkit.yaml → resources/minimum_envs/seqkit_min.yaml b/resources/minimum_envs/seqkit.yaml → resources/minimum_envs/seqkit_min.yaml
diff --git a/resources/minimum_envs/summary.yaml → resources/minimum_envs/summary_min.yaml b/resources/minimum_envs/summary.yaml → resources/minimum_envs/summary_min.yaml
diff --git a/resources/minimum_envs/upimapi.yaml → resources/minimum_envs/upimapi_min.yaml b/resources/minimum_envs/upimapi.yaml → resources/minimum_envs/upimapi_min.yaml
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -6,8 +6,7 @@ import sys
 min_version("6.4.1")
 
 ##### setup singularity #####
-# this container defines the underlying OS for each job when using the workflow
-# with --use-conda --use-singularity
+# this container defines the underlying OS for each job when using the workflow with --use-conda --use-singularity
 container: "docker://continuumio/miniconda3"
 
 ##### load rules #####
@@ -34,7 +33,7 @@ onstart:
         print(f.read())
     print('MOSCA analysis has begun.')
 
-##### target rules #####
+##### target rule #####
 rule all:
     input:
         f"{OUTPUT}/MOSCA_General_Report.xlsx",

diff --git a/workflow/envs/keggcharter.yaml b/workflow/envs/keggcharter.yaml
@@ -7,114 +7,127 @@ channels:
 dependencies:
   - _libgcc_mutex=0.1=conda_forge
   - _openmp_mutex=4.5=2_gnu
-  - biopython=1.81=py311h2582759_0
-  - boost-cpp=1.78.0=h6582d0a_3
-  - brotli=1.0.9=h166bdaf_8
-  - brotli-bin=1.0.9=h166bdaf_8
-  - bzip2=1.0.8=h7f98852_4
-  - c-ares=1.18.1=h7f98852_0
-  - ca-certificates=2022.12.7=ha878542_0
-  - cairo=1.16.0=h35add3b_1015
-  - certifi=2022.12.7=pyhd8ed1ab_0
+  - biopython=1.83=py312h98912ed_0
+  - brotli=1.1.0=hd590300_1
+  - brotli-bin=1.1.0=hd590300_1
+  - brotli-python=1.1.0=py312h30efb56_1
+  - bzip2=1.0.8=hd590300_5
+  - c-ares=1.25.0=hd590300_0
+  - ca-certificates=2023.11.17=hbcca054_0
+  - cairo=1.18.0=h3faef2a_0
+  - certifi=2023.11.17=pyhd8ed1ab_0
+  - chardet=5.2.0=py312h7900ff3_1
+  - charset-normalizer=3.3.2=pyhd8ed1ab_0
   - colorama=0.4.6=pyhd8ed1ab_0
-  - contourpy=1.0.7=py311ha3edf6b_0
-  - cycler=0.11.0=pyhd8ed1ab_0
+  - contourpy=1.2.0=py312h8572e83_0
+  - cycler=0.12.1=pyhd8ed1ab_0
   - et_xmlfile=1.1.0=pyhd8ed1ab_0
   - expat=2.5.0=hcb278e6_1
   - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
   - font-ttf-inconsolata=3.000=h77eed37_0
   - font-ttf-source-code-pro=2.038=h77eed37_0
-  - font-ttf-ubuntu=0.83=hab24e00_0
+  - font-ttf-ubuntu=0.83=h77eed37_1
   - fontconfig=2.14.2=h14ed4e7_0
   - fonts-conda-ecosystem=1=0
   - fonts-conda-forge=1=0
-  - fonttools=4.39.3=py311h2582759_0
-  - freetype=2.12.1=hca18f0e_1
+  - fonttools=4.47.2=py312h98912ed_0
+  - freetype=2.12.1=h267a509_2
+  - freetype-py=2.3.0=pyhd8ed1ab_0
   - gettext=0.21.1=h27087fc_0
-  - icu=72.1=hcb278e6_0
-  - keggcharter=1.0.2
+  - icu=73.2=h59595ed_0
+  - idna=3.6=pyhd8ed1ab_0
+  - keggcharter=1.1.2=hdfd78af_0
   - keyutils=1.6.1=h166bdaf_0
-  - kiwisolver=1.4.4=py311h4dd048b_1
-  - krb5=1.20.1=h81ceb04_0
-  - lcms2=2.15=haa2dc70_1
+  - kiwisolver=1.4.5=py312h8572e83_1
+  - krb5=1.21.2=h659d440_0
+  - lcms2=2.16=hb7c19ff_0
   - ld_impl_linux-64=2.40=h41732ed_0
   - lerc=4.0.0=h27087fc_0
-  - libblas=3.9.0=16_linux64_openblas
-  - libbrotlicommon=1.0.9=h166bdaf_8
-  - libbrotlidec=1.0.9=h166bdaf_8
-  - libbrotlienc=1.0.9=h166bdaf_8
-  - libcblas=3.9.0=16_linux64_openblas
-  - libcurl=8.0.1=h588be90_0
-  - libdeflate=1.18=h0b41bf4_0
+  - libblas=3.9.0=21_linux64_openblas
+  - libbrotlicommon=1.1.0=hd590300_1
+  - libbrotlidec=1.1.0=hd590300_1
+  - libbrotlienc=1.1.0=hd590300_1
+  - libcblas=3.9.0=21_linux64_openblas
+  - libcurl=8.5.0=hca28451_0
+  - libdeflate=1.19=hd590300_0
   - libedit=3.1.20191231=he28a2e2_2
-  - libev=4.33=h516909a_1
+  - libev=4.33=hd590300_2
   - libexpat=2.5.0=hcb278e6_1
   - libffi=3.4.2=h7f98852_5
-  - libgcc-ng=12.2.0=h65d4601_19
-  - libgfortran-ng=12.2.0=h69a702a_19
-  - libgfortran5=12.2.0=h337968e_19
-  - libglib=2.76.1=ha491796_0
-  - libgomp=12.2.0=h65d4601_19
-  - libiconv=1.17=h166bdaf_0
-  - libjpeg-turbo=2.1.5.1=h0b41bf4_0
-  - liblapack=3.9.0=16_linux64_openblas
-  - libnghttp2=1.52.0=h61bc06f_0
-  - libnsl=2.0.0=h7f98852_0
-  - libopenblas=0.3.21=pthreads_h78a6416_3
+  - libgcc-ng=13.2.0=h807b86a_3
+  - libgfortran-ng=13.2.0=h69a702a_3
+  - libgfortran5=13.2.0=ha4646dd_3
+  - libglib=2.78.3=h783c2da_0
+  - libgomp=13.2.0=h807b86a_3
+  - libiconv=1.17=hd590300_2
+  - libjpeg-turbo=3.0.0=hd590300_1
+  - liblapack=3.9.0=21_linux64_openblas
+  - libnghttp2=1.58.0=h47da74e_1
+  - libnsl=2.0.1=hd590300_0
+  - libopenblas=0.3.26=pthreads_h413a1c8_0
   - libpng=1.6.39=h753d276_0
-  - libsqlite=3.40.0=h753d276_0
-  - libssh2=1.10.0=hf14f497_3
-  - libstdcxx-ng=12.2.0=h46fd767_19
-  - libtiff=4.5.0=ha587672_6
+  - libsqlite=3.44.2=h2797004_0
+  - libssh2=1.11.0=h0841786_0
+  - libstdcxx-ng=13.2.0=h7e041cc_3
+  - libtiff=4.6.0=ha9c0a0a_2
   - libuuid=2.38.1=h0b41bf4_0
-  - libwebp-base=1.3.0=h0b41bf4_0
-  - libxcb=1.13=h7f98852_1004
-  - libzlib=1.2.13=h166bdaf_4
-  - matplotlib-base=3.7.1=py311h8597a09_0
+  - libwebp-base=1.3.2=hd590300_0
+  - libxcb=1.15=h0b41bf4_0
+  - libxcrypt=4.4.36=hd590300_1
+  - libxml2=2.12.4=h232c23b_1
+  - libxslt=1.1.39=h76b75d6_0
+  - libzlib=1.2.13=hd590300_5
+  - lxml=5.1.0=py312h37b5203_0
+  - matplotlib-base=3.8.2=py312he5832f3_0
   - mscorefonts=0.0.1=3
   - munkres=1.1.4=pyh9f0ad1d_0
-  - ncurses=6.3=h27087fc_1
+  - ncurses=6.4=h59595ed_2
   - nspr=4.35=h27087fc_0
-  - nss=3.89=he45b914_0
-  - numpy=1.24.2=py311h8e6699e_0
-  - openjpeg=2.5.0=hfec8fc6_2
-  - openpyxl=3.1.1=py311h2582759_0
-  - openssl=3.1.0=h0b41bf4_0
-  - packaging=23.1=pyhd8ed1ab_0
-  - pandas=2.0.0=py311h2872171_0
-  - pcre2=10.40=hc3806b6_0
-  - pillow=9.5.0=py311h573f0d3_0
-  - pip=23.1=pyhd8ed1ab_0
-  - pixman=0.40.0=h36c2ea0_0
-  - poppler=23.04.0=hf052cbe_1
+  - nss=3.97=h1d7d5a4_0
+  - numpy=1.26.3=py312heda63a1_0
+  - openjpeg=2.5.0=h488ebb8_3
+  - openpyxl=3.1.2=py312h98912ed_1
+  - openssl=3.2.0=hd590300_1
+  - packaging=23.2=pyhd8ed1ab_0
+  - pandas=2.2.0=py312hfb8ada1_0
+  - pcre2=10.42=hcad00b1_0
+  - pillow=10.2.0=py312hf3581a9_0
+  - pip=23.3.2=pyhd8ed1ab_0
+  - pixman=0.43.0=h59595ed_0
+  - poppler=24.01.0=h590f24d_0
   - poppler-data=0.4.12=hd8ed1ab_0
   - pthread-stubs=0.4=h36c2ea0_1001
-  - pyparsing=3.0.9=pyhd8ed1ab_0
-  - python=3.11.3=h2755cc3_0_cpython
+  - pycairo=1.25.1=py312he48a392_0
+  - pyparsing=3.1.1=pyhd8ed1ab_0
+  - pysocks=1.7.1=pyha2e5f31_6
+  - python=3.12.1=hab00c5b_1_cpython
   - python-dateutil=2.8.2=pyhd8ed1ab_0
-  - python-tzdata=2023.3=pyhd8ed1ab_0
-  - python_abi=3.11=3_cp311
-  - pytz=2023.3=pyhd8ed1ab_0
+  - python-tzdata=2023.4=pyhd8ed1ab_0
+  - python_abi=3.12=4_cp312
+  - pytz=2023.3.post1=pyhd8ed1ab_0
   - readline=8.2=h8228510_1
-  - reportlab=3.6.12=py311h2eb0c47_2
-  - setuptools=67.7.1=pyhd8ed1ab_0
+  - reportlab=4.0.9=py312h98912ed_0
+  - requests=2.31.0=pyhd8ed1ab_0
+  - rlpycairo=0.2.0=pyhd8ed1ab_0
+  - setuptools=69.0.3=pyhd8ed1ab_0
   - six=1.16.0=pyh6c4a22f_0
-  - tk=8.6.12=h27826a3_0
-  - tqdm=4.65.0=pyhd8ed1ab_1
-  - tzdata=2023c=h71feb2d_0
-  - wheel=0.40.0=pyhd8ed1ab_0
+  - tk=8.6.13=noxft_h4845f30_101
+  - tqdm=4.66.1=pyhd8ed1ab_0
+  - tzdata=2023d=h0c530f3_0
+  - urllib3=2.1.0=pyhd8ed1ab_0
+  - wheel=0.42.0=pyhd8ed1ab_0
   - xorg-kbproto=1.0.7=h7f98852_1002
-  - xorg-libice=1.0.10=h7f98852_0
-  - xorg-libsm=1.2.3=hd9c2040_1000
-  - xorg-libx11=1.8.4=h0b41bf4_0
-  - xorg-libxau=1.0.9=h7f98852_0
+  - xorg-libice=1.1.1=hd590300_0
+  - xorg-libsm=1.2.4=h7391055_0
+  - xorg-libx11=1.8.7=h8ee46fc_0
+  - xorg-libxau=1.0.11=hd590300_0
   - xorg-libxdmcp=1.1.3=h7f98852_0
   - xorg-libxext=1.3.4=h0b41bf4_2
-  - xorg-libxrender=0.9.10=h7f98852_1003
+  - xorg-libxrender=0.9.11=hd590300_0
   - xorg-renderproto=0.11.1=h7f98852_1002
   - xorg-xextproto=7.3.0=h0b41bf4_1003
   - xorg-xproto=7.0.31=h7f98852_1007
   - xz=5.2.6=h166bdaf_0
-  - zlib=1.2.13=h166bdaf_4
-  - zstd=1.5.2=h3eb15da_6
-prefix: /opt/conda/envs/keggcharter
+  - zlib=1.2.13=hd590300_5
+  - zstd=1.5.5=hfc55251_0
+prefix: /opt/conda/envs/keggcharter
diff --git a/workflow/rules/general_report.smk b/workflow/rules/general_report.smk
@@ -1,4 +1,4 @@
-rule protein_report:
+rule general_report:
     input:
         expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
         expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),

diff --git a/workflow/scripts/general_report.py b/workflow/scripts/general_report.py
@@ -1,5 +1,5 @@
 """
-MOSCA's script for producing Protein report
+MOSCA's script for producing General report
 
 By João Sequeira
 
@@ -57,7 +57,7 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
         report = pd.merge(report, spectracounts, on='qseqid', how='left')
         mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer')
     report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(
-        value=0).astype(float).astype(int)
+        value=0).astype(float)
     report.to_csv(f'{out}/MOSCA_{sample}_General_Report.tsv', sep='\t', index=False)
     return report, mg_preport, mt_preport, mp_preport, de_input
 
@@ -68,7 +68,7 @@ def make_general_reports(out, exps, max_lines=1000000):
     for sample in set(exps['Sample']):
         report, mg_report, mt_report, mp_report, de_input = make_general_report(
             out, exps, sample, mg_report, mt_report, mp_report, de_input)
-        timed_message(f'Writing Protein Report for sample: {sample}.')
+        timed_message(f'Writing General Report for sample: {sample}.')
         if len(report) < max_lines:
             report.to_excel(writer, sheet_name=sample, index=False)
         else:
@@ -78,7 +78,7 @@ def make_general_reports(out, exps, max_lines=1000000):
                 report.iloc[i:(i + j)].to_excel(writer, sheet_name=f'{sample} ({k})', index=False)
                 k += 1
     writer.close()
-    # Write quantification matrices to normalize all together
+    # Write quantification matrices to normalize all together - these reports have counts from the entire experiment, not just a single "sample"
     timed_message('Writing quantification matrices.')
     if len(mg_report) > 0:
         mg_report[mg_report.columns.tolist()[1:]] = mg_report[mg_report.columns.tolist()[1:]].astype(float)

diff --git a/workflow/scripts/quantification.py b/workflow/scripts/quantification.py
@@ -11,7 +11,7 @@
 from mosca_tools import perform_alignment, normalize_counts_by_size
 
 
-def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> None:
+def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
     """
     Perform quantification of reads with contigs as reference
     :param exps: DataFrame with the experiments
@@ -29,7 +29,7 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -
         else:
             continue
         if ',' in pexps.loc[i]['Files']:
-            reads = [(f"{output}/Preprocess/Trimmomatic/quality_trimmed_{pexps.loc[i]['Name']}_{fr}_paired.fq")
+            reads = [f"{output}/Preprocess/Trimmomatic/quality_trimmed_{pexps.loc[i]['Name']}_{fr}_paired.fq"
                      for fr in ['forward', 'reverse']]
         else:
             reads = [f"{output}/Preprocess/Trimmomatic/quality_trimmed_{pexps.loc[i]['Name']}.fq"]
@@ -50,35 +50,41 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -
         else:
             mt_result = pd.merge(mt_result, counts, how='outer', on='Gene')
             mt_result_norm = pd.merge(mt_result_norm, normalized_counts, how='outer', on='Gene')
-    if len(mg_result) > 0:
-        mg_result.to_csv(
-            f"{output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
-        mg_result_norm.to_csv(
-            f"{output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
-    if len(mt_result) > 0:
-        mt_result.to_csv(
-            f"{output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
-        mt_result_norm.astype(int, errors='ignore').to_csv(
-            f"{output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)
+    return mg_result, mg_result_norm, mt_result, mt_result_norm
 
 
-def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> None:
+def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
     mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
     mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
     pexps = exps[(exps['Sample'] == sample)]
     for i in pexps.index:
-        pass
-
+        if pexps.loc[i]['Data type'] in ['mrna', 'dna']:
+            reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn"
+        else:
+            continue
+    return mg_result, mg_result_norm, mt_result, mt_result_norm
 
 
 def run():
     exps = pd.read_csv(snakemake.params.exps, sep='\t')
 
     for sample in set(exps['Sample']):
         if snakemake.params.did_assembly:
-            quantification_with_assembly(exps, snakemake.params.output, sample)
+            mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_with_assembly(
+                exps, snakemake.params.output, sample)
         else:
-            quantification_without_assembly(exps, snakemake.params.output, sample)
+            mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_without_assembly(
+                exps, snakemake.params.output, sample)
+        if len(mg_result) > 0:
+            mg_result.to_csv(
+                f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
+            mg_result_norm.to_csv(
+                f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
+        if len(mt_result) > 0:
+            mt_result.to_csv(
+                f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
+            mt_result_norm.astype(int, errors='ignore').to_csv(
+                f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)
 
 
 if __name__ == '__main__':
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,6 +21,7 @@ requirements: @@
         - python >=3.9, <3.12
         - pandas
         - snakemake <8
+        - pyarrow
     test:
       commands:
@@ Expand Down @@