Fix bug when converting counts to int
(they all became 0s)
Renamed protein_report rule to general_report
Added mosca_logo inclusion on the install.bash script
Renamed the min env yamls
iquasere committed Jan 26, 2024
1 parent 4c707d2 commit 762d2d1
Showing 22 changed files with 128 additions and 107 deletions.
2 changes: 1 addition & 1 deletion cicd/install.bash
Expand Up @@ -51,7 +51,7 @@ echo "Storing MOSCA's files in the Conda environment at: ${mosca_env}"
# create folders for storing MOSCA's YAMLs and scripts
mkdir -p "${mosca_env}/share/MOSCA" "${mosca_env}/bin"
# copy YAMLs and scripts and default values to the MOSCA Conda environment
cp -r MOSCA/workflow/* MOSCA/resources/*.json "${mosca_env}/share/MOSCA"
cp -r -v MOSCA/workflow/* MOSCA/resources/*.json MOSCA/resources/*.txt "${mosca_env}/share/MOSCA"
# make MOSCA's main script executable
chmod +x "${mosca_env}/share/MOSCA/"
# create a symbolic link to MOSCA's main script in the bin folder
1 change: 1 addition & 0 deletions cicd/meta.yaml
Expand Up @@ -21,6 +21,7 @@ requirements:
- python >=3.9, <3.12
- pandas
- snakemake <8
- pyarrow

8 changes: 5 additions & 3 deletions resources/minimum_envs/.README
Expand Up @@ -3,6 +3,8 @@ This folder contains the minimum tools required for each MOSCA environment.
When updating a new environment, the following commands should be run from this directory, assigning the correct value for the `ENV_NAME` variable:
mamba create $ENV_NAME.yml
mamba env export --from-history -f $ENV_NAME.yml > $ENV_NAME.yml
mamba env create -f ${ENV_NAME}_min.yaml
conda activate $ENV_NAME
mamba env export --from-history > $ENV_NAME.yaml
For a cross-platform export, include the `--from-history` flag; note that such an export does not record the full versions of the packages.
5 changes: 2 additions & 3 deletions workflow/Snakefile
Expand Up @@ -6,8 +6,7 @@ import sys

##### setup singularity #####
# this container defines the underlying OS for each job when using the workflow
# with --use-conda --use-singularity
# this container defines the underlying OS for each job when using the workflow with --use-conda --use-singularity
container: "docker://continuumio/miniconda3"

##### load rules #####
Expand All @@ -34,7 +33,7 @@ onstart:
print('MOSCA analysis has begun.')

##### target rules #####
##### target rule #####
rule all:
169 changes: 91 additions & 78 deletions workflow/envs/keggcharter.yaml
Expand Up @@ -7,114 +7,127 @@ channels:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- biopython=1.81=py311h2582759_0
- boost-cpp=1.78.0=h6582d0a_3
- brotli=1.0.9=h166bdaf_8
- brotli-bin=1.0.9=h166bdaf_8
- bzip2=1.0.8=h7f98852_4
- c-ares=1.18.1=h7f98852_0
- ca-certificates=2022.12.7=ha878542_0
- cairo=1.16.0=h35add3b_1015
- certifi=2022.12.7=pyhd8ed1ab_0
- biopython=1.83=py312h98912ed_0
- brotli=1.1.0=hd590300_1
- brotli-bin=1.1.0=hd590300_1
- brotli-python=1.1.0=py312h30efb56_1
- bzip2=1.0.8=hd590300_5
- c-ares=1.25.0=hd590300_0
- ca-certificates=2023.11.17=hbcca054_0
- cairo=1.18.0=h3faef2a_0
- certifi=2023.11.17=pyhd8ed1ab_0
- chardet=5.2.0=py312h7900ff3_1
- charset-normalizer=3.3.2=pyhd8ed1ab_0
- colorama=0.4.6=pyhd8ed1ab_0
- contourpy=1.0.7=py311ha3edf6b_0
- cycler=0.11.0=pyhd8ed1ab_0
- contourpy=1.2.0=py312h8572e83_0
- cycler=0.12.1=pyhd8ed1ab_0
- et_xmlfile=1.1.0=pyhd8ed1ab_0
- expat=2.5.0=hcb278e6_1
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
- font-ttf-inconsolata=3.000=h77eed37_0
- font-ttf-source-code-pro=2.038=h77eed37_0
- font-ttf-ubuntu=0.83=hab24e00_0
- font-ttf-ubuntu=0.83=h77eed37_1
- fontconfig=2.14.2=h14ed4e7_0
- fonts-conda-ecosystem=1=0
- fonts-conda-forge=1=0
- fonttools=4.39.3=py311h2582759_0
- freetype=2.12.1=hca18f0e_1
- fonttools=4.47.2=py312h98912ed_0
- freetype=2.12.1=h267a509_2
- freetype-py=2.3.0=pyhd8ed1ab_0
- gettext=0.21.1=h27087fc_0
- icu=72.1=hcb278e6_0
- keggcharter=1.0.2
- icu=73.2=h59595ed_0
- idna=3.6=pyhd8ed1ab_0
- keggcharter=1.1.2=hdfd78af_0
- keyutils=1.6.1=h166bdaf_0
- kiwisolver=1.4.4=py311h4dd048b_1
- krb5=1.20.1=h81ceb04_0
- lcms2=2.15=haa2dc70_1
- kiwisolver=1.4.5=py312h8572e83_1
- krb5=1.21.2=h659d440_0
- lcms2=2.16=hb7c19ff_0
- ld_impl_linux-64=2.40=h41732ed_0
- lerc=4.0.0=h27087fc_0
- libblas=3.9.0=16_linux64_openblas
- libbrotlicommon=1.0.9=h166bdaf_8
- libbrotlidec=1.0.9=h166bdaf_8
- libbrotlienc=1.0.9=h166bdaf_8
- libcblas=3.9.0=16_linux64_openblas
- libcurl=8.0.1=h588be90_0
- libdeflate=1.18=h0b41bf4_0
- libblas=3.9.0=21_linux64_openblas
- libbrotlicommon=1.1.0=hd590300_1
- libbrotlidec=1.1.0=hd590300_1
- libbrotlienc=1.1.0=hd590300_1
- libcblas=3.9.0=21_linux64_openblas
- libcurl=8.5.0=hca28451_0
- libdeflate=1.19=hd590300_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libev=4.33=hd590300_2
- libexpat=2.5.0=hcb278e6_1
- libffi=3.4.2=h7f98852_5
- libgcc-ng=12.2.0=h65d4601_19
- libgfortran-ng=12.2.0=h69a702a_19
- libgfortran5=12.2.0=h337968e_19
- libglib=2.76.1=ha491796_0
- libgomp=12.2.0=h65d4601_19
- libiconv=1.17=h166bdaf_0
- libjpeg-turbo=
- liblapack=3.9.0=16_linux64_openblas
- libnghttp2=1.52.0=h61bc06f_0
- libnsl=2.0.0=h7f98852_0
- libopenblas=0.3.21=pthreads_h78a6416_3
- libgcc-ng=13.2.0=h807b86a_3
- libgfortran-ng=13.2.0=h69a702a_3
- libgfortran5=13.2.0=ha4646dd_3
- libglib=2.78.3=h783c2da_0
- libgomp=13.2.0=h807b86a_3
- libiconv=1.17=hd590300_2
- libjpeg-turbo=3.0.0=hd590300_1
- liblapack=3.9.0=21_linux64_openblas
- libnghttp2=1.58.0=h47da74e_1
- libnsl=2.0.1=hd590300_0
- libopenblas=0.3.26=pthreads_h413a1c8_0
- libpng=1.6.39=h753d276_0
- libsqlite=3.40.0=h753d276_0
- libssh2=1.10.0=hf14f497_3
- libstdcxx-ng=12.2.0=h46fd767_19
- libtiff=4.5.0=ha587672_6
- libsqlite=3.44.2=h2797004_0
- libssh2=1.11.0=h0841786_0
- libstdcxx-ng=13.2.0=h7e041cc_3
- libtiff=4.6.0=ha9c0a0a_2
- libuuid=2.38.1=h0b41bf4_0
- libwebp-base=1.3.0=h0b41bf4_0
- libxcb=1.13=h7f98852_1004
- libzlib=1.2.13=h166bdaf_4
- matplotlib-base=3.7.1=py311h8597a09_0
- libwebp-base=1.3.2=hd590300_0
- libxcb=1.15=h0b41bf4_0
- libxcrypt=4.4.36=hd590300_1
- libxml2=2.12.4=h232c23b_1
- libxslt=1.1.39=h76b75d6_0
- libzlib=1.2.13=hd590300_5
- lxml=5.1.0=py312h37b5203_0
- matplotlib-base=3.8.2=py312he5832f3_0
- mscorefonts=0.0.1=3
- munkres=1.1.4=pyh9f0ad1d_0
- ncurses=6.3=h27087fc_1
- ncurses=6.4=h59595ed_2
- nspr=4.35=h27087fc_0
- nss=3.89=he45b914_0
- numpy=1.24.2=py311h8e6699e_0
- openjpeg=2.5.0=hfec8fc6_2
- openpyxl=3.1.1=py311h2582759_0
- openssl=3.1.0=h0b41bf4_0
- packaging=23.1=pyhd8ed1ab_0
- pandas=2.0.0=py311h2872171_0
- pcre2=10.40=hc3806b6_0
- pillow=9.5.0=py311h573f0d3_0
- pip=23.1=pyhd8ed1ab_0
- pixman=0.40.0=h36c2ea0_0
- poppler=23.04.0=hf052cbe_1
- nss=3.97=h1d7d5a4_0
- numpy=1.26.3=py312heda63a1_0
- openjpeg=2.5.0=h488ebb8_3
- openpyxl=3.1.2=py312h98912ed_1
- openssl=3.2.0=hd590300_1
- packaging=23.2=pyhd8ed1ab_0
- pandas=2.2.0=py312hfb8ada1_0
- pcre2=10.42=hcad00b1_0
- pillow=10.2.0=py312hf3581a9_0
- pip=23.3.2=pyhd8ed1ab_0
- pixman=0.43.0=h59595ed_0
- poppler=24.01.0=h590f24d_0
- poppler-data=0.4.12=hd8ed1ab_0
- pthread-stubs=0.4=h36c2ea0_1001
- pyparsing=3.0.9=pyhd8ed1ab_0
- python=3.11.3=h2755cc3_0_cpython
- pycairo=1.25.1=py312he48a392_0
- pyparsing=3.1.1=pyhd8ed1ab_0
- pysocks=1.7.1=pyha2e5f31_6
- python=3.12.1=hab00c5b_1_cpython
- python-dateutil=2.8.2=pyhd8ed1ab_0
- python-tzdata=2023.3=pyhd8ed1ab_0
- python_abi=3.11=3_cp311
- pytz=2023.3=pyhd8ed1ab_0
- python-tzdata=2023.4=pyhd8ed1ab_0
- python_abi=3.12=4_cp312
- pytz=2023.3.post1=pyhd8ed1ab_0
- readline=8.2=h8228510_1
- reportlab=3.6.12=py311h2eb0c47_2
- setuptools=67.7.1=pyhd8ed1ab_0
- reportlab=4.0.9=py312h98912ed_0
- requests=2.31.0=pyhd8ed1ab_0
- rlpycairo=0.2.0=pyhd8ed1ab_0
- setuptools=69.0.3=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- tk=8.6.12=h27826a3_0
- tqdm=4.65.0=pyhd8ed1ab_1
- tzdata=2023c=h71feb2d_0
- wheel=0.40.0=pyhd8ed1ab_0
- tk=8.6.13=noxft_h4845f30_101
- tqdm=4.66.1=pyhd8ed1ab_0
- tzdata=2023d=h0c530f3_0
- urllib3=2.1.0=pyhd8ed1ab_0
- wheel=0.42.0=pyhd8ed1ab_0
- xorg-kbproto=1.0.7=h7f98852_1002
- xorg-libice=1.0.10=h7f98852_0
- xorg-libsm=1.2.3=hd9c2040_1000
- xorg-libx11=1.8.4=h0b41bf4_0
- xorg-libxau=1.0.9=h7f98852_0
- xorg-libice=1.1.1=hd590300_0
- xorg-libsm=1.2.4=h7391055_0
- xorg-libx11=1.8.7=h8ee46fc_0
- xorg-libxau=1.0.11=hd590300_0
- xorg-libxdmcp=1.1.3=h7f98852_0
- xorg-libxext=1.3.4=h0b41bf4_2
- xorg-libxrender=0.9.10=h7f98852_1003
- xorg-libxrender=0.9.11=hd590300_0
- xorg-renderproto=0.11.1=h7f98852_1002
- xorg-xextproto=7.3.0=h0b41bf4_1003
- xorg-xproto=7.0.31=h7f98852_1007
- xz=5.2.6=h166bdaf_0
- zlib=1.2.13=h166bdaf_4
- zstd=1.5.2=h3eb15da_6
prefix: /opt/conda/envs/keggcharter
- zlib=1.2.13=hd590300_5
- zstd=1.5.5=hfc55251_0
prefix: /opt/conda/envs/keggcharter
2 changes: 1 addition & 1 deletion workflow/rules/general_report.smk
rule general_report:
rule protein_report:
rule general_report:
expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),
8 changes: 4 additions & 4 deletions workflow/scripts/
@@ -1,5 +1,5 @@
MOSCA's script for producing Protein report
MOSCA's script for producing General report
By João Sequeira
Expand Down Expand Up @@ -57,7 +57,7 @@ def make_general_report(out, exps, sample, mg_preport, mt_preport, mp_preport, d
report = pd.merge(report, spectracounts, on='qseqid', how='left')
mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer')
report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(
report.to_csv(f'{out}/MOSCA_{sample}_General_Report.tsv', sep='\t', index=False)
return report, mg_preport, mt_preport, mp_preport, de_input

Expand All @@ -68,7 +68,7 @@ def make_general_reports(out, exps, max_lines=1000000):
for sample in set(exps['Sample']):
report, mg_report, mt_report, mp_report, de_input = make_general_report(
out, exps, sample, mg_report, mt_report, mp_report, de_input)
timed_message(f'Writing Protein Report for sample: {sample}.')
timed_message(f'Writing General Report for sample: {sample}.')
if len(report) < max_lines:
report.to_excel(writer, sheet_name=sample, index=False)
Expand All @@ -78,7 +78,7 @@ def make_general_reports(out, exps, max_lines=1000000):
report.iloc[i:(i + j)].to_excel(writer, sheet_name=f'{sample} ({k})', index=False)
k += 1
# Write quantification matrices to normalize all together
# Write quantification matrices to normalize all together - these reports have counts from the entire experiment, not just a single "sample"
timed_message('Writing quantification matrices.')
if len(mg_report) > 0:
mg_report[mg_report.columns.tolist()[1:]] = mg_report[mg_report.columns.tolist()[1:]].astype(float)
40 changes: 23 additions & 17 deletions workflow/scripts/
Expand Up @@ -11,7 +11,7 @@
from mosca_tools import perform_alignment, normalize_counts_by_size

def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> None:
def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
Perform quantification of reads with contigs as reference
:param exps: DataFrame with the experiments
Expand All @@ -29,7 +29,7 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -
if ',' in pexps.loc[i]['Files']:
reads = [(f"{output}/Preprocess/Trimmomatic/quality_trimmed_{pexps.loc[i]['Name']}_{fr}_paired.fq")
reads = [f"{output}/Preprocess/Trimmomatic/quality_trimmed_{pexps.loc[i]['Name']}_{fr}_paired.fq"
for fr in ['forward', 'reverse']]
reads = [f"{output}/Preprocess/Trimmomatic/quality_trimmed_{pexps.loc[i]['Name']}.fq"]
Expand All @@ -50,35 +50,41 @@ def quantification_with_assembly(exps: pd.DataFrame, output: str, sample: str) -
mt_result = pd.merge(mt_result, counts, how='outer', on='Gene')
mt_result_norm = pd.merge(mt_result_norm, normalized_counts, how='outer', on='Gene')
if len(mg_result) > 0:
f"{output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
f"{output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if len(mt_result) > 0:
f"{output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)
return mg_result, mg_result_norm, mt_result, mt_result_norm

def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> None:
def quantification_without_assembly(exps: pd.DataFrame, output: str, sample: str) -> tuple:
mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
pexps = exps[(exps['Sample'] == sample)]
for i in pexps.index:

if pexps.loc[i]['Data type'] in ['mrna', 'dna']:
reference = f"{output}/Annotation/{pexps.loc[i]['Sample']}/fgs.ffn"
return mg_result, mg_result_norm, mt_result, mt_result_norm

def run():
exps = pd.read_csv(snakemake.params.exps, sep='\t')

for sample in set(exps['Sample']):
if snakemake.params.did_assembly:
quantification_with_assembly(exps, snakemake.params.output, sample)
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_with_assembly(
exps, snakemake.params.output, sample)
quantification_without_assembly(exps, snakemake.params.output, sample)
mg_result, mg_result_norm, mt_result, mt_result_norm = quantification_without_assembly(
exps, snakemake.params.output, sample)
if len(mg_result) > 0:
f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if len(mt_result) > 0:
f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)

if __name__ == '__main__':
Expand Down

