Use pretrained OPUS-MT models as teachers and backward models (and other changes) #117

Merged on Sep 12, 2023 (43 commits)
Commits
c47022c
Integrated Tatoeba-Challenge models as part of the firefox-translatio…
TommiNieminen Mar 8, 2023
583e56f
reduced workspace, since Marian crashes training with larger workspac…
TommiNieminen Mar 8, 2023
e98ab2b
Update README.md
TommiNieminen Mar 8, 2023
2c9f920
Update config.opusmt.yml
TommiNieminen Mar 8, 2023
0254fe2
added target language token addition for multilingual models
TommiNieminen Mar 16, 2023
e339bdc
new test config for multilingual models
TommiNieminen Mar 16, 2023
4e6d3bc
fixed data language pair reverse with tatoeba data
TommiNieminen Mar 17, 2023
adae541
added config parameter for pretrained teacher model (only pretrained …
TommiNieminen Mar 20, 2023
4e9fe65
Update flores.sh
TommiNieminen Mar 21, 2023
9773fbc
Working on using multiple teacher models, not ready for action yet
TommiNieminen Mar 30, 2023
663a463
added profiles for csc mahti
Mar 30, 2023
f5572d8
Update README.md
TommiNieminen Mar 30, 2023
8656d40
multiteach additions
TommiNieminen Mar 31, 2023
7ee2a58
more multiteacher changes
Mar 31, 2023
3b30bcc
multiple teachers added, monolingual src fixed
Apr 12, 2023
241480e
fixed vocabs with multiteacher, other minor fixes
TommiNieminen Apr 19, 2023
a2f7b99
Merge branch 'develop'
TommiNieminen Apr 19, 2023
fa7ded0
fixed dummy mono src rules
Apr 26, 2023
5efc025
fixed model indices if no opus mt teachers
Apr 26, 2023
5bc051d
added file for preinstalling snakemake envs (for easier containerizat…
TommiNieminen Apr 28, 2023
2d2bf76
added profiles for lumi, support for amd gpus, fixing the broken non-…
May 17, 2023
8b931dc
both train from scratch and opus-mt teacher should work now
May 19, 2023
2278b02
added separate compile script for browsermt marian
May 22, 2023
fa707b9
new marian-dev submodule version (old one did not work with fp16 and …
TommiNieminen May 23, 2023
43aa954
updated lumi profiles with automatic paths and energy monitoring
May 24, 2023
0644c40
fixing bicleaner-ai (model repository link changed), some more energy…
TommiNieminen May 26, 2023
5350968
updated bicleaner-ai env, the old one did not work for some reason
TommiNieminen May 29, 2023
b03d9a4
added langid file to bicleaner-ai env also
TommiNieminen May 30, 2023
0219f05
Update bicleaner-ai.yml
TommiNieminen May 31, 2023
11b69c6
lumi slurm fixes and bicleaner-ai bug fixing
Jun 1, 2023
888e1e1
Update README.md
TommiNieminen Jun 1, 2023
ad68e16
merged develop to main
TommiNieminen Jun 1, 2023
75a75a5
Update README.md
TommiNieminen Jun 1, 2023
a6023b0
Merge remote-tracking branch 'upstream/main' into main
TommiNieminen Jun 1, 2023
59e0f92
updated mtdata in base env
TommiNieminen Jun 28, 2023
c820c48
updated container to match envs
Aug 16, 2023
668bfaf
added env variables required by new clean mono
TommiNieminen Aug 16, 2023
4561c64
Merge branch 'main' of github.com:GreenNLP/firefox-translations-training
TommiNieminen Aug 16, 2023
8ab99b2
added separate bicleaner-ai env for lumi
TommiNieminen Aug 16, 2023
aed8c17
added lumi bicleaner env
TommiNieminen Aug 16, 2023
2c2a07d
added tensorflow to bicleaner-ai env
TommiNieminen Aug 17, 2023
fdc8b35
fixed bicleaner-ai script bug and added a missing argument for train_spm
TommiNieminen Aug 17, 2023
c4f7c1a
singularity fixes: kenlm installation, added hunspell dict download, …
TommiNieminen Aug 22, 2023
3 changes: 3 additions & 0 deletions .gitmodules
@@ -16,3 +16,6 @@
[submodule "3rd_party/preprocess"]
path = 3rd_party/preprocess
url = https://github.com/kpu/preprocess.git
[submodule "3rd_party/lumi-marian"]
path = 3rd_party/lumi-marian
url = git@github.com:hplt-project/lumi-marian.git
1 change: 1 addition & 0 deletions 3rd_party/lumi-marian
Submodule lumi-marian added at 320dd3
2 changes: 1 addition & 1 deletion 3rd_party/marian-dev
Submodule marian-dev updated 150 files
156 changes: 156 additions & 0 deletions Ftt.def
@@ -0,0 +1,156 @@
Bootstrap: docker
From: condaforge/mambaforge:22.11.1-4
Stage: spython-base

%files
pipeline/setup/install-deps.sh install-deps.sh
envs/base.yml /conda-envs/c7aefb385f47824bd83e494ba6175afb/environment.yaml
envs/bicleaner-ai-lumi.yml /conda-envs/6dc32b6f0731acf817ada219622a98b8/environment.yaml
envs/bicleaner-ai.yml /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/environment.yaml
envs/bicleaner.yml /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/environment.yaml
envs/corpus.yml /conda-envs/2e8e4401e9abbca04941f823d00fe74a/environment.yaml
envs/tensorboard.yml /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221/environment.yaml
%labels
io.github.snakemake.containerized="true"
io.github.snakemake.conda_env_hash="41592307ee99833c1ad2068c1e915ff9c38acc418b5bebfe7e107d9a79980cb4"
%post

# Remove this if not in Finland, or change to closer mirror
cat /etc/apt/sources.list | sed "s/archive.ubuntu.com/mirrors.nic.funet.fi/g" > temp && mv temp /etc/apt/sources.list

apt-get update && apt-get -y install gcc g++ curl

export DEBIAN_FRONTEND=noninteractive

bash install-deps.sh

# Step 1: Retrieve conda environments

# Conda environment:
# source: envs/base.yml
# prefix: /conda-envs/c7aefb385f47824bd83e494ba6175afb
# name: bergamot-training
# channels:
# - conda-forge
# - defaults
# dependencies:
# - python=3.9
# - cmake=3.21.1
# - pip=21.2.2
# - pip:
# - sacrebleu==2.0.0
# - mtdata==0.4.0
# - fasttext==0.9.2
# - regex==2019.8.19
# - sacremoses==0.0.43
mkdir -p /conda-envs/c7aefb385f47824bd83e494ba6175afb

# Conda environment:
# source: envs/bicleaner-ai-lumi.yml
# prefix: /conda-envs/6dc32b6f0731acf817ada219622a98b8
# name: bicleaner-ai
# channels:
# - conda-forge
# - defaults
# dependencies:
# - python=3.9
# - pip==21.2.2
# - cmake=3.21.1
# - pip:
# - bicleaner-ai==2.2.1
# - tensorflow-rocm==2.10.0.520
mkdir -p /conda-envs/6dc32b6f0731acf817ada219622a98b8

# Conda environment:
# source: envs/bicleaner-ai.yml
# prefix: /conda-envs/04b8248cb528961ad452c73dd0a7c8b6
# name: bicleaner-ai
# channels:
# - conda-forge
# - defaults
# dependencies:
# - python=3.9
# - pip==21.2.2
# - cmake=3.21.1
# - pip:
# - bicleaner-ai==2.2.1
# - tensorflow==2.6.5
mkdir -p /conda-envs/04b8248cb528961ad452c73dd0a7c8b6

# Conda environment:
# source: envs/bicleaner.yml
# prefix: /conda-envs/a4f700aa6ff0256dcd9321b536a081ac
# name: bicleaner
# channels:
# - conda-forge
# - bitextor
# - defaults
# dependencies:
# - python=3.8
# - pip==23.0
# - cmake=3.21.1
# - hunspell==1.7.0
# - pip:
# - pypi-kenlm
# - bicleaner==0.16.1
mkdir -p /conda-envs/a4f700aa6ff0256dcd9321b536a081ac

# Conda environment:
# source: envs/corpus.yml
# prefix: /conda-envs/2e8e4401e9abbca04941f823d00fe74a
# name: corpus
# channels:
# - conda-forge
# - defaults
# dependencies:
# - python=3.9
# - pip=21.2.2
# - pip:
# - sacrebleu==2.0.0
# - mtdata==0.3.2
# - requests==2.26.0
mkdir -p /conda-envs/2e8e4401e9abbca04941f823d00fe74a

# Conda environment:
# source: envs/tensorboard.yml
# prefix: /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221
# name: tensorboard
# channels:
# - conda-forge
# - defaults
# dependencies:
# - python=3.9
# - cmake=3.21.1
# - pip=21.2.2
# - pip:
# - tensorboard==2.5.0
# - tensorboardX==2.2
# - click==8.0.1
# - toolz==0.11.1
mkdir -p /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221

# Step 2: Generate conda environments

mamba env create --prefix /conda-envs/c7aefb385f47824bd83e494ba6175afb --file /conda-envs/c7aefb385f47824bd83e494ba6175afb/environment.yaml && \
mamba env create --prefix /conda-envs/6dc32b6f0731acf817ada219622a98b8 --file /conda-envs/6dc32b6f0731acf817ada219622a98b8/environment.yaml && \
mamba env create --prefix /conda-envs/04b8248cb528961ad452c73dd0a7c8b6 --file /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/environment.yaml && \
mamba env create --prefix /conda-envs/a4f700aa6ff0256dcd9321b536a081ac --file /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/environment.yaml && \
mamba env create --prefix /conda-envs/2e8e4401e9abbca04941f823d00fe74a --file /conda-envs/2e8e4401e9abbca04941f823d00fe74a/environment.yaml && \
mamba env create --prefix /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221 --file /conda-envs/fadf1aec392d8a065ae29b9fcf9b3221/environment.yaml && \
mamba clean --all -y

#Bicleaner needs the fasttext language id model installed
wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
cp lid.176.bin /conda-envs/6dc32b6f0731acf817ada219622a98b8/lib/python3.9/site-packages/fastspell/lid.176.bin
cp lid.176.bin /conda-envs/a4f700aa6ff0256dcd9321b536a081ac/lib/python3.8/site-packages/fastspell/lid.176.bin
cp lid.176.bin /conda-envs/04b8248cb528961ad452c73dd0a7c8b6/lib/python3.9/site-packages/fastspell/lid.176.bin

#Fastspell (used in bicleaner) uses hunspell to disambiguate between similar languages, install all hunspell dictionaries for that
wget -O fastspell_dictionaries.tgz https://github.com/mbanon/fastspell/releases/download/dictionaries_v1/fastspell_dictionaries.tgz
mkdir -p /usr/share/hunspell
tar -xf fastspell_dictionaries.tgz --directory /usr/share/hunspell

%runscript
exec /bin/bash "$@"
%startscript
exec /bin/bash "$@"
14 changes: 14 additions & 0 deletions InstallSnakemakeEnvs
@@ -0,0 +1,14 @@
import os

def get_envs(wildcards):
    # One ".done" target per conda environment file found in envs/
    return [x.replace(".yml", ".done") for x in os.listdir("envs") if x.endswith(".yml")]

container: 'Ftt.sif'

rule all:
    input: get_envs

rule make_envs:
    conda: 'envs/{env}.yml'
    output: '{env}.done'
    shell: 'touch {output}'
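A plausible one-off invocation of this helper Snakefile to pre-build every conda environment ahead of time (a sketch; the exact flags depend on the profile and container setup you use):

```bash
# Pre-build all conda environments declared under envs/ (sketch)
snakemake --snakefile InstallSnakemakeEnvs \
          --use-conda --use-singularity \
          --cores 1
```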

47 changes: 42 additions & 5 deletions Makefile
@@ -4,12 +4,13 @@
SHELL=/bin/bash

### 1. change these settings or override with env variables
CONFIG=configs/config.prod.yml
CONFIG=configs/config.opusmt-multimodel-test.yml
> Collaborator review comment: let's keep the default values here as before

CONDA_PATH=../mambaforge
SNAKEMAKE_OUTPUT_CACHE=../cache
PROFILE=local
#PROFILE=local
> Collaborator review comment: let's keep the default values here as before

# execution rule or path to rule output, default is all
TARGET=
EXTRA=
REPORTS=../reports
# for tensorboard
MODELS=../models
@@ -30,9 +31,24 @@ conda:

snakemake:
$(CONDA_ACTIVATE) base
mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.12.2 tabulate==0.8.10 --yes
mamba create -c conda-forge -c bioconda -n snakemake snakemake==7.19.1 tabulate==0.8.10 --yes
mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)"


containerize:
$(CONDA_ACTIVATE) snakemake
$(SNAKEMAKE) \
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
--containerize > Dockerfile
spython recipe Dockerfile Ftt.def
sed -i "s|%files|%files\npipeline/setup/install-deps.sh install-deps.sh|" Ftt.def
sed -i 's#%post#%post\ncat /etc/apt/sources.list | sed "s/archive.ubuntu.com/mirrors.nic.funet.fi/g" > temp \&\& mv temp /etc/apt/sources.list \
\napt-get update \&\& apt-get -y install gcc g++ \
\nexport DEBIAN_FRONTEND=noninteractive \
\nbash install-deps.sh#' Ftt.def
apptainer build Ftt.sif Ftt.def

# build container image for cluster and run-local modes (preferred)
build:
sudo singularity build Singularity.sif Singularity.def
@@ -53,7 +69,18 @@ dry-run:
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
-n \
$(TARGET)
$(TARGET) \
$(EXTRA) \

dry-run-hpc:
echo "Dry run with config $(CONFIG) and profile $(PROFILE)"
$(SNAKEMAKE) \
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
-n \
--conda-base-path=../bin \
$(TARGET) \
$(EXTRA)

test-dry-run: CONFIG=configs/config.test.yml
test-dry-run: dry-run
@@ -67,8 +94,18 @@ run:
$(SNAKEMAKE) \
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
$(TARGET)
$(TARGET) \
$(EXTRA)

run-hpc:
echo "Running with config $(CONFIG) and profile $(PROFILE)"
chmod +x profiles/$(PROFILE)/*
$(SNAKEMAKE) \
--profile=profiles/$(PROFILE) \
--configfile $(CONFIG) \
--conda-base-path=../bin \
$(TARGET) \
$(EXTRA)
test: CONFIG=configs/config.test.yml
test: run

61 changes: 61 additions & 0 deletions README.md
@@ -1,3 +1,64 @@
# OPUS-MT integration
> Collaborator review comment: Please move an extra docs section related to the OPUS integration to another place. Ideally, we can create a docs folder and keep it there with the link from the main readme. It's already quite long.


This fork makes it possible to use OPUS-MT models as teacher and backward models in the _firefox-translations-training_ pipeline (FTT). Other additions are profiles for running jobs on CSC supercomputers (*puhti*, *lumi* and *mahti*) and code for monitoring the power usage of jobs.

# Workflow changes
- Added download rule for Tatoeba-Challenge data.
- Added download rule for OPUS-MT models (tested with Tatoeba-Challenge models; old models might need some changes).
- Added config parameters for specifying OPUS-MT models as teacher and/or backward model.
- Added subword segmentation and desegmentation rules.

# Subword segmentation issues
The biggest incompatibility between OPUS-MT models and FTT is in subword segmentation: by default, FTT trains models that use Marian's built-in sentencepiece support, while OPUS-MT models expect the data to be pre-segmented. To make it possible to use both the default FTT training and pre-built OPUS-MT models, segmentation and desegmentation steps have been added around the Marian-specific rules. This causes some clutter, but it's probably the best solution (instead of e.g. doing the segmentation/desegmentation inside the Marian scripts), since it also makes it easy to implement other subword segmentation methods in the workflow.
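As a rough sketch of what these pre-/post-processing steps do around the Marian-specific rules (using the standalone sentencepiece command-line tools; the model and file names here are only placeholders, and the actual pipeline rules differ in detail):

```bash
# Segment raw text with the OPUS-MT source sentencepiece model before Marian sees it
spm_encode --model=source.spm < corpus.src.txt > corpus.src.sp

# ... Marian training / decoding runs on the segmented text ...

# Desegment Marian output back into plain text
spm_decode --model=target.spm < output.trg.sp > output.trg.txt
```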


# Snakemake and conda on HPC
FTT is based on Snakemake, which has many benefits in terms of reproducibility and existing support. Among other things, Snakemake supports HPC environments and SLURM out of the box, which should make it ideal for CSC machines. However, Snakemake also makes heavy use of conda, which has been deprecated on CSC machines due to its unsuitability for HPC file systems (https://docs.csc.fi/computing/usage-policy/#conda-installations), and FTT specifically relies on several conda environments. Fortunately, Snakemake has functionality for containerizing conda environments, so all the conda environments needed by FTT can be provided in an Apptainer container (Ftt.sif).
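The `containerize` target added to the Makefile in this PR automates the container build; stripped down (and omitting the Finnish-mirror tweaks to the generated definition), the sequence looks roughly like this, with the profile and config names only as examples:

```bash
# Let Snakemake emit a Dockerfile bundling all conda envs, convert it to an
# Apptainer definition, and build the container image
snakemake --profile=profiles/local --configfile configs/config.opusmt.yml --containerize > Dockerfile
spython recipe Dockerfile Ftt.def
apptainer build Ftt.sif Ftt.def
```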

Containerization does not entirely solve the conda problem, since the Snakemake program itself requires conda to run. CSC provides snakemake modules, but these are container-based, and since containers cannot be nested on CSC machines, containerized conda environments cannot be used with the CSC snakemake modules. This can be worked around by installing Snakemake with pip (this is discouraged in the Snakemake documentation, but I have seen no problems so far).

# Non-containerized software
FTT uses software that is not included in the containerized conda environments, including several marian installations and other NLP tools. These are automatically built as part of the pipeline. The Ftt.sif container includes the prerequisites for the software components. It's also possible to provide paths to separately built software installations.
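For reference, a sketch of building Marian manually from the bundled submodule, in case you want to point the pipeline at a separately built installation (the CMake options shown are common marian-dev options; the pipeline's own build steps may use different ones):

```bash
# Build Marian from the 3rd_party submodule (the pipeline normally does this itself)
cd 3rd_party/marian-dev
cmake -B build -DCOMPILE_CUDA=on -DUSE_SENTENCEPIECE=on
cmake --build build -j"$(nproc)"
```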

# Getting started on CSC's puhti and mahti
1. Clone the repository.
2. Download the Ftt.sif container to the repository root.
3. Create a virtual Python environment for Snakemake (e.g. in the parent dir of the repository):
1. The environment needs to be created with a non-containerized python, as otherwise Apptainer integration will not work. On puhti and mahti, the python executables in /usr/bin/ should work: `/usr/bin/python3.9 -m venv snakemake_env`.
2. Activate the virtual environment: `source ./snakemake_env/bin/activate`.
3. Install snakemake: `pip install snakemake`.
4. Install micromamba (e.g. in the parent dir of the repository): `curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba`
5. Return to the repository directory and update Git submodules: `make git-modules`
6. Create a _data_ directory (e.g. in the parent dir of the repository) and create a _tmp_ dir in it.
7. If the data directory is not located in the parent directory of the repository, edit _profiles/slurm-puhti/config.yaml_ or _profiles/slurm-mahti/config.yaml_ and change the bindings in the singularity-args section to point to your data directory, and also enter the _data_ directory path as the _root_ value of the _config_ section.
8. Edit profiles/slurm-puhti/config.cluster.yaml to change the CSC account to one you have access to.
9. Load CUDA modules: `module load gcc/9.4.0 cuda cudnn`
10. Run pipeline: `make run-hpc PROFILE="slurm-puhti"` or `make run PROFILE="slurm-mahti"`
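Put together, the setup and launch steps above look roughly like this shell session (run from the parent directory of the repository; the clone directory name is an assumption, and paths and accounts should be adapted to your setup):

```bash
# Snakemake virtual environment (step 3) and micromamba (step 4)
/usr/bin/python3.9 -m venv snakemake_env
source ./snakemake_env/bin/activate
pip install snakemake
curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba

# Submodules (step 5), CUDA modules (step 9) and pipeline launch (step 10)
cd firefox-translations-training   # directory name is an assumption
make git-modules
module load gcc/9.4.0 cuda cudnn
make run-hpc PROFILE="slurm-puhti"
```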

# Getting started on CSC's lumi
1. Clone the repository.
2. Download the Ftt.sif container to the repository root.
3. Create a virtual Python environment for Snakemake (e.g. in the parent dir of the repository):
1. The environment needs to be created with a non-containerized python, as otherwise Apptainer integration will not work. On lumi, use the _cray-python_ module (it is not containerized): `module load cray-python; python -m venv snakemake_env`.
2. Activate the virtual environment: `source ./snakemake_env/bin/activate`.
3. Install snakemake: `pip install snakemake`.
4. Install micromamba (e.g. in the parent dir of the repository): `curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba`
5. Return to the repository directory and update Git submodules: `make git-modules`
6. Create a _data_ directory (e.g. in the parent dir of the repository) and create a _tmp_ dir in it.
7. If the data directory is not located in the parent directory of the repository, edit profiles/slurm-lumi/config.yaml and change the bindings in the singularity-args section to point to your data directory, and also enter the _data_ directory path as the _root_ value of the _config_ section.
8. Edit profiles/slurm-lumi/config.cluster.yaml to change the CSC account to one you have access to.
9. Load the ROCm module: `module load rocm`
10. Copy the marian executables to _3rd_party/lumi-marian/build_ (compiling lumi-marian is currently hacky, so this workaround makes things easier).
11. Run `export SINGULARITYENV_LD_LIBRARY_PATH=$LD_LIBRARY_PATH` to make sure Marian can find all the libraries when it runs containerized.
12. Run pipeline: `make run-hpc PROFILE="slurm-lumi"`
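The lumi-specific tail of these steps in shell form (assuming, as above, that the lumi profile directory is _profiles/slurm-lumi_):

```bash
# Load ROCm, let the containerized Marian find the host libraries, and launch
module load rocm
export SINGULARITYENV_LD_LIBRARY_PATH=$LD_LIBRARY_PATH
make run-hpc PROFILE="slurm-lumi"
```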

# Testing
Since running the whole pipeline for a high-resource language pair will take a long time, there is a test config available for checking that everything works as it should. The test config is used by default; you can switch to the full config by modifying the Makefile and changing config.opusmt-test.yml to config.opusmt.yml. You can also provide the config on the command line as the CONFIG parameter with make. Note that even the test config will take a long time if the training corpus is large (since translating the training data takes time), so for a quick functionality check, pick a language pair with as little data as possible in Tatoeba-Challenge (while still having trained forward and backward models). The default epo-afr is good for quick checking (although note that the bicleaner step will be skipped, as there are no bicleaner packs for those languages).
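For example, a quick functionality check with the test configuration might be launched like this (the profile is whichever cluster profile applies to your machine):

```bash
# Run the small default test config end to end
make run-hpc CONFIG=configs/config.opusmt-test.yml PROFILE="slurm-puhti"
```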

You can test the pipeline without running it by using `make dry-run`. If you want to build a specific file or rule, you can use the `TARGET` parameter with make.
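For instance (the target path is purely illustrative; substitute a real rule output from your own run):

```bash
# Dry-run the whole pipeline, or only the steps needed for one output
make dry-run
make dry-run TARGET=<path/to/desired/output>
```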

# Original FTT instructions start from here. NOTE: some of the information below no longer applies.

# Firefox Translations training
Training pipelines for Firefox Translations machine translation models.
The trained models are hosted in [firefox-translations-models](https://github.com/mozilla/firefox-translations-models/),