refactor: Enzeptional update (#234)

* refactor: Cleaned Enzeptional toolbox Signed-off-by: nanayves <yves.g.nana@gmail.com> * ci: try to tie cuda version to scatter * wip: space instead of comma * wip: cuda separate * wip: pin sentence-transformers * ci: reduce disk space --------- Signed-off-by: nanayves <yves.g.nana@gmail.com> Co-authored-by: jannisborn <jab@zurich.ibm.com>
GT4SD · Mar 7, 2024 · 789cbcc · 789cbcc
1 parent 94f0091
commit 789cbcc
Show file tree

Hide file tree

Showing 14 changed files with 1,491 additions and 616 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -35,10 +35,12 @@ jobs:
         run: |
           conda activate gt4sd
           python -m black src/gt4sd --check --diff --color
-      # - name: Check isort
-      #   run: |
-      #     conda activate gt4sd
-      #     python -m isort src/gt4sd --check-only
+      - name: Remove unnecessary files (see https://stackoverflow.com/questions/75536771/github-runner-out-of-disk-space-after-building-docker-image)
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - name: Check flake8
         run: |
           conda activate gt4sd

diff --git a/conda.yml b/conda.yml
@@ -9,7 +9,7 @@ dependencies:
   - pip>=19.1,<20.3
   - pytorch>=1.0,<=1.12.1
   - cpuonly
-  - pytorch-scatter<=2.0.9
+  - pytorch-scatter<=2.0.9=*cu102*
   - pip:
       - -r requirements.txt
       - -r cpu_requirements.txt

diff --git a/conda_gpu.yml b/conda_gpu.yml
@@ -8,7 +8,7 @@ dependencies:
   - python>=3.7,<3.9
   - pip>=19.1,<20.3
   - pytorch>=1.0,<=1.12.1=*cu*
-  - pytorch-scatter<=2.0.9
+  - pytorch-scatter<=2.0.9=*cu102*
   - torchvision<=0.13.1=*cu*
   - torchaudio<=0.12.1=*cu*
   - pip:

diff --git a/examples/enzeptional/README.md b/examples/enzeptional/README.md
@@ -0,0 +1,30 @@
+# Enzyme Optimization in Biocatalytic Reactions
+
+This repository provides an example of how to run the framework for the optimization of enzymes within the context of biocatalytic reactions.
+
+## Prerequisites
+
+Before initiating the enzyme optimization process, execute the following command in your terminal to activate the environment:
+
+```console
+conda activate gt4sd
+```
+
+## Running the example
+
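+To run the example, simply type the following from the `examples/enzeptional` directory (the script reads `data.csv` from the current working directory):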
+
+```console
+python example_enzeptional.py
+```
+
+## Citation
+
+```bibtex
+@inproceedings{teukam2023enzyme,
+  title={Enzyme optimization via a generative language modeling-based evolutionary algorithm},
+  author={Teukam, Yves Gaetan Nana and Grisoni, Francesca and Manica, Matteo and Zipoli, Federico and Laino, Teodoro},
+  booktitle={American Chemical Society (ACS) Spring Meeting},
+  year={2023}
+}
+```
diff --git a/examples/enzeptional/data.csv b/examples/enzeptional/data.csv
diff --git a/examples/enzeptional/example_enzeptional.py b/examples/enzeptional/example_enzeptional.py
@@ -0,0 +1,84 @@
+import logging
+import pandas as pd
+from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility
+from gt4sd.frameworks.enzeptional.core import SequenceMutator, EnzymeOptimizer
+from gt4sd.configuration import GT4SDConfiguration, sync_algorithm_with_s3
+
+
+def initialize_environment():
+    """Synchronize with GT4SD S3 storage and set up the environment."""
+    # NOTE: For those interested in optimizing kcat values, it is important to adjust the scorer path to reflect this focus, thereby selecting the appropriate model for kcat optimization: f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/kcat/model.pkl". The specification of the scaler, located within the same directory as the `scorer.pkl`, is mandatory for accurate model performance.
+    configuration = GT4SDConfiguration.get_instance()
+    sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties")
+    return f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/feasibility/model.pkl"
+
+
+def load_experiment_parameters():
+    """Load experiment parameters from a CSV file."""
+    df = pd.read_csv("data.csv").iloc[1]
+    return df["substrates"], df["products"], df["sequences"], eval(df["intervals"])
+
+
+def setup_optimizer(
+    substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path
+):
+    """Set up and return the optimizer with all necessary components configured."""
+    model_tokenizer_paths = "facebook/esm2_t33_650M_UR50D"
+    chem_paths = "seyonec/ChemBERTa-zinc-base-v1"
+
+    protein_model = HFandTAPEModelUtility(
+        embedding_model_path=model_tokenizer_paths, tokenizer_path=model_tokenizer_paths
+    )
+    mutation_config = {
+        "type": "language-modeling",
+        "embedding_model_path": model_tokenizer_paths,
+        "tokenizer_path": model_tokenizer_paths,
+        "unmasking_model_path": model_tokenizer_paths,
+    }
+
+    mutator = SequenceMutator(sequence=sample_sequence, mutation_config=mutation_config)
+    optimizer_config = {
+        "sequence": sample_sequence,
+        "protein_model": protein_model,
+        "substrate_smiles": substrate_smiles,
+        "product_smiles": product_smiles,
+        "chem_model_path": chem_paths,
+        "chem_tokenizer_path": chem_paths,
+        "scorer_filepath": scorer_path,
+        "mutator": mutator,
+        "intervals": intervals,
+        "batch_size": 5,
+        "top_k": 3,
+        "selection_ratio": 0.25,
+        "perform_crossover": True,
+        "crossover_type": "single_point",
+        "concat_order": ["substrate", "sequence", "product"],
+    }
+    return EnzymeOptimizer(**optimizer_config)
+
+
+def optimize_sequences(optimizer):
+    """Optimize sequences using the configured optimizer."""
+    return optimizer.optimize(
+        num_iterations=3, num_sequences=5, num_mutations=5, time_budget=3600
+    )
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+    scorer_path = initialize_environment()
+    (
+        substrate_smiles,
+        product_smiles,
+        sample_sequence,
+        intervals,
+    ) = load_experiment_parameters()
+    optimizer = setup_optimizer(
+        substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path
+    )
+    optimized_sequences, iteration_info = optimize_sequences(optimizer)
+    logging.info("Optimization completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -32,6 +32,7 @@ scikit-learn>=1.0.0,<1.3.0
 scikit-optimize>=0.8.1
 scipy>=1.0.0
 sentencepiece>=0.1.95
+sentence_transformers>1.0,<=2.2.2
 sympy>=1.10.1
 tables>=3.7.0
 tape-proteins>=0.4

diff --git a/src/gt4sd/frameworks/enzeptional/__init__.py b/src/gt4sd/frameworks/enzeptional/__init__.py
@@ -1,7 +1,7 @@
 #
 # MIT License
 #
-# Copyright (c) 2022 GT4SD team
+# Copyright (c) 2024 GT4SD team
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -26,4 +26,4 @@
 Module for enzyme optimization.
 """
 
-from .optimization import EnzymeOptimizer  # noqa: F401
+from .core import EnzymeOptimizer  # noqa: F401