Skip to content

Commit

Permalink
refactor: Enzeptional update (#234)
Browse files Browse the repository at this point in the history
* refactor: Cleaned Enzeptional toolbox

Signed-off-by: nanayves <yves.g.nana@gmail.com>

* ci: try to tie cuda version to scatter

* wip: space instead of comma

* wip: cuda separate

* wip: pin sentence-transformers

* ci: reduce disk space

---------

Signed-off-by: nanayves <yves.g.nana@gmail.com>
Co-authored-by: jannisborn <jab@zurich.ibm.com>
  • Loading branch information
yvesnana and jannisborn authored Mar 7, 2024
1 parent 94f0091 commit 789cbcc
Show file tree
Hide file tree
Showing 14 changed files with 1,491 additions and 616 deletions.
10 changes: 6 additions & 4 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ jobs:
run: |
conda activate gt4sd
python -m black src/gt4sd --check --diff --color
# - name: Check isort
# run: |
# conda activate gt4sd
# python -m isort src/gt4sd --check-only
- name: Remove unnecessary files (see https://stackoverflow.com/questions/75536771/github-runner-out-of-disk-space-after-building-docker-image)
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Check flake8
run: |
conda activate gt4sd
Expand Down
2 changes: 1 addition & 1 deletion conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dependencies:
- pip>=19.1,<20.3
- pytorch>=1.0,<=1.12.1
- cpuonly
- pytorch-scatter<=2.0.9
- pytorch-scatter<=2.0.9=*cu102*
- pip:
- -r requirements.txt
- -r cpu_requirements.txt
Expand Down
2 changes: 1 addition & 1 deletion conda_gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ dependencies:
- python>=3.7,<3.9
- pip>=19.1,<20.3
- pytorch>=1.0,<=1.12.1=*cu*
- pytorch-scatter<=2.0.9
- pytorch-scatter<=2.0.9=*cu102*
- torchvision<=0.13.1=*cu*
- torchaudio<=0.12.1=*cu*
- pip:
Expand Down
30 changes: 30 additions & 0 deletions examples/enzeptional/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Enzyme Optimization in Biocatalytic Reactions

This repository provides an example of how to run the framework for the optimization of enzymes within the context of biocatalytic reactions.

## Prerequisites

Before initiating the enzyme optimization process, execute the following command in your terminal to activate the environment:

```console
conda activate gt4sd
```

## Running the example

To run the example simply type:

```console
python example_enzeptional.py
```

## Citation

```bibtex
@inproceedings{teukam2023enzyme,
title={Enzyme optimization via a generative language modeling-based evolutionary algorithm},
author={Teukam, Yves Gaetan Nana and Grisoni, Francesca and Manica, Matteo and Zipoli, Federico and Laino, Teodoro},
booktitle={American Chemical Society (ACS) Spring Meeting},
year={2023}
}
```
106 changes: 106 additions & 0 deletions examples/enzeptional/data.csv

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions examples/enzeptional/example_enzeptional.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import ast
import logging

import pandas as pd

from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility
from gt4sd.frameworks.enzeptional.core import SequenceMutator, EnzymeOptimizer
from gt4sd.configuration import GT4SDConfiguration, sync_algorithm_with_s3


def initialize_environment():
    """Sync the Enzeptional scorers from GT4SD S3 storage and locate the scorer.

    Returns:
        The path of the feasibility scorer (``model.pkl``) inside the local
        GT4SD cache directory.
    """
    # NOTE: To optimize kcat values instead, point the scorer path at the kcat
    # model so that the appropriate model is selected:
    # f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/kcat/model.pkl".
    # Specifying the scaler, located in the same directory as the scorer, is
    # mandatory for accurate model performance.
    configuration = GT4SDConfiguration.get_instance()
    sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties")
    feasibility_scorer_path = f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/feasibility/model.pkl"
    return feasibility_scorer_path


def load_experiment_parameters(filepath="data.csv", row_index=1):
    """Load the parameters of a single experiment from a CSV file.

    Args:
        filepath: path of the CSV file to read. It must provide the columns
            ``substrates``, ``products``, ``sequences`` and ``intervals``.
            Defaults to ``"data.csv"`` (resolved against the working directory).
        row_index: zero-based position of the data row to load. Defaults to 1,
            i.e. the second data row of the file.

    Returns:
        A tuple ``(substrates, products, sequences, intervals)``. The first
        three items are the raw cell values; ``intervals`` is the Python
        literal parsed from the text of the ``intervals`` cell.

    Raises:
        IndexError: if ``row_index`` is outside the rows of the file.
        ValueError: if the ``intervals`` cell is not a plain Python literal.
        SyntaxError: if the ``intervals`` cell is not valid Python syntax.
    """
    row = pd.read_csv(filepath).iloc[row_index]
    # The cell content comes from a data file, so it is parsed with
    # ast.literal_eval (literals only) rather than eval, which would execute
    # arbitrary expressions embedded in the CSV.
    intervals = ast.literal_eval(row["intervals"])
    return row["substrates"], row["products"], row["sequences"], intervals


def setup_optimizer(
    substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path
):
    """Assemble the protein model, the mutator and the enzyme optimizer.

    Args:
        substrate_smiles: forwarded to the optimizer as ``substrate_smiles``.
        product_smiles: forwarded to the optimizer as ``product_smiles``.
        sample_sequence: sequence handed to both the mutator and the optimizer.
        intervals: forwarded to the optimizer as ``intervals``.
        scorer_path: forwarded to the optimizer as ``scorer_filepath``.

    Returns:
        The configured ``EnzymeOptimizer`` instance.
    """
    # The same checkpoint identifier is used for the protein embedding model,
    # its tokenizer and the unmasking model; likewise one identifier serves as
    # both chemistry model and chemistry tokenizer.
    protein_checkpoint = "facebook/esm2_t33_650M_UR50D"
    chemistry_checkpoint = "seyonec/ChemBERTa-zinc-base-v1"

    protein_model = HFandTAPEModelUtility(
        embedding_model_path=protein_checkpoint, tokenizer_path=protein_checkpoint
    )
    mutator = SequenceMutator(
        sequence=sample_sequence,
        mutation_config={
            "type": "language-modeling",
            "embedding_model_path": protein_checkpoint,
            "tokenizer_path": protein_checkpoint,
            "unmasking_model_path": protein_checkpoint,
        },
    )
    return EnzymeOptimizer(
        sequence=sample_sequence,
        protein_model=protein_model,
        substrate_smiles=substrate_smiles,
        product_smiles=product_smiles,
        chem_model_path=chemistry_checkpoint,
        chem_tokenizer_path=chemistry_checkpoint,
        scorer_filepath=scorer_path,
        mutator=mutator,
        intervals=intervals,
        batch_size=5,
        top_k=3,
        selection_ratio=0.25,
        perform_crossover=True,
        crossover_type="single_point",
        concat_order=["substrate", "sequence", "product"],
    )


def optimize_sequences(
    optimizer, *, num_iterations=3, num_sequences=5, num_mutations=5, time_budget=3600
):
    """Run the optimization with the configured optimizer.

    The settings are keyword-only and default to the values used by this
    example, so calling ``optimize_sequences(optimizer)`` behaves as before.

    Args:
        optimizer: object exposing an ``optimize`` method that accepts the
            keyword arguments below.
        num_iterations: forwarded to ``optimizer.optimize``. Defaults to 3.
        num_sequences: forwarded to ``optimizer.optimize``. Defaults to 5.
        num_mutations: forwarded to ``optimizer.optimize``. Defaults to 5.
        time_budget: forwarded to ``optimizer.optimize``. Defaults to 3600
            (presumably seconds -- confirm against the optimizer).

    Returns:
        Whatever ``optimizer.optimize`` returns, unchanged.
    """
    return optimizer.optimize(
        num_iterations=num_iterations,
        num_sequences=num_sequences,
        num_mutations=num_mutations,
        time_budget=time_budget,
    )


def main():
    """Run the example: prepare the scorer, load the data, then optimize."""
    logging.basicConfig(level=logging.INFO)
    scorer_path = initialize_environment()
    experiment = load_experiment_parameters()
    substrate_smiles, product_smiles, sample_sequence, intervals = experiment
    optimizer = setup_optimizer(
        substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path
    )
    # The optimization yields a pair; neither element is used further here.
    optimized_sequences, iteration_info = optimize_sequences(optimizer)
    logging.info("Optimization completed.")


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ scikit-learn>=1.0.0,<1.3.0
scikit-optimize>=0.8.1
scipy>=1.0.0
sentencepiece>=0.1.95
sentence_transformers>1.0,<=2.2.2
sympy>=1.10.1
tables>=3.7.0
tape-proteins>=0.4
Expand Down
4 changes: 2 additions & 2 deletions src/gt4sd/frameworks/enzeptional/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# Copyright (c) 2022 GT4SD team
# Copyright (c) 2024 GT4SD team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand All @@ -26,4 +26,4 @@
Module for enzyme optimization.
"""

from .optimization import EnzymeOptimizer # noqa: F401
from .core import EnzymeOptimizer # noqa: F401
Loading

0 comments on commit 789cbcc

Please sign in to comment.