Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #36

Merged
merged 3 commits into from
Feb 4, 2021
Merged

Dev #36

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions flamingo/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@

#: Schema to validate the filters to apply for screening
SCHEMA_FILTERS = Schema({
# Filter out molecules with more than one anchor group
Optional("single_anchor", default=True): bool,

# Include or exclude one or more functional group using smiles
Optional("include_functional_groups"): Schema([str]),
Optional("exclude_functional_groups"): Schema([str]),
Expand All @@ -39,6 +42,7 @@
Optional("bulkiness"): SCHEMA_BULKINESS,

Optional("scscore"): SCHEMA_ORDERING

})

#: Schema to validate the input for screening
Expand All @@ -52,6 +56,7 @@
# Functional group used as anchor
Optional("anchor", default="O(C=O)[H]"): str,


# path to the molecular coordinates of the Core to attach the ligands
Optional("core"): str,

Expand Down
30 changes: 23 additions & 7 deletions flamingo/screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Fragments

from .features.featurizer import generate_fingerprints
from .cat_interface import compute_bulkiness
Expand Down Expand Up @@ -45,13 +46,12 @@ def split_filter_in_batches(opts: Options) -> None:
result_path = Path(opts.output_path)
result_path.mkdir(exist_ok=True, parents=True)


# Compute the number of batches to split
nbatches = len(molecules) // 1000
nbatches = nbatches if nbatches > 0 else 1

# Check precomputed batches
computed_batches = search_for_computed_batches()
computed_batches = search_for_computed_batches(opts.output_path)

for k, batch in enumerate(np.array_split(molecules, nbatches)):
if k < computed_batches:
Expand All @@ -77,7 +77,7 @@ def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) ->
molecules
:class:`pandas.DataFrame` with the molecular data.
opts
:class:`swan.utils.Options` options to run the filtering
:class:`flamingo.utils.Options` options to run the filtering
output_file
:class:`pathlib.Path`
"""
Expand All @@ -98,7 +98,9 @@ def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) ->
"include_functional_groups": include_functional_groups,
"exclude_functional_groups": exclude_functional_groups,
"bulkiness": filter_by_bulkiness,
"scscore": filter_by_scscore}
"scscore": filter_by_scscore,
"single_anchor": filter_single_anchor,
}

for key in opts.filters.keys():
if key in available_filters:
Expand Down Expand Up @@ -148,6 +150,20 @@ def has_substructure(patterns: FrozenSet, mol: Chem.Mol) -> bool:
return False if mol is None else any(mol.HasSubstructMatch(p) for p in patterns)


def filter_single_anchor(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
    """Exclude molecules that contain more than a single anchoring functional group.

    Only anchors matching the carboxylic acid pattern ``[OH]C=O`` are
    supported; for those, the number of anchoring groups in each molecule is
    counted with :func:`rdkit.Chem.Fragments.fr_COO`.

    Parameters
    ----------
    molecules
        :class:`pandas.DataFrame` with the molecular data. The RDKit molecules
        are read from the ``rdkit_molecules`` column.
    opts
        :class:`flamingo.utils.Options` options to run the filtering. Only
        ``opts.anchor`` (a SMILES string) is read here.

    Returns
    -------
    :class:`pandas.DataFrame`
        The rows of ``molecules`` whose molecule has exactly one anchoring group.

    Raises
    ------
    ValueError
        If ``opts.anchor`` cannot be parsed as SMILES.
    NotImplementedError
        If the anchor is not a carboxylic acid.

    """
    # NOTE(review): the value of the ``single_anchor`` option is not read here;
    # confirm that the caller skips this filter when the flag is set to False.
    logger.debug(f"exclude molecules with more than a single {opts.anchor} anchoring group")
    anchor = Chem.MolFromSmiles(opts.anchor)
    if anchor is None:
        # MolFromSmiles signals a parse failure by returning None
        raise ValueError(f"invalid smiles for anchor: {opts.anchor}")

    if not anchor.HasSubstructMatch(Chem.MolFromSmarts("[OH]C=O")):
        msg = f"single_anchor_filter not implemented for anchor: {opts.anchor}"
        raise NotImplementedError(msg)

    # Build the counts explicitly instead of using ``np.vectorize``: without
    # ``otypes`` it raises on size-0 input (e.g. when previous filters removed
    # every molecule). Molecules that failed to parse (None) count as having
    # no anchor, mirroring how ``has_substructure`` treats them.
    counts = np.array(
        [0 if mol is None else Fragments.fr_COO(mol) for mol in molecules["rdkit_molecules"]],
        dtype=int,
    )
    return molecules[counts == 1]


def filter_by_bulkiness(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
"""Filter the ligands that have a given bulkiness.

Expand Down Expand Up @@ -220,9 +236,9 @@ def merge_result():
results = pd.concat(files)
results.to_csv("FinalResults.csv", index=False)

def search_for_computed_batches():
def search_for_computed_batches(output_path: Path) -> int:
"""Check for batches that have been already computed."""
path = Path("results")
path = Path(output_path)
if path.exists():
computed = len(set(path.glob("batch_*")))
return computed - 1 if computed > 0 else 0
Expand All @@ -231,7 +247,7 @@ def search_for_computed_batches():

def main():
"""Parse the command line arguments to screen smiles."""
parser = argparse.ArgumentParser(description="modeller -i input.yml")
parser = argparse.ArgumentParser("smiles_screener")
# configure logger
parser.add_argument('-i', required=True,
help="Input file with options")
Expand Down
3 changes: 2 additions & 1 deletion tests/files/smiles_carboxylic.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
0,CCCCCCCCC=CCCCCCCCC(=O)O
1,CC(=O)O
2,O=C(O)Cc1ccccc1
3,CC(C(=O)O)O
3,CC(C(=O)O)O
4,C1=CC(=CC(=C1)C(=O)O)C(=O)O
12 changes: 12 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,15 @@ def test_filter_scscore_greater(tmp_path: Path) -> None:

expected = {"O=C(O)C1CNC2C3CC4C2N4C13"}
check_expected(opts, expected)


def test_single_anchor(tmp_path: Path) -> None:
    """Check that only molecules with a single carboxylic acid group are included."""
    opts = create_options({"single_anchor": True}, "smiles_carboxylic.csv", tmp_path)
    opts.anchor = "O(C=O)[H]"

    # The fixture also holds a dicarboxylic acid
    # (C1=CC(=CC(=C1)C(=O)O)C(=O)O), which is deliberately absent from the
    # expected set.
    check_expected(
        opts,
        {
            "CCCCCCCCC=CCCCCCCCC(=O)O",
            "CC(=O)O",
            "O=C(O)Cc1ccccc1",
            "CC(O)C(=O)O",
        },
    )