nlesc-nano · felipeZ · Feb 4, 2021 · Jan 25, 2021 · Jan 26, 2021 · Feb 4, 2021
diff --git a/flamingo/schemas.py b/flamingo/schemas.py
@@ -31,6 +31,9 @@
 
 #: Schema to validate the filters to apply for screening
 SCHEMA_FILTERS = Schema({
+    # Filter out molecules that do not have exactly one anchoring group
+    Optional("single_anchor", default=True): bool,
+
     # Include or exclude one or more functional group using smiles
     Optional("include_functional_groups"): Schema([str]),
     Optional("exclude_functional_groups"): Schema([str]),
@@ -39,6 +42,7 @@
     Optional("bulkiness"): SCHEMA_BULKINESS,
 
     Optional("scscore"): SCHEMA_ORDERING
+
 })
 
 #: Schema to validate the input for screening
@@ -52,6 +56,7 @@
     # Functional group used as anchor
     Optional("anchor", default="O(C=O)[H]"): str,
 
+
     # path to the molecular coordinates of the Core to attach the ligands
     Optional("core"): str,
 

diff --git a/flamingo/screen.py b/flamingo/screen.py
@@ -18,6 +18,7 @@
 import numpy as np
 import pandas as pd
 from rdkit import Chem
+from rdkit.Chem import Fragments
 
 from .features.featurizer import generate_fingerprints
 from .cat_interface import compute_bulkiness
@@ -45,13 +46,12 @@ def split_filter_in_batches(opts: Options) -> None:
     result_path = Path(opts.output_path)
     result_path.mkdir(exist_ok=True, parents=True)
 
-
     # Compute the number of batches to split
     nbatches = len(molecules) // 1000
     nbatches = nbatches if nbatches > 0 else 1
 
     # Check precomputed batches
-    computed_batches = search_for_computed_batches()
+    computed_batches = search_for_computed_batches(opts.output_path)
 
     for k, batch in enumerate(np.array_split(molecules, nbatches)):
         if k < computed_batches:
@@ -77,7 +77,7 @@ def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) ->
     molecules
         :class:`pandas.Dataframe` with the molecular data.
     opts
-        :class:`swan.utils.Options` options to run the filtering
+        :class:`flamingo.utils.Options` options to run the filtering
     output_file
         :class:`pathlib.Path`
     """
@@ -98,7 +98,9 @@ def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) ->
         "include_functional_groups": include_functional_groups,
         "exclude_functional_groups": exclude_functional_groups,
         "bulkiness": filter_by_bulkiness,
-        "scscore": filter_by_scscore}
+        "scscore": filter_by_scscore,
+        "single_anchor": filter_single_anchor,
+        }
 
     for key in opts.filters.keys():
         if key in available_filters:
@@ -148,6 +150,20 @@ def has_substructure(patterns: FrozenSet, mol: Chem.Mol) -> bool:
     return False if mol is None else any(mol.HasSubstructMatch(p) for p in patterns)
 
 
+def filter_single_anchor(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
+    """Keep only the rows of `molecules` whose molecule has exactly one `opts.anchor` group."""
+    logger.debug(f"exclude molecules with more than a single {opts.anchor} anchoring group")
+    anchor = Chem.MolFromSmiles(opts.anchor)
+    if anchor is None:  # MolFromSmiles returns None (it does not raise) for an invalid smiles
+        raise ValueError(f"invalid anchor smiles: {opts.anchor}")
+    if not anchor.HasSubstructMatch(Chem.MolFromSmarts("[OH]C=O")):
+        raise NotImplementedError(f"single_anchor_filter not implemented for anchor: {opts.anchor}")
+
+    # Unlike np.vectorize, a comprehension tolerates empty batches; None molecules are dropped
+    singles = [m is not None and Fragments.fr_COO(m) == 1 for m in molecules["rdkit_molecules"]]
+    return molecules[np.array(singles, dtype=bool)]
+
+
 def filter_by_bulkiness(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
     """Filter the ligands that have a given bulkiness.
 
@@ -220,9 +236,9 @@ def merge_result():
         results = pd.concat(files)
         results.to_csv("FinalResults.csv", index=False)
 
-def search_for_computed_batches():
+def search_for_computed_batches(output_path: Path) -> int:
     """Check for batches that have been already computed."""
-    path = Path("results")
+    path = Path(output_path)
     if path.exists():
         computed = len(set(path.glob("batch_*")))
         return computed - 1 if computed > 0 else 0
@@ -231,7 +247,7 @@ def search_for_computed_batches():
 
 def main():
     """Parse the command line arguments to screen smiles."""
-    parser = argparse.ArgumentParser(description="modeller -i input.yml")
+    parser = argparse.ArgumentParser("smiles_screener")
     # configure logger
     parser.add_argument('-i', required=True,
                         help="Input file with options")

diff --git a/tests/files/smiles_carboxylic.csv b/tests/files/smiles_carboxylic.csv
@@ -2,4 +2,5 @@
 0,CCCCCCCCC=CCCCCCCCC(=O)O
 1,CC(=O)O
 2,O=C(O)Cc1ccccc1
-3,CC(C(=O)O)O
+3,CC(C(=O)O)O
+4,C1=CC(=CC(=C1)C(=O)O)C(=O)O
diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -156,3 +156,15 @@ def test_filter_scscore_greater(tmp_path: Path) -> None:
 
     expected = {"O=C(O)C1CNC2C3CC4C2N4C13"}
     check_expected(opts, expected)
+
+
+def test_single_anchor(tmp_path: Path) -> None:
+    """Check that only the molecules with a single carboxylic acid group are kept."""
+    # Entry 4 of the csv file carries two carboxylic acid groups,
+    # therefore it is the only molecule missing from the expected set.
+    expected = {"CCCCCCCCC=CCCCCCCCC(=O)O", "CC(=O)O", "O=C(O)Cc1ccccc1", "CC(O)C(=O)O"}
+
+    opts = create_options({"single_anchor": True}, "smiles_carboxylic.csv", tmp_path)
+    opts.anchor = "O(C=O)[H]"
+
+    check_expected(opts, expected)