Data-CAT 0.3.0 (#25)

* Overhauled the .pdb storage system.
* Introduced the `PDBContainer` class.
* Renamed `dataCAT.database_functions` to `dataCAT.functions`.
nlesc-nano · Jun 19, 2020 · b0ce7c5 · b0ce7c5
1 parent e967c1c
commit b0ce7c5
Show file tree

Hide file tree

Showing 47 changed files with 1,962 additions and 158 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,11 @@ All notable changes to this project will be documented in this file.
 This project adheres to `Semantic Versioning <http://semver.org/>`_.
 
 
+0.3.0
+*****
+* Overhaul of the .pdb storage system.
+
+
 0.2.2
 *****
 * Updated the documentation (see https://github.com/nlesc-nano/CAT/pull/123).

diff --git a/README.rst b/README.rst
@@ -15,7 +15,7 @@
 
 
 ##############
-Data-CAT 0.2.2
+Data-CAT 0.3.0
 ##############
 
 Data-CAT is a databasing framework for the Compound Attachment Tools package (CAT_).

diff --git a/tests/conftest.py → conftest.py b/tests/conftest.py → conftest.py
diff --git a/dataCAT/__init__.py b/dataCAT/__init__.py
@@ -9,17 +9,19 @@
 version_info = VersionInfo.from_str(__version__)
 del VersionInfo
 
+from .functions import df_to_mongo_dict, int_to_slice
 from .df_proxy import DFProxy
+from .pdb_array import DTYPE_ATOM, DTYPE_BOND, PDBContainer
 from .context_managers import OpenYaml, OpenLig, OpenQD
-from .database_functions import df_to_mongo_dict
 from .database import Database
 
 __author__ = 'B. F. van Beek'
 __email__ = 'b.f.van.beek@vu.nl'
 
 __all__ = [
+    'df_to_mongo_dict', 'int_to_slice',
     'DFProxy',
+    'DTYPE_ATOM', 'DTYPE_BOND', 'PDBContainer',
     'OpenYaml', 'OpenLig', 'OpenQD',
-    'df_to_mongo_dict',
     'Database',
 ]
diff --git a/dataCAT/__version__.py b/dataCAT/__version__.py
@@ -1,3 +1,3 @@
 """The **Data-CAT** version."""
 
-__version__ = '0.2.2'
+__version__ = '0.3.0'
diff --git a/dataCAT/create_database.py b/dataCAT/create_database.py
@@ -36,13 +36,15 @@
 from CAT.logger import logger
 from CAT import version_info as CAT_VERSION  # noqa: N812
 
+from . import version_info as DATACAT_VERSION  # noqa: N812
+from .pdb_array import DTYPE_ATOM, DTYPE_BOND, PDBContainer
+from .functions import from_pdb_array
+
 try:
     from nanoCAT import version_info as NANOCAT_VERSION  # noqa: N812
 except ImportError:
     NANOCAT_VERSION = VersionInfo(-1, -1, -1)
 
-from . import version_info as DATACAT_VERSION  # noqa: N812
-
 __all__: List[str] = []
 
 Ligand = Literal['ligand', 'ligand_no_opt']
@@ -158,7 +160,7 @@ def _create_hdf5(path, name='structures.hdf5'):  # noqa: E302
 
     # Define arguments for 2D datasets
     dataset_names = ('core', 'core_no_opt', 'ligand', 'ligand_no_opt', 'qd', 'qd_no_opt', )
-    kwargs = {'chunks': True, 'maxshape': (None, None), 'compression': 'gzip'}
+    kwargs = {'chunks': True, 'compression': 'gzip'}
 
     # Define arguments for 3D datasets
     kwargs_3d = {'chunks': True, 'maxshape': (None, None, None), 'compression': 'gzip'}
@@ -172,21 +174,48 @@ def _create_hdf5(path, name='structures.hdf5'):  # noqa: E302
     with h5py.File(path, 'a', libver='latest') as f:
         # Store the version of CAT, nano-CAT and data-CAT
         if 'CAT.__version__' not in f:
-            f.create_dataset(data=[CAT_VERSION], name='CAT.__version__', **kwargs_version)
+            f.create_dataset('CAT.__version__', data=[CAT_VERSION], **kwargs_version)
         if 'nanoCAT.__version__' not in f:
-            f.create_dataset(data=[NANOCAT_VERSION], name='nanoCAT.__version__', **kwargs_version)
+            f.create_dataset('nanoCAT.__version__', data=[NANOCAT_VERSION], **kwargs_version)
         if 'dataCAT.__version__' not in f:
-            f.create_dataset(data=[DATACAT_VERSION], name='dataCAT.__version__', **kwargs_version)
+            f.create_dataset('dataCAT.__version__', data=[DATACAT_VERSION], **kwargs_version)
 
         # Create new 2D datasets
-        iterator_2d = (name_ for name_ in dataset_names if name_ not in f)
-        for name_ in iterator_2d:
-            f.create_dataset(name=name_, data=np.empty((0, 1), dtype='S80'), **kwargs)
+        for grp_name in dataset_names:
+            if isinstance(f.get(grp_name), h5py.Dataset):
+                logger.info(f'Updating h5py Dataset to data-CAT >= 0.3 style: {grp_name!r}')
+                iterator = (from_pdb_array(pdb, rdmol=False, warn=False) for pdb in f[grp_name])
+                pdb = PDBContainer.from_molecules(iterator)
+                del f[grp_name]
+            elif grp_name in f:
+                continue
+            else:
+                pdb = None
+
+            grp = f.create_group(grp_name, track_order=True)
+            grp.attrs['__doc__'] = b"A set of datasets representing `dataCAT.PDBTuple`."
+
+            dtype1 = list(DTYPE_ATOM.items())
+            dtype2 = list(DTYPE_BOND.items())
+
+            grp.create_dataset('atoms', shape=(0, 0), maxshape=(None, None), dtype=dtype1, **kwargs)
+            grp.create_dataset('bonds', shape=(0, 0), maxshape=(None, None), dtype=dtype2, **kwargs)
+            grp.create_dataset('atom_count', shape=(0,), maxshape=(None,), dtype='int32')
+            grp.create_dataset('bond_count', shape=(0,), maxshape=(None,), dtype='int32')
+
+            grp['atoms'].attrs['__doc__'] = b"A dataset representing `dataCATPDBTuple.atoms`."
+            grp['bonds'].attrs['__doc__'] = b"A dataset representing `PDBTuple.bonds`."
+            grp['atom_count'].attrs['__doc__'] = b"A dataset representing `PDBTuple.atom_count`."
+            grp['bond_count'].attrs['__doc__'] = b"A dataset representing `PDBTuple.bond_count`."
+
+            if pdb is not None:
+                pdb.to_hdf5(grp, mode='append')
 
         # Create new 3D datasets
-        iterator_3d = (name_ for name_ in dataset_names_3d if name_ not in f)
-        for name_ in iterator_3d:
-            f.create_dataset(name=name_, data=np.empty((0, 1, 1), dtype='S120'), **kwargs_3d)
+        iterator_3d = (grp_name for grp_name in dataset_names_3d if grp_name not in f)
+        for grp_name in iterator_3d:
+            f.create_dataset(grp_name, data=np.empty((0, 1, 1), dtype='S120'), **kwargs_3d)
+
     return path
 
 

diff --git a/dataCAT/database.py b/dataCAT/database.py
@@ -14,16 +14,17 @@
 """
 
 import reprlib
+import warnings
 import textwrap
-from os import getcwd
+from os import getcwd, PathLike
 from os.path import abspath
 from time import sleep
 from types import MappingProxyType
 from functools import partial
 from itertools import count
 from typing import (
     Optional, Sequence, List, Union, Any, Dict, TypeVar, Mapping,
-    overload, TYPE_CHECKING, Tuple, Type
+    overload, Tuple, Type
 )
 
 import h5py
@@ -35,18 +36,13 @@
 from rdkit.Chem import Mol
 from scm.plams import Settings, Molecule
 from nanoutils import PathType, TypedDict
-from CAT.logger import logger
 from CAT.mol_utils import from_rdmol  # noqa: F401
 from CAT.workflows import HDF5_INDEX, OPT, MOL
 
 from .create_database import _create_csv, _create_yaml, _create_hdf5, _create_mongodb, QD, Ligand
 from .context_managers import OpenYaml, OpenLig, OpenQD
-from .database_functions import (
-    df_to_mongo_dict, even_index, from_pdb_array, sanitize_yaml_settings, as_pdb_array
-)
-
-if TYPE_CHECKING:
-    from os import PathLike  # noqa: F401
+from .functions import df_to_mongo_dict, even_index, sanitize_yaml_settings
+from .pdb_array import PDBContainer
 
 __all__ = ['Database']
 
@@ -477,17 +473,15 @@ def update_hdf5(self, df: pd.DataFrame,
         # Add new entries to the database
         self.hdf5_availability()
         with self.hdf5('r+', libver='latest') as f:
-            i, j = f[database].shape
-
             if new.any():
-                pdb_array = as_pdb_array(df[MOL][new.index], min_size=j)
+                mol_series = df.loc[new.index, MOL]
+                pdb_new = PDBContainer.from_molecules(mol_series)
+                pdb_new.to_hdf5(f[database], mode='append')
 
-                # Reshape and update **self.hdf5**
-                k = i + pdb_array.shape[0]
-                f[database].shape = k, pdb_array.shape[1]
-                f[database][i:k] = pdb_array
+                j = len(f[database]['atoms'])
+                i = j - len(mol_series)
+                ret = pd.Series(np.arange(i, j), index=new.index, name=HDF5_INDEX)
 
-                ret = pd.Series(np.arange(i, k), index=new.index, name=HDF5_INDEX)
                 df.update(ret, overwrite=True)
                 if opt:
                     df.loc[new.index, OPT] = True
@@ -496,22 +490,20 @@ def update_hdf5(self, df: pd.DataFrame,
 
             # If **overwrite** is *True*
             if overwrite and old.any():
-                ar = as_pdb_array(df[MOL][old.index], min_size=j)
+                old.sort_values(inplace=True)
+                mol_series = df.loc[old.index, MOL]
 
-                # Ensure that the hdf5 indices are sorted
-                idx = np.argsort(old)
-                old = old[idx]
-                f[database][old] = ar[idx]
+                pdb_old = PDBContainer.from_molecules(mol_series)
+                pdb_old.to_hdf5(f[database], mode='update', idx=old.values)
                 if opt:
-                    df.loc[idx.index, OPT] = True
-
+                    df.loc[old.index, OPT] = True
         return ret
 
     def _update_hdf5_settings(self, df: pd.DataFrame, column: str) -> None:
         """Export all files in **df[column]** to hdf5 dataset **column**."""
         # Add new entries to the database
         self.hdf5_availability()
-        with self.hdf5('r+') as f:
+        with self.hdf5('r+', libver='latest') as f:
             i, j, k = f[column].shape
 
             # Create a 3D array of input files
@@ -670,17 +662,15 @@ def from_hdf5(self, index: Union[slice, Sequence[int]],
             A list of PLAMS or RDKit molecules.
 
         """
-        # Convert **index** to an array if it is a series or dataframe
-        if hasattr(index, '__array__'):
-            index = np.asarray(index).tolist()
-
         # Open the database and pull entries
         self.hdf5_availability()
-        with self.hdf5('r') as f:
-            pdb_array = f[database][index]
+        with self.hdf5('r', libver='latest') as f:
+            pdb = PDBContainer.from_hdf5(f[database], index)
+            mol_list = pdb.to_molecules()
 
-        # Return a list of RDKit or PLAMS molecules
-        return [from_pdb_array(mol, rdmol=rdmol) for mol in pdb_array]
+        if rdmol:
+            return [from_rdmol(mol) for mol in mol_list]
+        return mol_list
 
     def hdf5_availability(self, timeout: float = 5.0,
                           max_attempts: Optional[int] = 10) -> None:
@@ -710,7 +700,10 @@ def hdf5_availability(self, timeout: float = 5.0,
         """
         err = (f"h5py.File({self.hdf5.args[0]!r}) is currently unavailable; "
                f"repeating attempt in {timeout:1.1f} seconds")
+
         i = max_attempts if max_attempts is not None else np.inf
+        if i <= 0:
+            raise ValueError(f"'max_attempts' must be larger than 0; observed value: {i!r}")
 
         while i:
             try:
@@ -719,7 +712,7 @@ def hdf5_availability(self, timeout: float = 5.0,
             except OSError as ex:  # the .hdf5 file cannot be safely opened yet
                 warn = ResourceWarning(err)
                 warn.__cause__ = exception = ex
-                logger.warning(warn)
+                warnings.warn(warn)
                 sleep(timeout)
             i -= 1