Skip to content

Commit

Permalink
Data-CAT 0.3.0 (#25)
Browse files: browse the repository at this point in the history
* Overhauled the .pdb storage system.
* Introduced the `PDBContainer` class.
* Renamed `dataCAT.database_functions` to `dataCAT.functions`.
  • Loading branch information
BvB93 authored Jun 19, 2020
1 parent e967c1c commit b0ce7c5
Show file tree
Hide file tree
Showing 47 changed files with 1,962 additions and 158 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ All notable changes to this project will be documented in this file.
This project adheres to `Semantic Versioning <http://semver.org/>`_.


0.3.0
*****
* Overhaul of the .pdb storage system.


0.2.2
*****
* Updated the documentation (see https://github.com/nlesc-nano/CAT/pull/123).
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


##############
Data-CAT 0.2.2
Data-CAT 0.3.0
##############

Data-CAT is a databasing framework for the Compound Attachment Tools package (CAT_).
Expand Down
File renamed without changes.
6 changes: 4 additions & 2 deletions dataCAT/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,19 @@
version_info = VersionInfo.from_str(__version__)
del VersionInfo

from .functions import df_to_mongo_dict, int_to_slice
from .df_proxy import DFProxy
from .pdb_array import DTYPE_ATOM, DTYPE_BOND, PDBContainer
from .context_managers import OpenYaml, OpenLig, OpenQD
from .database_functions import df_to_mongo_dict
from .database import Database

__author__ = 'B. F. van Beek'
__email__ = 'b.f.van.beek@vu.nl'

__all__ = [
'df_to_mongo_dict', 'int_to_slice',
'DFProxy',
'DTYPE_ATOM', 'DTYPE_BOND', 'PDBContainer',
'OpenYaml', 'OpenLig', 'OpenQD',
'df_to_mongo_dict',
'Database',
]
2 changes: 1 addition & 1 deletion dataCAT/__version__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""The **Data-CAT** version."""

__version__ = '0.2.2'
__version__ = '0.3.0'
53 changes: 41 additions & 12 deletions dataCAT/create_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,15 @@
from CAT.logger import logger
from CAT import version_info as CAT_VERSION # noqa: N812

from . import version_info as DATACAT_VERSION # noqa: N812
from .pdb_array import DTYPE_ATOM, DTYPE_BOND, PDBContainer
from .functions import from_pdb_array

try:
from nanoCAT import version_info as NANOCAT_VERSION # noqa: N812
except ImportError:
NANOCAT_VERSION = VersionInfo(-1, -1, -1)

from . import version_info as DATACAT_VERSION # noqa: N812

__all__: List[str] = []

Ligand = Literal['ligand', 'ligand_no_opt']
Expand Down Expand Up @@ -158,7 +160,7 @@ def _create_hdf5(path, name='structures.hdf5'): # noqa: E302

# Define arguments for 2D datasets
dataset_names = ('core', 'core_no_opt', 'ligand', 'ligand_no_opt', 'qd', 'qd_no_opt', )
kwargs = {'chunks': True, 'maxshape': (None, None), 'compression': 'gzip'}
kwargs = {'chunks': True, 'compression': 'gzip'}

# Define arguments for 3D datasets
kwargs_3d = {'chunks': True, 'maxshape': (None, None, None), 'compression': 'gzip'}
Expand All @@ -172,21 +174,48 @@ def _create_hdf5(path, name='structures.hdf5'): # noqa: E302
with h5py.File(path, 'a', libver='latest') as f:
# Store the version of CAT, nano-CAT and data-CAT
if 'CAT.__version__' not in f:
f.create_dataset(data=[CAT_VERSION], name='CAT.__version__', **kwargs_version)
f.create_dataset('CAT.__version__', data=[CAT_VERSION], **kwargs_version)
if 'nanoCAT.__version__' not in f:
f.create_dataset(data=[NANOCAT_VERSION], name='nanoCAT.__version__', **kwargs_version)
f.create_dataset('nanoCAT.__version__', data=[NANOCAT_VERSION], **kwargs_version)
if 'dataCAT.__version__' not in f:
f.create_dataset(data=[DATACAT_VERSION], name='dataCAT.__version__', **kwargs_version)
f.create_dataset('dataCAT.__version__', data=[DATACAT_VERSION], **kwargs_version)

# Create new 2D datasets
iterator_2d = (name_ for name_ in dataset_names if name_ not in f)
for name_ in iterator_2d:
f.create_dataset(name=name_, data=np.empty((0, 1), dtype='S80'), **kwargs)
for grp_name in dataset_names:
if isinstance(f.get(grp_name), h5py.Dataset):
logger.info(f'Updating h5py Dataset to data-CAT >= 0.3 style: {grp_name!r}')
iterator = (from_pdb_array(pdb, rdmol=False, warn=False) for pdb in f[grp_name])
pdb = PDBContainer.from_molecules(iterator)
del f[grp_name]
elif grp_name in f:
continue
else:
pdb = None

grp = f.create_group(grp_name, track_order=True)
grp.attrs['__doc__'] = b"A set of datasets representing `dataCAT.PDBTuple`."

dtype1 = list(DTYPE_ATOM.items())
dtype2 = list(DTYPE_BOND.items())

grp.create_dataset('atoms', shape=(0, 0), maxshape=(None, None), dtype=dtype1, **kwargs)
grp.create_dataset('bonds', shape=(0, 0), maxshape=(None, None), dtype=dtype2, **kwargs)
grp.create_dataset('atom_count', shape=(0,), maxshape=(None,), dtype='int32')
grp.create_dataset('bond_count', shape=(0,), maxshape=(None,), dtype='int32')

grp['atoms'].attrs['__doc__'] = b"A dataset representing `dataCATPDBTuple.atoms`."
grp['bonds'].attrs['__doc__'] = b"A dataset representing `PDBTuple.bonds`."
grp['atom_count'].attrs['__doc__'] = b"A dataset representing `PDBTuple.atom_count`."
grp['bond_count'].attrs['__doc__'] = b"A dataset representing `PDBTuple.bond_count`."

if pdb is not None:
pdb.to_hdf5(grp, mode='append')

# Create new 3D datasets
iterator_3d = (name_ for name_ in dataset_names_3d if name_ not in f)
for name_ in iterator_3d:
f.create_dataset(name=name_, data=np.empty((0, 1, 1), dtype='S120'), **kwargs_3d)
iterator_3d = (grp_name for grp_name in dataset_names_3d if grp_name not in f)
for grp_name in iterator_3d:
f.create_dataset(grp_name, data=np.empty((0, 1, 1), dtype='S120'), **kwargs_3d)

return path


Expand Down
61 changes: 27 additions & 34 deletions dataCAT/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@
"""

import reprlib
import warnings
import textwrap
from os import getcwd
from os import getcwd, PathLike
from os.path import abspath
from time import sleep
from types import MappingProxyType
from functools import partial
from itertools import count
from typing import (
Optional, Sequence, List, Union, Any, Dict, TypeVar, Mapping,
overload, TYPE_CHECKING, Tuple, Type
overload, Tuple, Type
)

import h5py
Expand All @@ -35,18 +36,13 @@
from rdkit.Chem import Mol
from scm.plams import Settings, Molecule
from nanoutils import PathType, TypedDict
from CAT.logger import logger
from CAT.mol_utils import from_rdmol # noqa: F401
from CAT.workflows import HDF5_INDEX, OPT, MOL

from .create_database import _create_csv, _create_yaml, _create_hdf5, _create_mongodb, QD, Ligand
from .context_managers import OpenYaml, OpenLig, OpenQD
from .database_functions import (
df_to_mongo_dict, even_index, from_pdb_array, sanitize_yaml_settings, as_pdb_array
)

if TYPE_CHECKING:
from os import PathLike # noqa: F401
from .functions import df_to_mongo_dict, even_index, sanitize_yaml_settings
from .pdb_array import PDBContainer

__all__ = ['Database']

Expand Down Expand Up @@ -477,17 +473,15 @@ def update_hdf5(self, df: pd.DataFrame,
# Add new entries to the database
self.hdf5_availability()
with self.hdf5('r+', libver='latest') as f:
i, j = f[database].shape

if new.any():
pdb_array = as_pdb_array(df[MOL][new.index], min_size=j)
mol_series = df.loc[new.index, MOL]
pdb_new = PDBContainer.from_molecules(mol_series)
pdb_new.to_hdf5(f[database], mode='append')

# Reshape and update **self.hdf5**
k = i + pdb_array.shape[0]
f[database].shape = k, pdb_array.shape[1]
f[database][i:k] = pdb_array
j = len(f[database]['atoms'])
i = j - len(mol_series)
ret = pd.Series(np.arange(i, j), index=new.index, name=HDF5_INDEX)

ret = pd.Series(np.arange(i, k), index=new.index, name=HDF5_INDEX)
df.update(ret, overwrite=True)
if opt:
df.loc[new.index, OPT] = True
Expand All @@ -496,22 +490,20 @@ def update_hdf5(self, df: pd.DataFrame,

# If **overwrite** is *True*
if overwrite and old.any():
ar = as_pdb_array(df[MOL][old.index], min_size=j)
old.sort_values(inplace=True)
mol_series = df.loc[old.index, MOL]

# Ensure that the hdf5 indices are sorted
idx = np.argsort(old)
old = old[idx]
f[database][old] = ar[idx]
pdb_old = PDBContainer.from_molecules(mol_series)
pdb_old.to_hdf5(f[database], mode='update', idx=old.values)
if opt:
df.loc[idx.index, OPT] = True

df.loc[old.index, OPT] = True
return ret

def _update_hdf5_settings(self, df: pd.DataFrame, column: str) -> None:
"""Export all files in **df[column]** to hdf5 dataset **column**."""
# Add new entries to the database
self.hdf5_availability()
with self.hdf5('r+') as f:
with self.hdf5('r+', libver='latest') as f:
i, j, k = f[column].shape

# Create a 3D array of input files
Expand Down Expand Up @@ -670,17 +662,15 @@ def from_hdf5(self, index: Union[slice, Sequence[int]],
A list of PLAMS or RDKit molecules.
"""
# Convert **index** to an array if it is a series or dataframe
if hasattr(index, '__array__'):
index = np.asarray(index).tolist()

# Open the database and pull entries
self.hdf5_availability()
with self.hdf5('r') as f:
pdb_array = f[database][index]
with self.hdf5('r', libver='latest') as f:
pdb = PDBContainer.from_hdf5(f[database], index)
mol_list = pdb.to_molecules()

# Return a list of RDKit or PLAMS molecules
return [from_pdb_array(mol, rdmol=rdmol) for mol in pdb_array]
if rdmol:
return [from_rdmol(mol) for mol in mol_list]
return mol_list

def hdf5_availability(self, timeout: float = 5.0,
max_attempts: Optional[int] = 10) -> None:
Expand Down Expand Up @@ -710,7 +700,10 @@ def hdf5_availability(self, timeout: float = 5.0,
"""
err = (f"h5py.File({self.hdf5.args[0]!r}) is currently unavailable; "
f"repeating attempt in {timeout:1.1f} seconds")

i = max_attempts if max_attempts is not None else np.inf
if i <= 0:
raise ValueError(f"'max_attempts' must be larger than 0; observed value: {i!r}")

while i:
try:
Expand All @@ -719,7 +712,7 @@ def hdf5_availability(self, timeout: float = 5.0,
except OSError as ex: # the .hdf5 file cannot be safely opened yet
warn = ResourceWarning(err)
warn.__cause__ = exception = ex
logger.warning(warn)
warnings.warn(warn)
sleep(timeout)
i -= 1

Expand Down
Loading

0 comments on commit b0ce7c5

Please sign in to comment.