Skip to content

Commit

Permalink
Merge pull request #35 from nlesc-nano/bde_update
Browse files Browse the repository at this point in the history
0.4.3
*****

* Improved interaction between the database and BDE module
* Cleaned up BDE module
* HDF5 indices are now always sorted when interacting with the database
  • Loading branch information
BvB93 authored Jun 25, 2019
2 parents 3e661d7 + 821f2fa commit d5c4029
Show file tree
Hide file tree
Showing 9 changed files with 128 additions and 68 deletions.
2 changes: 1 addition & 1 deletion CAT/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.4.2'
__version__ = '0.4.3'
145 changes: 89 additions & 56 deletions CAT/analysis/ligand_bde.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,10 @@ def init_bde(qd_df, arg):
# Check if the calculation has been done already
if not overwrite and 'qd' in arg.optional.database.read:
with data.open_csv_qd(data.csv_qd, write=False) as db:
try:
for i in db[['BDE label', 'BDE dE', 'BDE dG', 'BDE ddG']]:
qd_df[i] = np.nan
except KeyError:
pass
key_ar = np.array(['BDE label', 'BDE dE', 'BDE dG', 'BDE ddG'])
bool_ar = np.isin(key_ar, db.columns.levels[0])
for i in db[key_ar[bool_ar]]:
qd_df[i] = np.nan
data.from_csv(qd_df, database='QD', get_mol=False)
qd_df.dropna(axis='columns', how='all', inplace=True)

Expand All @@ -67,61 +66,65 @@ def _bde_w_dg(qd_df, arg):
"""
data = Database(arg.optional.database.dirname)
overwrite = 'qd' in arg.optional.database.overwrite

# Prepare the job settings
j1, j2 = arg.optional.qd.dissociate.job1, arg.optional.qd.dissociate.job2
s1, s2 = arg.optional.qd.dissociate.s1, arg.optional.qd.dissociate.s2
ion = arg.optional.qd.dissociate.core_atom
lig_count = arg.optional.qd.dissociate.lig_count

# Identify previously calculated results
try:
has_na = qd_df[['BDE dE', 'BDE dG']].isna().all(axis='columns')
if not has_na.any():
return
except KeyError:
has_na = qd_df.index

for i, mol in qd_df['mol'][has_na].iteritems():
for idx, mol in qd_df['mol'][has_na].iteritems():
# Create XYn and all XYn-dissociated quantum dots
xyn = get_xy2(mol)
xyn = get_xy2(mol, ion, lig_count)
if not arg.optional.qd.dissociate.core_index:
mol_wo_xyn = dissociate_ligand(mol, arg)
else:
mol_wo_xyn = dissociate_ligand2(mol, arg)

# Construct new columns for **qd_df**
values = [i.properties.df_index for i in mol_wo_xyn]
n = len(values)
sub_idx = np.arange(n)
super_idx = ['BDE label', 'BDE dE', 'BDE ddG', 'BDE dG']
idx = list(product(super_idx, sub_idx))
for j in idx:
if j not in qd_df.index:
qd_df[j] = np.nan
labels = [m.properties.df_index for m in mol_wo_xyn]
sub_idx = np.arange(len(labels)).astype(str, copy=False)
try:
n = qd_df['BDE label'].shape[1]
except KeyError:
n = 0
if len(labels) > n:
for i in sub_idx[n:]:
qd_df[('BDE label', i)] = qd_df[('BDE dE', i)] = qd_df[('BDE ddG', i)] = np.nan

# Prepare slices
label_slice = i, idx[0:n]
dE_slice = i, idx[n:2*n]
ddG_slice = i, idx[2*n:3*n]
label_slice = idx, list(product(['BDE label'], sub_idx))
dE_slice = idx, list(product(['BDE dE'], sub_idx))
ddG_slice = idx, list(product(['BDE ddG'], sub_idx))

# Run the BDE calculations
init(path=mol.properties.path, folder='BDE')
config.default_jobmanager.settings.hashing = None
qd_df.loc[label_slice] = values
qd_df.loc[label_slice] = labels
qd_df.loc[dE_slice] = get_bde_dE(mol, xyn, mol_wo_xyn, job=j1, s=s1)
qd_df.loc[ddG_slice] = get_bde_ddG(mol, xyn, mol_wo_xyn, job=j2, s=s2)
finish()

qd_df['BDE dG'] = qd_df['BDE dE'] + qd_df['BDE ddG']

# Update the database
if 'qd' in arg.optional.database.write:
value1 = qmflows.singlepoint['specific'][type_to_string(j1)].copy()
value1.update(s1)
value2 = qmflows.freq['specific'][type_to_string(j2)].copy()
value2.update(s2)
recipe = Settings()
recipe['BDE 1'] = {'key': j1, 'value': value1}
recipe['BDE 2'] = {'key': j2, 'value': value2}
data.update_csv(qd_df, database='QD', job_recipe=recipe, overwrite=overwrite,
columns=[('settings', 'BDE 1'), ('settings', 'BDE 2')]+idx)
qd_df.sort_index(axis='columns', inplace=True)
column_tup = ('BDE label', 'BDE dE', 'BDE ddG', 'BDE dG')
kwarg = {
'database': 'QD',
'job_recipe': get_recipe(j1, s1, j2, s2),
'overwrite': overwrite,
'columns': [('settings', 'BDE 1'), ('settings', 'BDE 2')]
}
kwarg['columns'] += [(i, j) for i, j in qd_df.columns if i in column_tup]
data.update_csv(qd_df, **kwarg)


def _bde_wo_dg(qd_df, arg):
Expand All @@ -132,62 +135,83 @@ def _bde_wo_dg(qd_df, arg):
:parameter arg: A settings object containing all (optional) arguments.
:type arg: |plams.Settings|_ (superclass: |dict|_).
"""
# Unpack arguments
data = Database(arg.optional.database.dirname)
overwrite = 'qd' in arg.optional.database.overwrite

# Prepare the job settings
j1 = arg.optional.qd.dissociate.job1
s1 = arg.optional.qd.dissociate.s1
ion = arg.optional.qd.dissociate.core_atom
lig_count = arg.optional.qd.dissociate.lig_count

# Identify previously calculated results
try:
has_na = qd_df['BDE dE'].isna().all(axis='columns')
if not has_na.any():
return
except KeyError:
has_na = qd_df.index

for i, mol in qd_df['mol'][has_na].iteritems():
for idx, mol in qd_df['mol'][has_na].iteritems():
# Create XYn and all XYn-dissociated quantum dots
xyn = get_xy2(mol)
xyn = get_xy2(mol, ion, lig_count)
if not arg.optional.qd.dissociate.core_index:
mol_wo_xyn = dissociate_ligand(mol, arg)
else:
mol_wo_xyn = dissociate_ligand2(mol, arg)

# Construct new columns for **qd_df**
values = [i.properties.df_index for i in mol_wo_xyn]
n = len(values)
sub_idx = np.arange(n)
super_idx = ['BDE label', 'BDE dE']
idx = list(product(super_idx, sub_idx))
for j in idx:
if j not in qd_df.index:
qd_df[j] = np.nan
labels = [m.properties.df_index for m in mol_wo_xyn]
sub_idx = np.arange(len(labels)).astype(str, copy=False)
try:
n = qd_df['BDE label'].shape[1]
except KeyError:
n = 0
if len(labels) > n:
for i in sub_idx[n:]:
qd_df[('BDE label', i)] = qd_df[('BDE dE', i)] = np.nan

# Prepare slices
label_slice = i, idx[0:n]
dE_slice = i, idx[n:2*n]
label_slice = idx, list(product(['BDE label'], sub_idx))
dE_slice = idx, list(product(['BDE dE'], sub_idx))

# Run the BDE calculations
init(path=mol.properties.path, folder='BDE')
config.default_jobmanager.settings.hashing = None
qd_df.loc[label_slice] = values
qd_df.loc[label_slice] = labels
qd_df.loc[dE_slice] = get_bde_dE(mol, xyn, mol_wo_xyn, job=j1, s=s1)
finish()

# Update the database
if 'qd' in arg.optional.database.write:
recipe = Settings()
value = qmflows.singlepoint[type_to_string(j1)]
value.update(s1)
recipe['BDE 1'] = {'key': j1, 'value': value}
data.update_csv(qd_df, database='QD', job_recipe=recipe, overwrite=overwrite,
columns=[('settings', 'BDE 1')]+idx)
qd_df.sort_index(axis='columns', inplace=True)
column_tup = ('BDE label', 'BDE dE')
kwarg = {
'database': 'QD',
'job_recipe': get_recipe(j1, s1),
'overwrite': overwrite,
'columns': [('settings', 'BDE 1')]
}
kwarg['columns'] += [(i, j) for i, j in qd_df.columns if i in column_tup]
data.update_csv(qd_df, **kwarg)


def get_recipe(job1, s1, job2=None, s2=None):
    """Return a dictionary with job types and job settings for the BDE workflow.

    :parameter job1: The job type of the first (singlepoint) BDE calculation.
    :parameter s1: The job settings of the first (singlepoint) BDE calculation.
    :type s1: |plams.Settings|_ (superclass: |dict|_).
    :parameter job2: The optional job type of the second (frequency) BDE calculation.
    :parameter s2: The optional job settings of the second (frequency) BDE calculation.
    :type s2: |plams.Settings|_ (superclass: |dict|_).
    :return: A Settings instance with the ``'BDE 1'`` recipe and, if both *job2*
        and *s2* are supplied, the ``'BDE 2'`` recipe.
    :rtype: |plams.Settings|_ (superclass: |dict|_).
    """
    ret = Settings()

    # Merge the user-supplied settings into a copy of the default singlepoint template;
    # copy first so the shared qmflows template is not mutated
    value1 = qmflows.singlepoint['specific'][type_to_string(job1)].copy()
    value1.update(s1)
    ret['BDE 1'] = {'key': job1, 'value': value1}

    # The second (frequency) job is optional: only add it when both its
    # job type and settings are provided
    if job2 is not None and s2 is not None:
        value2 = qmflows.freq['specific'][type_to_string(job2)].copy()
        value2.update(s2)
        ret['BDE 2'] = {'key': job2, 'value': value2}

    return ret


def get_bde_dE(tot, lig, core, job=None, s=None):
""" Calculate the bond dissociation energy: dE = dE(mopac) + (dG(uff) - dE(uff))
"""
"""Calculate the bond dissociation energy: dE = dE(mopac) + (dG(uff) - dE(uff))."""
# Optimize XYn
if job == AMSJob:
s_cp = s.copy()
Expand Down Expand Up @@ -283,14 +307,22 @@ def get_ligand(mol):
else:
ret = Molecule()
ret.atoms = at_list
ret.bonds = set(chain.from_iterable(at.bonds for at in at_list))
ret.bonds = list(set(chain.from_iterable(at.bonds for at in at_list)))
return ret.copy()

# Translate the ligands to their final position
lig1 = get_ligand(mol)
lig2 = lig1.copy()
idx1, anchor1 = get_anchor(lig1)
idx2, anchor2 = get_anchor(lig2)

    # Return the ligand without the ion
if ion is None:
lig1.properties.name = 'XYn'
lig1.properties.path = mol.properties.path
lig1.properties.indices = [idx1]
return lig1

radius = anchor1.radius + PeriodicTable.get_radius(ion)
target = np.array([radius, 0.0, 0.0])
lig1.translate(anchor1.vector_to(target))
Expand Down Expand Up @@ -450,13 +482,14 @@ def remove_ligands(mol, combinations_dict, indices, idx_lig):
for core in combinations_dict:
for lig in combinations_dict[core]:
mol_tmp = mol.copy()

mol_tmp.properties = Settings()
mol_tmp.properties.core_topology = str(mol[core].properties.topology) + '_' + str(core)
mol_tmp.properties.lig_residue = sorted([mol[i[0]].properties.pdb_info.ResidueNumber
for i in lig])
mol_tmp.properties.df_index = mol_tmp.properties.core_topology
mol_tmp.properties.df_index += ''.join([' ' + str(i)
for i in mol_tmp.properties.lig_residue])
mol_tmp.properties.df_index += ' '.join(str(i) for i in mol_tmp.properties.lig_residue)

delete_idx = sorted([core] + list(chain.from_iterable(lig)), reverse=True)
for i in delete_idx:
mol_tmp.delete_atom(mol_tmp[i])
Expand Down
6 changes: 4 additions & 2 deletions CAT/attachment/ligand_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
__all__ = ['init_ligand_opt']

import itertools

import numpy as np
import pandas as pd

from scm.plams.mol.molecule import Molecule
from scm.plams.mol.atom import Atom
Expand Down Expand Up @@ -48,15 +50,15 @@ def init_ligand_opt(ligand_df, arg):
if arg.optional.ligand.optimize:
# Identify the to be optimized ligands
if overwrite:
idx = ligand_df.index
idx = pd.Series(True, index=ligand_df.index, name='mol')
message = '\t has been (re-)optimized'
else:
idx = ligand_df['hdf5 index'] < 0
message = '\t has been optimized'

# Optimize the ligands
lig_new = []
for i, ligand in ligand_df['mol'][idx].iteritems():
for ligand in ligand_df['mol'][idx]:
mol_list = split_mol(ligand)
for mol in mol_list:
mol.set_dihed(180.0)
Expand Down
4 changes: 3 additions & 1 deletion CAT/attachment/qd_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

__all__ = ['init_qd_opt']

import pandas as pd

from scm.plams.core.settings import Settings
from scm.plams.core.functions import (init, finish)
from scm.plams.interfaces.adfsuite.ams import AMSJob
Expand All @@ -27,7 +29,7 @@ def init_qd_opt(qd_df, arg):
job_recipe = arg.optional.qd.optimize
overwrite = 'qd' in arg.optional.database.overwrite
if overwrite:
idx = qd_df['hdf5 index']
idx = pd.Series(True, index=qd_df.index, name='mol')
message = '\t has been (re-)optimized'
else:
idx = qd_df['hdf5 index'] < 0
Expand Down
23 changes: 19 additions & 4 deletions CAT/data_handling/CAT_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,11 +386,12 @@ def __exit__(self, type, value, traceback):
self.df = None

class open_csv_qd():
""" Context manager for opening and closing the quantum dot database.
"""Context manager for opening and closing the quantum dot database.
:param str path: The path+filename to the database component.
:param bool write: Whether or not the database file should be updated after
closing **self**.
"""

def __init__(self, path=None, write=True):
Expand All @@ -417,7 +418,16 @@ def __exit__(self, type, value, traceback):
self.df = None

class DF(dict):
"""A mutable container for holding dataframes."""
"""A mutable container for holding dataframes.
A subclass of :class:`dict` containing a single key (``"df"``) and value
(a Pandas DataFrame).
Calling an item or attribute of :class:`.DF` will call said method on the
underlaying DataFrame (``self["df"]``).
An exception to this is the ``"df"`` key, which will get/set the DataFrame
instead.
"""

def __init__(self, df: pd.DataFrame) -> None:
super().__init__()
Expand Down Expand Up @@ -449,7 +459,7 @@ def __setitem__(self, key, value):

def __getitem__(self, key):
df = super().__getitem__('df')
if key == 'df':
if isinstance(key, str) and key == 'df':
return df
return df.__getitem__(key)

Expand Down Expand Up @@ -574,7 +584,12 @@ def update_hdf5(self, df, database='ligand', overwrite=False):

# If **overwrite** is *True*
if overwrite and old.any():
f[database][old] = as_pdb_array(df['mol'][old.index], min_size=j)
ar = as_pdb_array(df['mol'][old.index], min_size=j)

# Ensure that the hdf5 indices are sorted
idx = np.argsort(old)
old = old[idx]
f[database][old] = ar[idx]

return ret

Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ Change Log
All notable changes to this project will be documented in this file.
This project adheres to `Semantic Versioning <http://semver.org/>`_.

0.4.3
*****

* Improved interaction between the database and BDE module
* Cleaned up BDE module
* HDF5 indices are now always sorted when interacting with the database


0.4.2
*****

Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
:target: https://www.python.org

#######################################
Compound Attachment/Analysis Tool 0.4.2
Compound Attachment/Analysis Tool 0.4.3
#######################################

**CAT** is a collection of tools designed for the construction,
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
# The short X.Y version.
version = '0.4'
# The full version, including alpha/beta/rc tags.
release = '0.4.2'
release = '0.4.3'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
Loading

0 comments on commit d5c4029

Please sign in to comment.