From 850fb06eb0d9b368e4b09a86f17920d509ce5f91 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:16:56 +0100 Subject: [PATCH 01/50] First pass at porting @annamherz's analysis code. Lots of refactoring to remove code duplication and make ongoing maintanence easier. GROMACS analysis appears to work fine, although I am seeing very different values for the TI estimator. SOMD analysis is currently failing due to a spurious value in the input data. I'm not sure if this is to do with my test data, or a bug from the port. --- python/BioSimSpace/FreeEnergy/_relative.py | 1115 +++++++++++++++++--- 1 file changed, 948 insertions(+), 167 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 8de7839ca..715093ed0 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -26,19 +26,50 @@ __all__ = ["Relative", "getData"] -from collections import OrderedDict as _OrderedDict from glob import glob as _glob import copy as _copy import math as _math -import shlex as _shlex -import sys as _sys +import numpy as _np import os as _os +import pandas as _pd +import shlex as _shlex import shutil as _shutil import subprocess as _subprocess +import sys as _sys import warnings as _warnings import zipfile as _zipfile +from .._Utils import _assert_imported, _have_imported, _try_import + +# alchemlyb isn't available on all variants of Python that we support, so we +# need to try_import it. +_alchemlyb = _try_import("alchemlyb") + +if _have_imported(_alchemlyb): + # Handle alchemlyb MBAR API changes. + try: + from alchemlyb.estimators import AutoMBAR as _AutoMBAR + except ImportError: + from alchemlyb.estimators import MBAR as _AutoMBAR + from alchemlyb.estimators import TI as _TI + from alchemlyb.postprocessors.units import to_kcalmol as _to_kcalmol + from alchemlyb.parsing.amber import extract_dHdl as _amber_extract_dHdl + from alchemlyb.parsing.amber import extract_u_nk as _amber_extract_u_nk + from alchemlyb.parsing.gmx import extract_dHdl as _gmx_extract_dHdl + from alchemlyb.parsing.gmx import extract_u_nk as _gmx_extract_u_nk + from alchemlyb.preprocessing.subsampling import ( + equilibrium_detection as _equilibrium_detection, + ) + from alchemlyb.preprocessing.subsampling import ( + statistical_inefficiency as _statistical_inefficiency, + ) + from alchemlyb.preprocessing.subsampling import slicing as _slicing + from alchemlyb.preprocessing.subsampling import decorrelate_u_nk, decorrelate_dhdl + from alchemlyb.postprocessors.units import to_kcalmol as _to_kcalmol + from alchemlyb.postprocessors.units import kJ2kcal as _kJ2kcal + from alchemlyb.postprocessors.units import R_kJmol as _R_kJmol + from sire.legacy.Base import getBinDir as _getBinDir from sire.legacy.Base import getShareDir as _getShareDir @@ -46,7 +77,9 @@ from sire.legacy import Mol as _SireMol from .. import _gmx_exe +from .. import _gmx_version from .. import _is_notebook +from .. import _isVerbose from .._Exceptions import AnalysisError as _AnalysisError from .._Exceptions import MissingSoftwareError as _MissingSoftwareError from .._SireWrappers import Molecules as _Molecules @@ -280,9 +313,11 @@ def __init__( raise TypeError("'property_map' must be of type 'dict'") self._property_map = property_map - # Create fake instance methods for 'analyse' and 'difference'. These - # pass instance data through to the staticmethod versions. + # Create fake instance methods for 'analyse', 'checkOverlap', + # and 'difference'. These pass instance data through to the + # staticmethod versions. self.analyse = self._analyse + self.checkOverlap = self._checkOverlap self.difference = self._difference # Initialise the process runner. @@ -425,7 +460,7 @@ def getData(self, name="data", file_link=False, work_dir=None): return zipname @staticmethod - def analyse(work_dir): + def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): """ Analyse existing free-energy data from a simulation working directory. @@ -435,6 +470,12 @@ def analyse(work_dir): work_dir : str The working directory for the simulation. + estimator : str + The estimator to use for the free-energy analysis. ("MBAR" or "TI") + + method : str + The method to use for the free-energy analysis. ("alchemlyb" or "native") + Returns ------- @@ -443,10 +484,10 @@ def analyse(work_dir): where each tuple contains the lambda value, the PMF, and the standard error. - overlap : [ [ float, float, ... ] ] + overlap : np.matrix, None The overlap matrix. This gives the overlap between each lambda - window. This parameter is only computed for the SOMD engine and - will be None when GROMACS is used. + window. This parameter is only computed when available for the + specified estimator and engine, otherwise None will be returned. """ if not isinstance(work_dir, str): @@ -454,25 +495,373 @@ def analyse(work_dir): if not _os.path.isdir(work_dir): raise ValueError("'work_dir' doesn't exist!") - # First test for SOMD files. - data = _glob(work_dir + "/lambda_*/gradients.dat") + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + + if not isinstance(method, str): + raise TypeError("'method' must be of type 'str'.") + method = method.replace("-", "").upper() + + function_glob_dict = { + "SOMD": (Relative._analyse_somd, "/lambda_*/simfile.dat"), + "GROMACS": (Relative._analyse_gromacs, "/lambda_*/gromacs.xvg"), + "AMBER": (Relative._analyse_amber, "/lambda_*/amber.out"), + } + + for engine, (func, mask) in function_glob_dict.items(): + data = _glob(work_dir + mask) + if data and engine == "AMBER": + if method != "ALCHEMLYB": + raise _AnalysisError( + f"AMBER can only use the 'alchemlyb' analysis method." + ) + if data and engine == "SOMD" and estimator == "TI" and method == "native": + raise _AnalysisError( + "SOMD cannot use 'TI' estimator with 'native' analysis method." + ) + if data and engine == "GROMACS" and method == "native": + _warnings.warn( + "Native GROMACS analysis cannot do MBAR/TI. BAR will be used." + ) + if data: + return func(work_dir, estimator, method, **kwargs) - # SOMD. - if len(data) > 0: - return Relative._analyse_somd(work_dir) + raise ValueError("Couldn't find any SOMD, GROMACS or AMBER free-energy output?") + + def _analyse(self, estimator="MBAR"): + """ + Analyse free-energy data for this object. + + Parameters + ---------- + + estimator : str + The estimator to use for the free-energy analysis. ("MBAR" or "TI") + + Returns + ------- - # Now check for GROMACS output. + pmf : [(float, :class:`Energy `, :class:`Energy `)] + The potential of mean force (PMF). The data is a list of tuples, + where each tuple contains the lambda value, the PMF, and the + standard error. + + overlap : np.matrix, None + The overlap matrix. This gives the overlap between each lambda + window. This parameter is only computed when available for the + specified estimator and engine, otherwise None will be returned. + """ + + # Return the result of calling the staticmethod, passing in the working + # directory of this object and the specified estimator. + return Relative.analyse(str(self._work_dir), estimator=estimator) + + @staticmethod + def _somd_extract(simfile, T=None, estimator="MBAR"): + """ + Extract required data from a SOMD output file (simfile.dat). + + Parameters + ---------- + + simfile : str + Path to the simfile.dat file. + + T : float + Temperature in Kelvin at which the simulations were performed; + needed to generated the reduced potential (in units of kT). + + estimator : str + The estimator that the returned data will be used with. This can + be either 'MBAR' or 'TI'. + + Returns + ------- + + data : pandas.DataFrame + Either: Reduced potential for each alchemical state (k) for each + frame (n) for MBAR, or dH/dl as a function of time for this lambda + window for TI. + """ + + if not isinstance(simfile, str): + raise TypeError("'simfile' must be of type 'str'.") + if not _os.path.isfile(simfile): + raise ValueError("'simfile' doesn't exist!") + + if not isinstance(T, float): + raise TypeError("'T' must be of type 'float'.") + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + if estimator.replace(" ", "").upper() not in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + # Flag that we're using MBAR. + if estimator == "MBAR": + is_mbar = True else: - data = _glob(work_dir + "/lambda_*/gromacs.xvg") - if len(data) == 0: - raise ValueError( - "Couldn't find any SOMD or GROMACS free-energy output?" + is_mbar = False + + # For dhdl need to consider the temperature, as the gradient is in + # kcal/mol in the simfile.dat . + if is_mbar: + k_b = _R_kJmol * _kJ2kcal + beta = 1 / (k_b * T) + + # Flags to check if we have found the required records. + found_lambda = False + found_array = False + found_time = False + + # Process the file. + with open(simfile, "r") as f: + # Terms to search for in the record lines. + start_w = "#Generating lambda is" + start_a = "#Alchemical array is" + start_t = " and " + end_t = " ps" + # Read the file line-by-line. + for line in f.readlines(): + if start_w in line: + lambda_win = float(line.replace(start_w, "").strip()) + if lambda_win is not None: + found_lambda = True + if start_a in line: + lambda_array = ( + (line.replace(start_a, "")) + .strip() + .replace("(", "") + .replace(")", "") + .replace(" ", "") + ).split(",") + lambda_array = [float(lam) for lam in lambda_array] + if lambda_array is not None: + found_array = True + if start_t and end_t in line: + sim_length = float( + ((line.split(start_t)[1]).split(end_t)[0]).strip() + ) + if sim_length is not None: + found_time = True + # All records found, break the loop. + if found_lambda: + if found_array: + if found_time: + break + + if not found_lambda: + raise ValueError( + f"The lambda window was not detected in the SOMD output file: {file}" + ) + + if not found_array: + raise ValueError( + f"The lambda array was not detected in the SOMD output file: {file}" + ) + + if not found_time: + raise ValueError( + f"The simulation time was not detected in the SOMD output file: {file}" + ) + + # TODO: get header from the file instead of like this. + header = [ + "step", + "potential_kcal/mol", + "gradient_kcal/mol", + "forward_Metropolis", + "backward_Metropolis", + ] + header.extend(lambda_array) + + file_df = _pd.read_fwf( + simfile, skipinitialspace=True, skiprows=13, header=None, names=header + ) + + time_step = sim_length / len(file_df["step"]) + time_rows = _np.arange(0, len(file_df["step"]), 1) + time = _np.arange(0, sim_length, time_step) + + # For MBAR, results in list of lists where each list is the 0 to 1 + # window values that lambda value. For TI, it is a list of gradients + # at that lambda. + results = [] + if is_mbar: + # For the energies for each lambda window, append the kt to the + # data list of values for all lambda windows. + for t in time_rows: + row = file_df.loc[t][lambda_array].to_numpy() + E_ref = row[lambda_array.index(lambda_win)] + energies = [] + for lam in lambda_array: + E_ = row[lambda_array.index(lam)] + energies.append((E_ - E_ref)) + results.append(energies) + else: + gradient = file_df.loc[t]["gradient_kcal/mol"] + red_gradient = gradient * beta + results.append(red_gradient) + + # Turn into a dataframe that can be processed by alchemlyb. + if is_mbar: + with open("poo.pkl", "wb") as f: + pickle.dump(poo, f) + df = _pd.DataFrame( + results, + columns=_np.array(lambda_array, dtype=_np.float64), + index=_pd.MultiIndex.from_arrays( + [time, _np.repeat(lambda_win, len(time))], names=["time", "lambdas"] + ), + ) + else: + df = _pd.DataFrame( + results, + columns=["fep"], + index=_pd.MultiIndex.from_arrays( + [time, _np.repeat(lambda_win, len(time))], + names=["time", "fep-lambdas"], + ), + ) + df.attrs["temperature"] = T + df.attrs["energy_unit"] = "kT" + + return df + + @staticmethod + def _preprocess_data(data, estimator, **kwargs): + """ + Preprocess FEP simulation data. + + Parameters + ---------- + + data : list + List of extracted dHdl or u_nk data. + + estimator : str + The estimator ('MBAR' or 'TI') used. + + Returns + ------- + + processed_data : pandas.DataFrame + Dataframe of dHdl or u_nk data processed using automated equilibration + detection followed by statistical inefficiency. + """ + + if not isinstance(data, list): + raise TypeError("'data' must be of type 'list'.") + if not all(isinstance(x, float) for x in data): + raise TypeError("'data' must be a list of 'float' types.") + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + if not estimator.replace(" ", "").upper() in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + # Assign defaults in case not passed via kwargs. + auto_eq = False + stat_ineff = False + truncate = False + truncate_keep = "start" + + # Parse kwargs. + for key, value in kwargs.items(): + key = key.replace(" ", "").replace("_", "").upper() + if key == "AUTOEQUILIBRATION": + auto_eq = value + if key == "STATISTICALINEFFICIENCY": + stat_ineff = value + if key == "TRUNCATEPERCENTAGE": + truncate = value + if key == "TRUNCATEKEEP": + truncate_keep = value + + # First truncate data. + raw_data = data + if truncate: + # Get the upper and lower bounds for truncate. + data_len = len(data[0]) + data_step = round((data[0].index[-1][0] - data[0].index[-2][0]), 1) + data_kept = data_len * (truncate / 100) + data_time = data_kept * data_step + if truncate_keep == "start": + truncate_lower = 0 + truncate_upper = data_time - data_step + if truncate_keep == "end": + truncate_lower = (data_len * data_step) - data_time + truncate_upper = (data_len * data_step) - data_step + + try: + data = [ + _slicing(i, lower=truncate_lower, upper=truncate_upper) + for i in raw_data + ] + except: + _warnings.warn("Could not truncate data.") + data = raw_data + else: + data = raw_data + + # If using auto equilibration, do we want to remove burn in? + # Perform a statistical inefficiency check afterwards. + if stat_ineff: + if estimator == "MBAR": + decorrelated_data = [ + decorrelate_u_nk(i, method="dE", remove_burnin=auto_eq) + for i in data + ] + + elif estimator == "TI": + decorrelated_data = [ + decorrelate_dhdl(i, remove_burnin=auto_eq) for i in data + ] + + sampled_data = decorrelated_data + + for i in decorrelated_data: + if len(i.iloc[:, 0]) < 50: + _warnings.warn( + "Less than 50 samples as a result of preprocessing. No preprocessing will be performed." + ) + sampled_data = data + else: + # Need stats ineff for auto eq to run as well. + if auto_eq: + _warnings.warn( + "Auto equilibration can only be detected if statistical inefficiency is run as well." ) - return Relative._analyse_gromacs(work_dir) + sampled_data = data + + # Concatanate in alchemlyb format. + processed_data = _alchemlyb.concat(sampled_data) + + return processed_data - def _analyse(self): + @staticmethod + def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs): """ - Analyse free-energy data for this object. + Analyse existing free-energy data using MBAR and the alchemlyb library. + + Parameters + ---------- + + files : list + List of files for all lambda values to analyse. Should be sorted. + + temperatures : list + List of temperatures at which the simulation was carried out at for each lambda window. + Index of the temperature value should match it's corresponding lambda window index in files. + + lambdas : list + Sorted list of lambda values used for the simulation. + + engine : str + Engine with which the simulation was run. + + estimator : str + The estimator to use for the analysis. Options are "MBAR" or "TI". Returns ------- @@ -482,20 +871,134 @@ def _analyse(self): where each tuple contains the lambda value, the PMF, and the standard error. - overlap : [ [ float, float, ... ] ] + overlap : numpy.matrix, None The overlap matrix. This gives the overlap between each lambda - window. This parameter is only computed for the SOMD engine and - will be None when GROMACS is used. + window. Returns None if overlap isn't supported for the chosen + estimator or engine. """ - # Return the result of calling the staticmethod, passing in the working - # directory of this object. - return Relative.analyse(str(self._work_dir)) + if not isinstance(files, (tuple, list)): + raise TypeError("'files' must be of type 'list' or 'tuple'.") + if not all(isinstance(x, str) for x in files): + raise TypeError("'files' must be a list of 'str' types.") + + if not isinstance(temperatures, (tuple, list)): + raise TypeError("'temperatures' must be of type 'list' or 'tuple'.") + if not all(isinstance(x, float) for x in temperatures): + raise TypeError("'temperatures' must be a list of 'float' types.") + + if not isinstance(lambdas, (tuple, list)): + raise TypeError("'lambdas' must be of type 'list' or 'tuple'.") + if not all(isinstance(x, float) for x in lambdas): + raise TypeError("'lambdas' must be a list of 'float' types.") + + if not isinstance(engine, str): + raise TypeError("'engine' must be of type 'str'.") + if not engine.replace(" ", "").upper() in Relative._engines: + raise ValueError( + f"Unsupported engine '{engine}'. Options are: {', '.join(Relative._engines)}" + ) + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + estimator = estimator.replace(" ", "").upper() + if not estimator in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + if estimator == "MBAR": + is_mbar = True + else: + is_mbar = False + + from functools import partial + + function_glob_dict = { + "SOMD": ( + staticmethod(partial(Relative._somd_extract, estimator=estimator)) + ), + "GROMACS": (_gmx_extract_u_nk), + "AMBER": (_amber_extract_u_nk), + } + + # Extract the data. + func = function_glob_dict[engine] + try: + data = [func(x, T=t) for x, t in zip(files, temperatures)] + except Exception as e: + msg = "Could not extract the data from the provided files!" + if _isVerbose(): + raise _AnalysisError(msg) from e + else: + raise _AnalysisError(msg) from None + + # Preprocess the data. + try: + processed_data = Relative._preprocess_data(data, estimator, **kwargs) + except: + _warnings.warn("Could not preprocess the data!") + processed_data = _alchemlyb.concat(data) + + mbar_method = None + if is_mbar: + # Check kwargs incase there is an mbar_method and then use this + for key, value in kwargs.items(): + key = key.replace(" ", "").replace("_", "").upper() + if key == "MBARMETHOD": + mbar_method = value + + # MBAR analysis using a specified method. + if mbar_method: + try: + alchem = _AutoMBAR(method=mbar_method) + alchem.fit(processed_data) + except Exception as e: + msg = f"MBAR free-energy analysis failed with {mbar_method} as mbar_method!" + if _isVerbose(): + raise _AnalysisError(msg) from e + else: + raise _AnalysisError(msg) from None + # Standard analysis. + else: + try: + if is_mbar: + alchem = _AutoMBAR().fit(processed_data) + else: + alchem = _TI().fit(processed_data) + except Exception as e: + msg = f"{estimator} free-energy analysis failed!" + if _isVerbose(): + raise _AnalysisError(msg) from e + else: + raise _AnalysisError(msg) from None + + # Extract the data from the results. + data = [] + # Convert the data frames to kcal/mol. + delta_f_ = _to_kcalmol(alchem.delta_f_) + d_delta_f_ = _to_kcalmol(alchem.d_delta_f_) + for lambda_, t in zip(lambdas, temperatures): + x = lambdas.index(lambda_) + mbar_value = delta_f_.iloc[0, x] + mbar_error = d_delta_f_.iloc[0, x] + + # Append the data. + data.append( + ( + lambda_, + (mbar_value) * _Units.Energy.kcal_per_mol, + (mbar_error) * _Units.Energy.kcal_per_mol, + ) + ) + + if is_mbar: + return (data, alchem.overlap_matrix) + else: + return (data, None) @staticmethod - def _analyse_gromacs(work_dir=None): + def _analyse_amber(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs): """ - Analyse the GROMACS free energy data. + Analyse the AMBER free energy data. Parameters ---------- @@ -503,6 +1006,12 @@ def _analyse_gromacs(work_dir=None): work_dir : str The path to the working directory. + estimator : str + The estimator ('MBAR' or 'TI') used. + + method : str + The method used to analyse the data. Options are "alchemlyb" or "native". + Returns ------- @@ -510,89 +1019,226 @@ def _analyse_gromacs(work_dir=None): The potential of mean force (PMF). The data is a list of tuples, where each tuple contains the lambda value, the PMF, and the standard error. + + overlap or dHdl : numpy.matrix or alchemlyb.estimators.ti_.TI + For MBAR, this returns the overlap matrix for the overlap between + each lambda window. For TI, this returns None. """ - if not isinstance(work_dir, str): + if type(work_dir) is not str: raise TypeError("'work_dir' must be of type 'str'.") if not _os.path.isdir(work_dir): raise ValueError("'work_dir' doesn't exist!") - # Create the command. - xvg_files = _glob(f"{work_dir}/lambda_*/*.xvg") - command = "%s bar -f %s -o %s/bar.xvg" % ( - _gmx_exe, - " ".join(xvg_files), - work_dir, - ) + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + if estimator.replace(" ", "").upper() not in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + if not isinstance(method, str): + raise TypeError("'method' must be of type 'str'.") + if method.replace(" ", "").upper() != "ALCHEMLYB": + raise ValueError( + "Only 'alchemyb' is supported as an analysis method for AMBER." + ) - # Run the first command. - proc = _subprocess.run( - _Utils.command_split(command), - shell=False, - stdout=_subprocess.PIPE, - stderr=_subprocess.PIPE, + # Find the output files and work out the lambda windows from the directory names. + files = sorted(_glob(work_dir + "/lambda_*/amber.out")) + lambdas = [float(x.split("/")[-2].split("_")[-1]) for x in files] + + # Find the temperature for each lambda window. + temperatures = [] + for file, lambda_ in zip(files, lambdas): + found_temperature = False + with open(file) as f: + for line in f.readlines(): + if not found_temperature: + match = _re.search(r"temp0=([\d.]+)", line) + if match is not None: + temperatures += [float(match.group(1))] + found_temperature = True + break + + if not found_temperature: + raise ValueError( + "The temperature was not detected in the AMBER output file." + ) + + if temperatures[0] != temperatures[-1]: + raise ValueError("The temperatures at the endstates don't match!") + + return Relative._analyse_internal( + files, temperatures, lambdas, "AMBER", estimator, **kwargs ) - if proc.returncode != 0: - raise _AnalysisError("GROMACS free-energy analysis failed!") - # Initialise list to hold the data. - data = [] + @staticmethod + def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs): + """ + Analyse GROMACS free energy data. - # Extract the data from the output files. + Parameters + ---------- - # First leg. - with open("%s/bar.xvg" % work_dir) as file: - # Read all of the lines into a list. - lines = [] - for line in file: - # Ignore comments and xmgrace directives. - if line[0] != "#" and line[0] != "@": - lines.append(line.rstrip()) + work_dir : str + The path to the working directory. - # Store the initial free energy reading. - data.append( - ( - 0.0, - 0.0 * _Units.Energy.kcal_per_mol, - 0.0 * _Units.Energy.kcal_per_mol, - ) + estimator : str + The estimator ('MBAR' or 'TI') used. + + method : str + The method used to analyse the data. Options are "alchemlyb" or "native". + + Returns + ------- + + pmf : [(float, :class:`Energy `, :class:`Energy `)] + The potential of mean force (PMF). The data is a list of tuples, + where each tuple contains the lambda value, the PMF, and the + standard error. + + overlap or dHdl : numpy.matrix or alchemlyb.estimators.ti_.TI + For MBAR, this returns the overlap matrix for the overlap between + each lambda window. For TI, this returns None. + """ + + if not isinstance(work_dir, str): + raise TypeError("'work_dir' must be of type 'str'.") + if not _os.path.isdir(work_dir): + raise ValueError("'work_dir' doesn't exist!") + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + if not estimator.replace(" ", "").upper() in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + if not isinstance(method, str): + raise TypeError("'method' must be of type 'str'.") + method = method.replace(" ", "").upper() + if method not in ["ALCHEMLYB", "NATIVE"]: + raise ValueError("'method' must be either 'alchemlyb' or 'native'.") + + if _gmx_version <= 2020: + _warnings.warn( + "Analysing using 'native' gmx bar and BAR as the gromacs version is older..." ) + method = "NATIVE" + + if method == "ALCHEMLYB": + # Find the output files and work out the lambda windows from the directory names. + files = sorted(_glob(work_dir + "/lambda_*/gromacs.xvg")) + lambdas = [float(x.split("/")[-2].split("_")[-1]) for x in files] + + # Find the temperature at each lambda window + temperatures = [] + for file in files: + found_temperature = False + with open(file, "r") as f: + for line in f.readlines(): + t = None + start = "T =" + end = "(K)" + if start and end in line: + t = int(((line.split(start)[1]).split(end)[0]).strip()) + temperatures.append(float(t)) + if t is not None: + found_temperature = True + break + + if not found_temperature: + raise ValueError( + f"The temperature was not detected in the GROMACS output file, {file}" + ) - # Zero the accumulated error. - total_error = 0 + if temperatures[0] != temperatures[-1]: + raise ValueError("The temperatures at the endstates don't match!") - # Zero the accumulated free energy difference. - total_freenrg = 0 + return Relative._analyse_internal( + files, temperatures, lambdas, "GROMACS", estimator, **kwargs + ) - # Process the BAR data. - for x, line in enumerate(lines): - # Extract the data from the line. - records = line.split() + # For the older gromacs versions and native use the gmx bar analysis. + elif method == "NATIVE": + _warnings.warn( + "using 'native' for GROMACS does not return an overlap/dHdl." + ) + _warnings.warn("using 'native' for GROMACS uses BAR.") + + # Create the command. + xvg_files = _glob(f"{work_dir}/lambda_*/*.xvg") + command = "%s bar -f %s -o %s/bar.xvg" % ( + _gmx_exe, + " ".join(xvg_files), + work_dir, + ) + + # Run the first command. + proc = _subprocess.run( + _Utils.command_split(command), + shell=False, + stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, + ) + if proc.returncode != 0: + raise _AnalysisError("Native GROMACS free-energy analysis failed!") - # Update the total free energy difference. - total_freenrg += float(records[1]) + # Initialise list to hold the data. + data = [] - # Extract the error. - error = float(records[2]) + # Extract the data from the output files. - # Update the accumulated error. - total_error = _math.sqrt(total_error * total_error + error * error) + # First leg. + with open("%s/bar.xvg" % work_dir) as file: + # Read all of the lines into a list. + lines = [] + for line in file: + # Ignore comments and xmgrace directives. + if line[0] != "#" and line[0] != "@": + lines.append(line.rstrip()) - # Append the data. + # Store the initial free energy reading. data.append( ( - (x + 1) / (len(lines)), - (total_freenrg * _Units.Energy.kt).kcal_per_mol(), - (total_error * _Units.Energy.kt).kcal_per_mol(), + 0.0, + 0.0 * _Units.Energy.kcal_per_mol, + 0.0 * _Units.Energy.kcal_per_mol, ) ) - return (data, None) + # Zero the accumulated error. + total_error = 0 + + # Zero the accumulated free energy difference. + total_freenrg = 0 + + # Process the BAR data. + for x, line in enumerate(lines): + # Extract the data from the line. + records = line.split() + + # Update the total free energy difference. + total_freenrg += float(records[1]) + + # Extract the error. + error = float(records[2]) + + # Update the accumulated error. + total_error = _math.sqrt(total_error * total_error + error * error) + + # Append the data. + data.append( + ( + (x + 1) / (len(lines)), + (total_freenrg * _Units.Energy.kt).kcal_per_mol(), + (total_error * _Units.Energy.kt).kcal_per_mol(), + ) + ) + + return (data, None) @staticmethod - def _analyse_somd(work_dir=None): + def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs): """ - Analyse the SOMD free energy data. + Analyse SOMD free energy data. Parameters ---------- @@ -600,6 +1246,12 @@ def _analyse_somd(work_dir=None): work_dir : str The path to the working directory. + estimator : str + The estimator ('MBAR' or 'TI') used. + + method : str + The method used to analyse the data. Options are "alchemlyb" or "native". + Returns ------- @@ -608,10 +1260,9 @@ def _analyse_somd(work_dir=None): where each tuple contains the lambda value, the PMF, and the standard error. - overlap : [ [ float, float, ... ] ] - The overlap matrix. This gives the overlap between each lambda - window. This parameter is only computed for the SOMD engine and - will be None when GROMACS is used. + overlap or dHdl : numpy.matrix or alchemlyb.estimators.ti_.TI + For MBAR, this returns the overlap matrix for the overlap between + each lambda window. For TI, this returns None. """ if not isinstance(work_dir, str): @@ -619,99 +1270,229 @@ def _analyse_somd(work_dir=None): if not _os.path.isdir(work_dir): raise ValueError("'work_dir' doesn't exist!") - # Create the command. - command = ( - "%s mbar -i %s/lambda_*/simfile.dat -o %s/mbar.txt --overlap --subsampling" - % (_analyse_freenrg, work_dir, work_dir) - ) - - # Run the first command. - proc = _subprocess.run( - _Utils.command_split(command), - shell=False, - stdout=_subprocess.PIPE, - stderr=_subprocess.PIPE, - ) - if proc.returncode != 0: - raise _AnalysisError("SOMD free-energy analysis failed!") - - # Re-run without subsampling if the subsampling has resulted in less than 50 samples. - with open("%s/mbar.txt" % work_dir) as file: - for line in file: - if ( - "#WARNING SUBSAMPLING ENERGIES RESULTED IN LESS THAN 50 SAMPLES" - in line - ): - _warnings.warn( - "Subsampling resulted in less than 50 samples, " - f"re-running without subsampling for '{work_dir}'" - ) - command = ( - "%s mbar -i %s/lambda_*/simfile.dat -o %s/mbar.txt --overlap" - % (_analyse_freenrg, work_dir, work_dir) - ) - proc = _subprocess.run( - _Utils.command_split(command), - shell=False, - stdout=_subprocess.PIPE, - stderr=_subprocess.PIPE, + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + estimator = estimator.replace(" ", "").upper() + if estimator not in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + if not isinstance(method, str): + raise TypeError("'method' must be of type 'str'.") + method = method.replace(" ", "").upper() + if not method in ["ALCHEMLYB", "native"]: + raise ValueError("'method' must be either 'alchemlyb' or 'native'.") + + if method == "ALCHEMLYB": + # Glob the data files and work out the lambda values. + files = sorted(_glob(work_dir + "/lambda_*/simfile.dat")) + lambdas = [float(x.split("/")[-2].split("_")[-1]) for x in files] + + temperatures = [] + for file in files: + found_temperature = False + with open(file, "r") as f: + for line in f.readlines(): + temp = None + start = "#Generating temperature is" + if start in line: + split_line = (line.split(start)[1]).strip().split(" ") + temp = split_line[0] + unit = split_line[-1] + if unit.upper() == "C": + temp = float(temp) + 273.15 + else: + temp = float(temp) + temperatures.append(temp) + if temp is not None: + found_temperature = True + break + + if not found_temperature: + raise ValueError( + f"The temperature was not detected in the SOMD output file: {file}" ) - if proc.returncode != 0: - raise _AnalysisError("SOMD free-energy analysis failed!") - break - # Initialise list to hold the data. - data = [] + if temperatures[0] != temperatures[-1]: + raise ValueError("The temperatures at the endstates don't match!") - # Initialise list to hold the overlap matrix. - overlap = [] + return Relative._analyse_internal( + files, temperatures, lambdas, "SOMD", estimator, **kwargs + ) - # Extract the data from the output files. + elif method == "NATIVE": + # Create the command. + command = ( + "%s mbar -i %s/lambda_*/simfile.dat -o %s/mbar.txt --overlap --percent 5" + % (_analyse_freenrg, work_dir, work_dir) + ) - # First leg. - with open("%s/mbar.txt" % work_dir) as file: - # Process the MBAR data. - for line in file: - # Process the overlap matrix. - if "#Overlap matrix" in line: - # Get the next row. - row = next(file) + # Run the first command. + proc = _subprocess.run( + _Utils.command_split(command), + shell=False, + stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, + ) + if proc.returncode != 0: + raise _AnalysisError("Native SOMD free-energy analysis failed!") + + # Re-run without subsampling if the subsampling has resulted in less than 50 samples. + with open("%s/mbar.txt" % work_dir) as file: + for line in file: + if ( + "#WARNING SUBSAMPLING ENERGIES RESULTED IN LESS THAN 50 SAMPLES" + in line + ): + _warnings.warn( + "Subsampling resulted in less than 50 samples, " + f"re-running without subsampling for '{work_dir}'" + ) + command = ( + "%s mbar -i %s/lambda_*/simfile.dat -o %s/mbar.txt --overlap" + % (_analyse_freenrg, work_dir, work_dir) + ) + proc = _subprocess.run( + _shlex.split(command), + shell=False, + stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, + ) + if proc.returncode != 0: + raise _AnalysisError("SOMD free-energy analysis failed!") + break - # Loop until we hit the next section. - while not row.startswith("#DG"): - # Extract the data for this row. - records = [float(x) for x in row.split()] + # Initialise list to hold the data. + data = [] - # Append to the overlap matrix. - overlap.append(records) + # Initialise list to hold the overlap matrix. + overlap = [] - # Get the next line. + # Extract the data from the output files. + + # First leg. + with open("%s/mbar.txt" % work_dir) as file: + # Process the MBAR data. + for line in file: + # Process the overlap matrix. + if "#Overlap matrix" in line: + # Get the next row. row = next(file) - # Process the PMF. - elif "PMF from MBAR" in line: - # Get the next row. - row = next(file) - - # Loop until we hit the next section. - while not row.startswith("#TI"): - # Split the line. - records = row.split() - - # Append the data. - data.append( - ( - float(records[0]), - float(records[1]) * _Units.Energy.kcal_per_mol, - float(records[2]) * _Units.Energy.kcal_per_mol, - ) - ) + # Loop until we hit the next section. + while not row.startswith("#DG"): + # Extract the data for this row. + records = [float(x) for x in row.split()] + + # Append to the overlap matrix. + overlap.append(records) + + # Get the next line. + row = next(file) - # Get the next line. + # Process the PMF. + elif "PMF from MBAR" in line: + # Get the next row. row = next(file) + # Loop until we hit the next section. + while not row.startswith("#TI"): + # Split the line. + records = row.split() + + # Append the data. + data.append( + ( + float(records[0]), + float(records[1]) * _Units.Energy.kcal_per_mol, + float(records[2]) * _Units.Energy.kcal_per_mol, + ) + ) + + # Get the next line. + row = next(file) + return (data, overlap) + @staticmethod + def checkOverlap(overlap, threshold=0.03): + """ + Check the overlap of an FEP leg. + + Parameters + ---------- + + overlap : numpy.matrix + The overlap matrix. This gives the overlap between lambda windows. + + threshold : float + The threshold value used to check the off-diagonals. + + Returns + ------- + + is_okay : boolean + True if the overlap is okay, False if any off-diagonals are less + than the threshold value. + + num_low : int + The number of off-diagonals that are less than the threshold value. + + """ + if not isinstance(overlap, _np.matrix): + raise TypeError("'overlap' must be of type 'numpy.matrix'.") + + if not isinstance(threshold, float): + raise TypeError("'threshold' must be of type 'float'.") + + if threshold < 0.0 or threshold > 1.0: + raise ValueError("'threshold' must be between 0 and 1.") + + # Get all off diagonal elements. + off_diagonal = (_np.diagonal(overlap, 1)).tolist() + for x in (_np.diagonal(overlap, -1)).tolist(): + off_diagonal.append(x) + + # Check if any off diagonals are less than the threshold value. + num_low = 0 + is_okay = False + for od in off_diagonal: + if od < threshold: + num_low += 1 + if num_low > 0: + _warnings.warn( + f"Overlap is poor: {num_low} off-diagonals are less than {threshold}." + ) + else: + is_okay = True + + return is_okay, num_low + + def _checkOverlap(self, estimator="MBAR"): + """ + Check the overlap of an FEP simulation leg. + + Parameters + ---------- + + estimator : str + The estimator used for the free-energy analysis. ("MBAR" or "TI") + + Returns + ------- + + overlap_okay : boolean + True if the overlap is okay, False if any off-diagonals are less than 0.03. + + """ + + # Calculate the overlap for this object. + _, overlap = self.analyse(estimator=estimator) + + if overlap: + return Relative.checkOverlap(overlap, estimator=self._estimator) + else: + raise ValueError("Overlap matrix isn't supported for this estimator.") + @staticmethod def difference(pmf, pmf_ref): """ From 8d98b6a3d3f14ddff7174d5f439fe9cdc302c874 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:21:49 +0100 Subject: [PATCH 02/50] Assert alchemlyb is imported when used as analysis method. --- python/BioSimSpace/FreeEnergy/_relative.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 715093ed0..aa0f94476 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -502,6 +502,9 @@ def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): raise TypeError("'method' must be of type 'str'.") method = method.replace("-", "").upper() + if method == "ALCHEMLYB": + _assert_imported("alchemlyb") + function_glob_dict = { "SOMD": (Relative._analyse_somd, "/lambda_*/simfile.dat"), "GROMACS": (Relative._analyse_gromacs, "/lambda_*/gromacs.xvg"), From 6f84111c9b821a39b57d4cd1434fcd6d03e55351 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:23:15 +0100 Subject: [PATCH 03/50] Use internal command_split utility function. --- python/BioSimSpace/FreeEnergy/_relative.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index aa0f94476..2a715f839 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -33,7 +33,6 @@ import numpy as _np import os as _os import pandas as _pd -import shlex as _shlex import shutil as _shutil import subprocess as _subprocess import sys as _sys @@ -1355,7 +1354,7 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) % (_analyse_freenrg, work_dir, work_dir) ) proc = _subprocess.run( - _shlex.split(command), + _Util.command_split(command), shell=False, stdout=_subprocess.PIPE, stderr=_subprocess.PIPE, From a623f59f4b08d55e8d33191deca3187be12f25aa Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:27:17 +0100 Subject: [PATCH 04/50] Expose threshold to instance checkOverlap method. --- python/BioSimSpace/FreeEnergy/_relative.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 2a715f839..08f36a307 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1445,7 +1445,6 @@ def checkOverlap(overlap, threshold=0.03): if not isinstance(threshold, float): raise TypeError("'threshold' must be of type 'float'.") - if threshold < 0.0 or threshold > 1.0: raise ValueError("'threshold' must be between 0 and 1.") @@ -1469,7 +1468,7 @@ def checkOverlap(overlap, threshold=0.03): return is_okay, num_low - def _checkOverlap(self, estimator="MBAR"): + def _checkOverlap(self, estimator="MBAR", threshold=0.03): """ Check the overlap of an FEP simulation leg. @@ -1479,19 +1478,25 @@ def _checkOverlap(self, estimator="MBAR"): estimator : str The estimator used for the free-energy analysis. ("MBAR" or "TI") + threshold : float + The threshold value used to check the off-diagonals. + Returns ------- - overlap_okay : boolean - True if the overlap is okay, False if any off-diagonals are less than 0.03. + is_okay : boolean + True if the overlap is okay, False if any off-diagonals are less + than the threshold value. + num_low : int + The number of off-diagonals that are less than the threshold value. """ # Calculate the overlap for this object. _, overlap = self.analyse(estimator=estimator) if overlap: - return Relative.checkOverlap(overlap, estimator=self._estimator) + return Relative.checkOverlap(overlap, threshold=threshold) else: raise ValueError("Overlap matrix isn't supported for this estimator.") From aaf4f63cf4245afd7819cbf07543af169a2c6176 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:44:54 +0100 Subject: [PATCH 05/50] Remove debug pickle file. --- python/BioSimSpace/FreeEnergy/_relative.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 08f36a307..b982c51a0 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -707,8 +707,6 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): # Turn into a dataframe that can be processed by alchemlyb. if is_mbar: - with open("poo.pkl", "wb") as f: - pickle.dump(poo, f) df = _pd.DataFrame( results, columns=_np.array(lambda_array, dtype=_np.float64), From 5742d6759f61505375da623adacc4b77c64494f6 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:51:09 +0100 Subject: [PATCH 06/50] Fix indentation of conditional block. --- python/BioSimSpace/FreeEnergy/_relative.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index b982c51a0..a4350f7a2 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -700,10 +700,10 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): E_ = row[lambda_array.index(lam)] energies.append((E_ - E_ref)) results.append(energies) - else: - gradient = file_df.loc[t]["gradient_kcal/mol"] - red_gradient = gradient * beta - results.append(red_gradient) + else: + gradient = file_df.loc[t]["gradient_kcal/mol"] + red_gradient = gradient * beta + results.append(red_gradient) # Turn into a dataframe that can be processed by alchemlyb. if is_mbar: From 06b396498f728f70cd409c60692bf566d13b6673 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:51:23 +0100 Subject: [PATCH 07/50] Fix case in string comparison. --- python/BioSimSpace/FreeEnergy/_relative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index a4350f7a2..ffce415c8 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1279,7 +1279,7 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) if not isinstance(method, str): raise TypeError("'method' must be of type 'str'.") method = method.replace(" ", "").upper() - if not method in ["ALCHEMLYB", "native"]: + if not method in ["ALCHEMLYB", "NATIVE"]: raise ValueError("'method' must be either 'alchemlyb' or 'native'.") if method == "ALCHEMLYB": From 8a231977e857e942325339adb2d7bad46c305482 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 15:55:43 +0100 Subject: [PATCH 08/50] Fix extraction of SOMD TI data. --- python/BioSimSpace/FreeEnergy/_relative.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index ffce415c8..aaceae7c3 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -607,7 +607,7 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): # For dhdl need to consider the temperature, as the gradient is in # kcal/mol in the simfile.dat . - if is_mbar: + if not is_mbar: k_b = _R_kJmol * _kJ2kcal beta = 1 / (k_b * T) @@ -701,9 +701,10 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): energies.append((E_ - E_ref)) results.append(energies) else: - gradient = file_df.loc[t]["gradient_kcal/mol"] - red_gradient = gradient * beta - results.append(red_gradient) + for t in time_rows: + gradient = file_df.loc[t]["gradient_kcal/mol"] + red_gradient = gradient * beta + results.append(red_gradient) # Turn into a dataframe that can be processed by alchemlyb. if is_mbar: From 70c10bc2d8a5206ed8e8348425d4037678a441ef Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 16:12:48 +0100 Subject: [PATCH 09/50] Handle different data type for alchemlyb 2+. --- python/BioSimSpace/FreeEnergy/_relative.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index aaceae7c3..8c15a6ef8 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -751,10 +751,8 @@ def _preprocess_data(data, estimator, **kwargs): detection followed by statistical inefficiency. """ - if not isinstance(data, list): - raise TypeError("'data' must be of type 'list'.") - if not all(isinstance(x, float) for x in data): - raise TypeError("'data' must be a list of 'float' types.") + if not isinstance(data, (list, _pd.DataFrame)): + raise TypeError("'data' must be of type 'list' or 'pandas.DataFrame'.") if not isinstance(estimator, str): raise TypeError("'estimator' must be of type 'str'.") From b906f700293af970cb75c4ac725c7daa6b390c47 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 16:32:58 +0100 Subject: [PATCH 10/50] Fix use of alchemlyb extract functions following refactor. --- python/BioSimSpace/FreeEnergy/_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 8c15a6ef8..75ba2df2a 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -915,8 +915,8 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) "SOMD": ( staticmethod(partial(Relative._somd_extract, estimator=estimator)) ), - "GROMACS": (_gmx_extract_u_nk), - "AMBER": (_amber_extract_u_nk), + "GROMACS": (_gmx_extract_u_nk if is_mbar else _gmx_extract_dHdl), + "AMBER": (_amber_extract_u_nk if is_mbar else _amber_extract_dHdl), } # Extract the data. From 53d8fe24f14aa37395aee7bd4983fd68dad2508e Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 10 Aug 2023 16:33:59 +0100 Subject: [PATCH 11/50] Grammar tweak. --- python/BioSimSpace/FreeEnergy/_relative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 75ba2df2a..c7dd660e1 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -41,7 +41,7 @@ from .._Utils import _assert_imported, _have_imported, _try_import -# alchemlyb isn't available on all variants of Python that we support, so we +# alchemlyb isn't available for all variants of Python that we support, so we # need to try_import it. _alchemlyb = _try_import("alchemlyb") From e757cad1bca7ecf433e59f6994ce04db55e13273 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 09:03:14 +0100 Subject: [PATCH 12/50] Reorder member functions. --- python/BioSimSpace/FreeEnergy/_relative.py | 294 ++++++++++----------- 1 file changed, 147 insertions(+), 147 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index c7dd660e1..f6ee82cdb 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -530,6 +530,153 @@ def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): raise ValueError("Couldn't find any SOMD, GROMACS or AMBER free-energy output?") + @staticmethod + def checkOverlap(overlap, threshold=0.03): + """ + Check the overlap of an FEP leg. + + Parameters + ---------- + + overlap : numpy.matrix + The overlap matrix. This gives the overlap between lambda windows. + + threshold : float + The threshold value used to check the off-diagonals. + + Returns + ------- + + is_okay : boolean + True if the overlap is okay, False if any off-diagonals are less + than the threshold value. + + num_low : int + The number of off-diagonals that are less than the threshold value. + + """ + if not isinstance(overlap, _np.matrix): + raise TypeError("'overlap' must be of type 'numpy.matrix'.") + + if not isinstance(threshold, float): + raise TypeError("'threshold' must be of type 'float'.") + if threshold < 0.0 or threshold > 1.0: + raise ValueError("'threshold' must be between 0 and 1.") + + # Get all off diagonal elements. + off_diagonal = (_np.diagonal(overlap, 1)).tolist() + for x in (_np.diagonal(overlap, -1)).tolist(): + off_diagonal.append(x) + + # Check if any off diagonals are less than the threshold value. + num_low = 0 + is_okay = False + for od in off_diagonal: + if od < threshold: + num_low += 1 + if num_low > 0: + _warnings.warn( + f"Overlap is poor: {num_low} off-diagonals are less than {threshold}." + ) + else: + is_okay = True + + return is_okay, num_low + + @staticmethod + def difference(pmf, pmf_ref): + """ + Compute the relative free-energy difference between two perturbation + legs. + + Parameters + ---------- + + pmf : [(float, :class:`Energy `, :class:`Energy `)] + The potential of mean force (PMF) of interest. The data is a list + of tuples, where each tuple contains the lambda value, the PMF, + and the standard error. + + pmf_ref : [(float, :class:`Energy `, :class:`Energy `)] + The reference potential of mean force (PMF). The data is a list + of tuples, where each tuple contains the lambda value, the PMF, + and the standard error. + + Returns + ------- + + free_energy : (:class:`Energy `, :class:`Energy `) + The relative free-energy difference and its associated error. + """ + + if not isinstance(pmf, list): + raise TypeError("'pmf' must be of type 'list'.") + if not isinstance(pmf_ref, list): + raise TypeError("'pmf_ref' must be of type 'list'.") + + for rec in pmf: + if not isinstance(rec, tuple): + raise TypeError( + "'pmf1' must contain tuples containing lambda " + "values and the associated free-energy and error." + ) + else: + if len(rec) != 3: + raise ValueError( + "Each tuple in 'pmf1' must contain three items: " + "a lambda value and the associated free energy " + "and error." + ) + for val in rec[1:]: + if not isinstance(val, _Types.Energy): + raise TypeError( + "'pmf' must contain 'BioSimSpace.Types.Energy' types." + ) + + for rec in pmf_ref: + if not isinstance(rec, tuple): + raise TypeError( + "'pmf_ref' must contain tuples containing lambda " + "values and the associated free-energy and error." + ) + else: + if len(rec) != 3: + raise ValueError( + "Each tuple in 'pmf_ref' must contain three items: " + "a lambda value and the associated free energy " + "and error." + ) + for val in rec[1:]: + if not isinstance(val, _Types.Energy): + raise TypeError( + "'pmf_ref' must contain 'BioSimSpace.Types.Energy' types." + ) + + # Work out the difference in free energy. + free_energy = (pmf[-1][1] - pmf[0][1]) - (pmf_ref[-1][1] - pmf_ref[0][1]) + + # Propagate the errors. (These add in quadrature.) + + # Measure. + error0 = _math.sqrt( + (pmf[-1][2].value() * pmf[-1][2].value()) + + (pmf[0][2].value() * pmf[0][2].value()) + ) + + # Reference. + error1 = _math.sqrt( + (pmf_ref[-1][2].value() * pmf_ref[-1][2].value()) + + (pmf_ref[0][2].value() * pmf_ref[0][2].value()) + ) + + # Error for free-energy difference. + error = ( + _math.sqrt((error0 * error0) + (error1 * error1)) + * _Units.Energy.kcal_per_mol + ) + + return (free_energy, error) + def _analyse(self, estimator="MBAR"): """ Analyse free-energy data for this object. @@ -1412,59 +1559,6 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) return (data, overlap) - @staticmethod - def checkOverlap(overlap, threshold=0.03): - """ - Check the overlap of an FEP leg. - - Parameters - ---------- - - overlap : numpy.matrix - The overlap matrix. This gives the overlap between lambda windows. - - threshold : float - The threshold value used to check the off-diagonals. - - Returns - ------- - - is_okay : boolean - True if the overlap is okay, False if any off-diagonals are less - than the threshold value. - - num_low : int - The number of off-diagonals that are less than the threshold value. - - """ - if not isinstance(overlap, _np.matrix): - raise TypeError("'overlap' must be of type 'numpy.matrix'.") - - if not isinstance(threshold, float): - raise TypeError("'threshold' must be of type 'float'.") - if threshold < 0.0 or threshold > 1.0: - raise ValueError("'threshold' must be between 0 and 1.") - - # Get all off diagonal elements. - off_diagonal = (_np.diagonal(overlap, 1)).tolist() - for x in (_np.diagonal(overlap, -1)).tolist(): - off_diagonal.append(x) - - # Check if any off diagonals are less than the threshold value. - num_low = 0 - is_okay = False - for od in off_diagonal: - if od < threshold: - num_low += 1 - if num_low > 0: - _warnings.warn( - f"Overlap is poor: {num_low} off-diagonals are less than {threshold}." - ) - else: - is_okay = True - - return is_okay, num_low - def _checkOverlap(self, estimator="MBAR", threshold=0.03): """ Check the overlap of an FEP simulation leg. @@ -1497,100 +1591,6 @@ def _checkOverlap(self, estimator="MBAR", threshold=0.03): else: raise ValueError("Overlap matrix isn't supported for this estimator.") - @staticmethod - def difference(pmf, pmf_ref): - """ - Compute the relative free-energy difference between two perturbation - legs. - - Parameters - ---------- - - pmf : [(float, :class:`Energy `, :class:`Energy `)] - The potential of mean force (PMF) of interest. The data is a list - of tuples, where each tuple contains the lambda value, the PMF, - and the standard error. - - pmf_ref : [(float, :class:`Energy `, :class:`Energy `)] - The reference potential of mean force (PMF). The data is a list - of tuples, where each tuple contains the lambda value, the PMF, - and the standard error. - - Returns - ------- - - free_energy : (:class:`Energy `, :class:`Energy `) - The relative free-energy difference and its associated error. - """ - - if not isinstance(pmf, list): - raise TypeError("'pmf' must be of type 'list'.") - if not isinstance(pmf_ref, list): - raise TypeError("'pmf_ref' must be of type 'list'.") - - for rec in pmf: - if not isinstance(rec, tuple): - raise TypeError( - "'pmf1' must contain tuples containing lambda " - "values and the associated free-energy and error." - ) - else: - if len(rec) != 3: - raise ValueError( - "Each tuple in 'pmf1' must contain three items: " - "a lambda value and the associated free energy " - "and error." - ) - for val in rec[1:]: - if not isinstance(val, _Types.Energy): - raise TypeError( - "'pmf' must contain 'BioSimSpace.Types.Energy' types." - ) - - for rec in pmf_ref: - if not isinstance(rec, tuple): - raise TypeError( - "'pmf_ref' must contain tuples containing lambda " - "values and the associated free-energy and error." - ) - else: - if len(rec) != 3: - raise ValueError( - "Each tuple in 'pmf_ref' must contain three items: " - "a lambda value and the associated free energy " - "and error." - ) - for val in rec[1:]: - if not isinstance(val, _Types.Energy): - raise TypeError( - "'pmf_ref' must contain 'BioSimSpace.Types.Energy' types." - ) - - # Work out the difference in free energy. - free_energy = (pmf[-1][1] - pmf[0][1]) - (pmf_ref[-1][1] - pmf_ref[0][1]) - - # Propagate the errors. (These add in quadrature.) - - # Measure. - error0 = _math.sqrt( - (pmf[-1][2].value() * pmf[-1][2].value()) - + (pmf[0][2].value() * pmf[0][2].value()) - ) - - # Reference. - error1 = _math.sqrt( - (pmf_ref[-1][2].value() * pmf_ref[-1][2].value()) - + (pmf_ref[0][2].value() * pmf_ref[0][2].value()) - ) - - # Error for free-energy difference. - error = ( - _math.sqrt((error0 * error0) + (error1 * error1)) - * _Units.Energy.kcal_per_mol - ) - - return (free_energy, error) - def _difference(self, pmf_ref): """ Compute the relative free-energy difference between two perturbation From 2787f908a1c9c56a5fcdc56f4c86901fcfea17bc Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 09:30:20 +0100 Subject: [PATCH 13/50] Silence excessive pymbar warnings on startup. --- python/BioSimSpace/FreeEnergy/_relative.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index f6ee82cdb..a971eca8b 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -46,6 +46,12 @@ _alchemlyb = _try_import("alchemlyb") if _have_imported(_alchemlyb): + import logging as _logging + + # Silence pymbar warnings on startup. + _logger = _logging.getLogger("pymbar") + _logger.setLevel(_logging.ERROR) + # Handle alchemlyb MBAR API changes. try: from alchemlyb.estimators import AutoMBAR as _AutoMBAR From 79856c190ba0531ea033feb72904f3790679b366 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 10:50:27 +0100 Subject: [PATCH 14/50] Add FEP analysis unit test. --- tests/FreeEnergy/test_relative.py | 72 ++++++++++++++++++++++++++++++- tests/conftest.py | 11 ++--- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/tests/FreeEnergy/test_relative.py b/tests/FreeEnergy/test_relative.py index f93421da7..2d32fb0d9 100644 --- a/tests/FreeEnergy/test_relative.py +++ b/tests/FreeEnergy/test_relative.py @@ -1,11 +1,15 @@ +import math import pytest +import requests +import tarfile +import tempfile import BioSimSpace as BSS -from tests.conftest import url, has_gromacs +from tests.conftest import url, has_alchemlyb, has_gromacs -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def perturbable_system(): """Re-use the same perturbable system for each test.""" return BSS.IO.readPerturbableSystem( @@ -16,6 +20,45 @@ def perturbable_system(): ) +@pytest.fixture(scope="module") +def fep_output(): + """Path to a temporary directory containing FEP output.""" + + # Create URL to test test data. + fep_url = f"{url}/fep_output.tar.gz" + + # Create a temporary directory. + tmp_dir = tempfile.TemporaryDirectory() + + # Create the name of the local file. + local_file = tmp_dir.name + "/fep_output" + + # Download the test data. + req = requests.get(fep_url, stream=True) + with open(local_file + ".tar.gz", "wb") as f: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + f.flush() + + # Now extract the archive. + tar = tarfile.open(local_file + ".tar.gz") + tar.extractall(tmp_dir.name) + tar.close() + + return tmp_dir + + +@pytest.fixture(scope="module") +def expected_results(): + """A dictionary of expected FEP results.""" + + return { + "somd": {"mbar": -6.3519, "ti": -10.6027}, + "gromacs": {"mbar": -6.0238, "ti": -8.4158}, + } + + def test_setup_somd(perturbable_system): """Test setup for a relative alchemical free energy leg using SOMD.""" free_nrg = BSS.FreeEnergy.Relative(perturbable_system, engine="somd") @@ -25,3 +68,28 @@ def test_setup_somd(perturbable_system): def test_setup_gromacs(perturbable_system): """Test setup for a relative alchemical free energy leg using GROMACS.""" free_nrg = BSS.FreeEnergy.Relative(perturbable_system, engine="gromacs") + + +@pytest.mark.skipif( + has_alchemlyb is False, reason="Requires alchemlyb to be installed." +) +@pytest.mark.parametrize("engine", ["somd", "gromacs"]) +@pytest.mark.parametrize("estimator", ["mbar", "ti"]) +def test_analysis(fep_output, engine, estimator, expected_results): + """Test that the free energy analysis works as expected.""" + + # Create the paths to the data. + path_free = f"{fep_output.name}/fep_output/{engine}/free" + path_vac = f"{fep_output.name}/fep_output/{engine}/vacuum" + + # Perform the analysis. + free_nrg_free, _ = BSS.FreeEnergy.Relative.analyse(path_free, estimator=estimator) + free_nrg_vac, _ = BSS.FreeEnergy.Relative.analyse(path_vac, estimator=estimator) + + # Compute the free energy difference. + delta_g = BSS.FreeEnergy.Relative.difference(free_nrg_free, free_nrg_vac) + + # Make sure the result is close to the expected value. + assert math.isclose( + delta_g[0].value(), expected_results[engine][estimator], rel_tol=1e-4 + ) diff --git a/tests/conftest.py b/tests/conftest.py index 9c547f7c3..0942bc736 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,11 +47,6 @@ mdtraj = _try_import("mdtraj") has_mdtraj = _have_imported(mdtraj) - -# Check for MDRestraintsGenerator. -MDRestraintsGenerator = _try_import( - "MDRestraintsGenerator", - install_command="pip install MDRestraintsGenerator", -) - -has_mdrestraints_generator = _have_imported(MDRestraintsGenerator) +# Check for alchemlyb. +_alchemlyb = _try_import("alchemlyb") +has_alchemlyb = _have_imported(_alchemlyb) From 813dfba73eb6bae8be491853821fd145cfbffa7f Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 10:54:26 +0100 Subject: [PATCH 15/50] Unpin alchemlyb requirement. --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index a6cc8bfa2..ae3783e34 100644 --- a/python/setup.py +++ b/python/setup.py @@ -108,7 +108,7 @@ def clear_installed_list(): # Create a list of the conda dependencies. conda_deps = [ - "alchemlyb<2", # known not available on aarch64 + "alchemlyb", # known not available on aarch64 "configargparse", "ipywidgets<8", "kcombu_bss", From cbc244d0cf85bda771297ae9ecdd1088d41756be Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 10:54:45 +0100 Subject: [PATCH 16/50] Add alchemlyb requirement. --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2ad99fd2f..144346d7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,6 +24,6 @@ rdkit # The below are packages that aren't available on all # platforms/OSs and so need to be conditionally included -mdtraj ; platform_machine != "aarch64" # not on Linux/aarch64 - +alchemlyb ; platform_machine != "aarch64" # Needs pymbar, not on Linux/aarch64 mdanalysis ; platform_machine != "aarch64" # not on Linux/aarch64 +mdtraj ; platform_machine != "aarch64" # not on Linux/aarch64 From 22be4e15587f313049c6607d1b2e786ca32129cd Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 11:14:41 +0100 Subject: [PATCH 17/50] Change dictionary name. --- python/BioSimSpace/FreeEnergy/_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index a971eca8b..10b32aedb 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1064,7 +1064,7 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) from functools import partial - function_glob_dict = { + function_dict = { "SOMD": ( staticmethod(partial(Relative._somd_extract, estimator=estimator)) ), @@ -1073,7 +1073,7 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) } # Extract the data. - func = function_glob_dict[engine] + func = function_dict[engine] try: data = [func(x, T=t) for x, t in zip(files, temperatures)] except Exception as e: From 68a13d49a4e4db88d0c246b9b35faaeba340025c Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 11:22:33 +0100 Subject: [PATCH 18/50] Formatting tweak. --- tests/FreeEnergy/test_relative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/FreeEnergy/test_relative.py b/tests/FreeEnergy/test_relative.py index 2d32fb0d9..fc82f92d1 100644 --- a/tests/FreeEnergy/test_relative.py +++ b/tests/FreeEnergy/test_relative.py @@ -86,7 +86,7 @@ def test_analysis(fep_output, engine, estimator, expected_results): free_nrg_free, _ = BSS.FreeEnergy.Relative.analyse(path_free, estimator=estimator) free_nrg_vac, _ = BSS.FreeEnergy.Relative.analyse(path_vac, estimator=estimator) - # Compute the free energy difference. + # Compute the free-energy difference. delta_g = BSS.FreeEnergy.Relative.difference(free_nrg_free, free_nrg_vac) # Make sure the result is close to the expected value. From 9f69f2a8b5909f6122bd1eb498550e3192a6bedc Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 12:21:17 +0100 Subject: [PATCH 19/50] Remove GROMACS version check since meaningless in a staticmethod. If this is important, then we can just raise a meaningful exception if the alchemlyb analysis fails. We should be able to determine the version using the header from the XVG file. In fact, the test files were generated with GROMACS 2018, but still work fine with alchemlyb, so the check doesn't appear to be necessary. --- python/BioSimSpace/FreeEnergy/_relative.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 10b32aedb..69287eb1f 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -82,7 +82,6 @@ from sire.legacy import Mol as _SireMol from .. import _gmx_exe -from .. import _gmx_version from .. import _is_notebook from .. import _isVerbose from .._Exceptions import AnalysisError as _AnalysisError @@ -1269,12 +1268,6 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar if method not in ["ALCHEMLYB", "NATIVE"]: raise ValueError("'method' must be either 'alchemlyb' or 'native'.") - if _gmx_version <= 2020: - _warnings.warn( - "Analysing using 'native' gmx bar and BAR as the gromacs version is older..." - ) - method = "NATIVE" - if method == "ALCHEMLYB": # Find the output files and work out the lambda windows from the directory names. files = sorted(_glob(work_dir + "/lambda_*/gromacs.xvg")) From b05f707469ba2cb4fe145b93819c0d2119942c76 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 12:33:28 +0100 Subject: [PATCH 20/50] Check GROMACS is installed when performing native analysis. --- python/BioSimSpace/FreeEnergy/_relative.py | 33 +++++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 69287eb1f..39a618f20 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1303,12 +1303,17 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar # For the older gromacs versions and native use the gmx bar analysis. elif method == "NATIVE": + if _gmx_exe is None: + raise _MissingSoftwareError( + "Cannot use native gmx bar analysis as GROMACS is not installed!" + ) + _warnings.warn( - "using 'native' for GROMACS does not return an overlap/dHdl." + "using 'native' for gromacs does not return an overlap/dhdl." ) - _warnings.warn("using 'native' for GROMACS uses BAR.") + _warnings.warn("using 'native' for gromacs uses bar.") - # Create the command. + # create the command. xvg_files = _glob(f"{work_dir}/lambda_*/*.xvg") command = "%s bar -f %s -o %s/bar.xvg" % ( _gmx_exe, @@ -1316,27 +1321,27 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar work_dir, ) - # Run the first command. + # run the first command. proc = _subprocess.run( - _Utils.command_split(command), - shell=False, - stdout=_subprocess.PIPE, - stderr=_subprocess.PIPE, + _utils.command_split(command), + shell=false, + stdout=_subprocess.pipe, + stderr=_subprocess.pipe, ) if proc.returncode != 0: - raise _AnalysisError("Native GROMACS free-energy analysis failed!") + raise _analysiserror("native gromacs free-energy analysis failed!") - # Initialise list to hold the data. + # initialise list to hold the data. data = [] - # Extract the data from the output files. + # extract the data from the output files. - # First leg. + # first leg. with open("%s/bar.xvg" % work_dir) as file: - # Read all of the lines into a list. + # read all of the lines into a list. lines = [] for line in file: - # Ignore comments and xmgrace directives. + # ignore comments and xmgrace directives. if line[0] != "#" and line[0] != "@": lines.append(line.rstrip()) From 1849ee4a3cf08063d412f4fd01a543261da167ed Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 12:41:20 +0100 Subject: [PATCH 21/50] Fix ridiculous Copilot case adjustments. (What was it doing?) --- python/BioSimSpace/FreeEnergy/_relative.py | 30 ++++++++++------------ 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 39a618f20..ea4651480 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1309,11 +1309,11 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar ) _warnings.warn( - "using 'native' for gromacs does not return an overlap/dhdl." + "using 'native' for GROMACS does not return an overlap/dHdl." ) - _warnings.warn("using 'native' for gromacs uses bar.") + _warnings.warn("using 'native' for GROMACS uses BAR.") - # create the command. + # Create the command. xvg_files = _glob(f"{work_dir}/lambda_*/*.xvg") command = "%s bar -f %s -o %s/bar.xvg" % ( _gmx_exe, @@ -1321,27 +1321,25 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar work_dir, ) - # run the first command. + # Run the command. proc = _subprocess.run( - _utils.command_split(command), - shell=false, - stdout=_subprocess.pipe, - stderr=_subprocess.pipe, + _Utils.command_split(command), + shell=False, + stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, ) if proc.returncode != 0: - raise _analysiserror("native gromacs free-energy analysis failed!") + raise _AnalysisError("native GROMACS free-energy analysis failed!") - # initialise list to hold the data. + # Initialise list to hold the data. data = [] - # extract the data from the output files. - - # first leg. + # Extract the data from the output files. with open("%s/bar.xvg" % work_dir) as file: - # read all of the lines into a list. + # Read all of the lines into a list. lines = [] for line in file: - # ignore comments and xmgrace directives. + # Ignore comments and xmgrace directives. if line[0] != "#" and line[0] != "@": lines.append(line.rstrip()) @@ -1518,8 +1516,6 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) overlap = [] # Extract the data from the output files. - - # First leg. with open("%s/mbar.txt" % work_dir) as file: # Process the MBAR data. for line in file: From 88723540cfc4bb62884a15be6d99d0e7ea01743f Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 16:15:25 +0100 Subject: [PATCH 22/50] Use pathlib for OS agnostic file globbing. --- python/BioSimSpace/FreeEnergy/_relative.py | 58 ++++++++++++++-------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index ea4651480..d6b4955e4 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -26,13 +26,12 @@ __all__ = ["Relative", "getData"] -from glob import glob as _glob - import copy as _copy import math as _math import numpy as _np import os as _os import pandas as _pd +import pathlib as _pathlib import shutil as _shutil import subprocess as _subprocess import sys as _sys @@ -426,13 +425,14 @@ def getData(self, name="data", file_link=False, work_dir=None): # Change into the working directory. with _cd(work_dir): - # Glob all of the analysis files. + # Specify the path to glob. + glob_path = _pathlib.Path(work_dir) # First try SOMD data. - files = _glob("*/*/gradients.dat") + files = glob_path.glob("**/gradients.dat") if len(files) == 0: - files = _glob("*/*/gromacs.xvg") + files = glob_path.glob("**/[!bar]*.xvg") if len(files) == 0: raise ValueError( @@ -510,13 +510,14 @@ def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): _assert_imported("alchemlyb") function_glob_dict = { - "SOMD": (Relative._analyse_somd, "/lambda_*/simfile.dat"), - "GROMACS": (Relative._analyse_gromacs, "/lambda_*/gromacs.xvg"), - "AMBER": (Relative._analyse_amber, "/lambda_*/amber.out"), + "SOMD": (Relative._analyse_somd, "**/simfile.dat"), + "GROMACS": (Relative._analyse_gromacs, "**/[!bar]*.xvg"), + "AMBER": (Relative._analyse_amber, "**/*.out"), } for engine, (func, mask) in function_glob_dict.items(): - data = _glob(work_dir + mask) + glob_path = _pathlib.Path(work_dir) + data = sorted(glob_path.glob(mask)) if data and engine == "AMBER": if method != "ALCHEMLYB": raise _AnalysisError( @@ -738,8 +739,8 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): window for TI. """ - if not isinstance(simfile, str): - raise TypeError("'simfile' must be of type 'str'.") + if not isinstance(simfile, _pathlib.Path): + raise TypeError("'simfile' must be of type 'pathlib.Path'.") if not _os.path.isfile(simfile): raise ValueError("'simfile' doesn't exist!") @@ -1030,8 +1031,8 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) if not isinstance(files, (tuple, list)): raise TypeError("'files' must be of type 'list' or 'tuple'.") - if not all(isinstance(x, str) for x in files): - raise TypeError("'files' must be a list of 'str' types.") + if not all(isinstance(x, _pathlib.Path) for x in files): + raise TypeError("'files' must be a list of 'pathlib.Path' types.") if not isinstance(temperatures, (tuple, list)): raise TypeError("'temperatures' must be of type 'list' or 'tuple'.") @@ -1194,8 +1195,13 @@ def _analyse_amber(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs ) # Find the output files and work out the lambda windows from the directory names. - files = sorted(_glob(work_dir + "/lambda_*/amber.out")) - lambdas = [float(x.split("/")[-2].split("_")[-1]) for x in files] + glob_path = _pathlib.Path(work_dir) + files = sorted(glob_path.glob("**/*.out")) + lambdas = [] + for file in files: + for part in file.parts: + if "lambda" in part: + lambdas.append(float(part.split("_")[-1])) # Find the temperature for each lambda window. temperatures = [] @@ -1270,8 +1276,13 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar if method == "ALCHEMLYB": # Find the output files and work out the lambda windows from the directory names. - files = sorted(_glob(work_dir + "/lambda_*/gromacs.xvg")) - lambdas = [float(x.split("/")[-2].split("_")[-1]) for x in files] + glob_path = _pathlib.Path(work_dir) + files = sorted(glob_path.glob("**/[!bar]*.xvg")) + lambdas = [] + for file in files: + for part in file.parts: + if "lambda" in part: + lambdas.append(float(part.split("_")[-1])) # Find the temperature at each lambda window temperatures = [] @@ -1314,7 +1325,9 @@ def _analyse_gromacs(work_dir=None, estimator="MBAR", method="alchemlyb", **kwar _warnings.warn("using 'native' for GROMACS uses BAR.") # Create the command. - xvg_files = _glob(f"{work_dir}/lambda_*/*.xvg") + glob_path = _pathlib.Path(work_dir) + xvg_files = sorted(glob_path.glob("**/[!bar]*.xvg")) + xvg_files = [str(file.absolute()) for file in xvg_files] command = "%s bar -f %s -o %s/bar.xvg" % ( _gmx_exe, " ".join(xvg_files), @@ -1432,8 +1445,13 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) if method == "ALCHEMLYB": # Glob the data files and work out the lambda values. - files = sorted(_glob(work_dir + "/lambda_*/simfile.dat")) - lambdas = [float(x.split("/")[-2].split("_")[-1]) for x in files] + glob_path = _pathlib.Path(work_dir) + files = sorted(glob_path.glob("**/simfile.dat")) + lambdas = [] + for file in files: + for part in file.parts: + if "lambda" in part: + lambdas.append(float(part.split("_")[-1])) temperatures = [] for file in files: From d6068b5dc0edf6d46cbd518e90b7a7325d7fadbd Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 16:48:01 +0100 Subject: [PATCH 23/50] No need to wrap partial in a staticmethod. --- python/BioSimSpace/FreeEnergy/_relative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index d6b4955e4..ce2f2b1b2 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1066,7 +1066,7 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) function_dict = { "SOMD": ( - staticmethod(partial(Relative._somd_extract, estimator=estimator)) + partial(Relative._somd_extract, estimator=estimator) ), "GROMACS": (_gmx_extract_u_nk if is_mbar else _gmx_extract_dHdl), "AMBER": (_amber_extract_u_nk if is_mbar else _amber_extract_dHdl), From 2da166099ea0771a49654731f0c74cc7cf1077bc Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 11 Aug 2023 17:52:59 +0100 Subject: [PATCH 24/50] Blacken. --- python/BioSimSpace/FreeEnergy/_relative.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index ce2f2b1b2..9e4402723 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1065,9 +1065,7 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) from functools import partial function_dict = { - "SOMD": ( - partial(Relative._somd_extract, estimator=estimator) - ), + "SOMD": (partial(Relative._somd_extract, estimator=estimator)), "GROMACS": (_gmx_extract_u_nk if is_mbar else _gmx_extract_dHdl), "AMBER": (_amber_extract_u_nk if is_mbar else _amber_extract_dHdl), } From c835706f4a6df02673a85dac22cb793117feee84 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 14 Aug 2023 13:24:37 +0100 Subject: [PATCH 25/50] Refactor to provide separate u_nk and dhdl extraction functions. --- python/BioSimSpace/FreeEnergy/_relative.py | 145 +++++++++++++++++++-- 1 file changed, 131 insertions(+), 14 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 9e4402723..be86101c2 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -683,6 +683,132 @@ def difference(pmf, pmf_ref): return (free_energy, error) + @staticmethod + def _get_data(files, temperatures, engine, estimator): + """ + files : list(pathlib.Path) + List of files for all lambda values to analyse. Should be sorted. + + temperatures : list(float) + List of temperatures at which the simulation was carried out at for each lambda window. + Index of the temperature value should match it's corresponding lambda window index in files. + + lambdas : list(float) + Sorted list of lambda values used for the simulation. + + engine : str + Engine with which the simulation was run. + + estimator : str + The estimator to use for the analysis. Options are "MBAR" or "TI". + + Returns + ------- + + data : list(pandas.DataFrame) + A list of dataframes containing the data for each lambda window. + """ + + if not isinstance(files, (tuple, list)): + raise TypeError("'files' must be of type 'list' or 'tuple'.") + if not all(isinstance(x, _pathlib.Path) for x in files): + raise TypeError("'files' must be a list of 'pathlib.Path' types.") + + if not isinstance(temperatures, (tuple, list)): + raise TypeError("'temperatures' must be of type 'list' or 'tuple'.") + if not all(isinstance(x, float) for x in temperatures): + raise TypeError("'temperatures' must be a list of 'float' types.") + + if not isinstance(engine, str): + raise TypeError("'engine' must be of type 'str'.") + engine = engine.replace(" ", "").upper() + if not engine in Relative._engines: + raise ValueError( + f"Unsupported engine '{engine}'. Options are: {', '.join(Relative._engines)}" + ) + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + estimator = estimator.replace(" ", "").upper() + if not estimator in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + if estimator == "MBAR": + is_mbar = True + else: + is_mbar = False + + from functools import partial + + function_dict = { + "SOMD": partial(Relative._somd_extract, estimator=estimator), + "GROMACS": _gmx_extract_u_nk if is_mbar else _gmx_extract_dHdl, + "AMBER": _amber_extract_u_nk if is_mbar else _amber_extract_dHdl, + } + + # Extract the data. + func = function_dict[engine] + try: + data = [func(file, T=temp) for file, temp in zip(files, temperatures)] + except Exception as e: + msg = "Could not extract the data from the provided files!" + if _isVerbose(): + raise _AnalysisError(msg) from e + else: + raise _AnalysisError(msg) from None + + return data + + @staticmethod + def _get_u_nk(files, temperatures, engine): + """ + Get the u_nk dataframes for MBAR analysis. + + Parameters + ---------- + + files : list(pathlib.Path) + A list of data files. + + temperatures : list(float) + A list of temperatures. + + engine : str + The simulation engine used to generate the data. + + Returns + ------- + + u_nk : list(pandas.DataFrame) + A list of dataframes containing the u_nk data for each lambda window. + """ + return _Relative._get_data(files, temperatures, engine, "MBAR") + + @staticmethod + def _get_dh_dl(files, temperatures, engine): + """ + Get the dh_dl dataframes for TI analysis. + + Parameters + ---------- + + files : list(pathlib.Path) + A list of data files. + + temperatures : list(float) + A list of temperatures. + + engine : str + The simulation engine used to generate the data. + + Returns + ------- + + dh_dl : list(pandas.DataFrame) + A list of dataframes containing the u_nk data for each lambda window. + """ + return _Relative._get_data(files, temperatures, engine, "TI") + def _analyse(self, estimator="MBAR"): """ Analyse free-energy data for this object. @@ -999,14 +1125,14 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) Parameters ---------- - files : list + files : list(pathlib.Path) List of files for all lambda values to analyse. Should be sorted. - temperatures : list + temperatures : list(float) List of temperatures at which the simulation was carried out at for each lambda window. Index of the temperature value should match it's corresponding lambda window index in files. - lambdas : list + lambdas : list(float) Sorted list of lambda values used for the simulation. engine : str @@ -1062,18 +1188,9 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) else: is_mbar = False - from functools import partial - - function_dict = { - "SOMD": (partial(Relative._somd_extract, estimator=estimator)), - "GROMACS": (_gmx_extract_u_nk if is_mbar else _gmx_extract_dHdl), - "AMBER": (_amber_extract_u_nk if is_mbar else _amber_extract_dHdl), - } - # Extract the data. - func = function_dict[engine] try: - data = [func(x, T=t) for x, t in zip(files, temperatures)] + data = Relative._get_data(files, temperatures, engine, estimator) except Exception as e: msg = "Could not extract the data from the provided files!" if _isVerbose(): @@ -1090,7 +1207,7 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) mbar_method = None if is_mbar: - # Check kwargs incase there is an mbar_method and then use this + # Check kwargs in case there is an mbar_method and then use this for key, value in kwargs.items(): key = key.replace(" ", "").replace("_", "").upper() if key == "MBARMETHOD": From 0ab909c8d65ccbf6dec842e66bd8e3ca0378d3fd Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Tue, 15 Aug 2023 16:25:36 +0100 Subject: [PATCH 26/50] Fix comments. [ci skip] --- python/BioSimSpace/Process/_openmm.py | 2 +- python/BioSimSpace/Sandpit/Exscientia/Process/_openmm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/Process/_openmm.py b/python/BioSimSpace/Process/_openmm.py index fc0871637..a5ec03c79 100644 --- a/python/BioSimSpace/Process/_openmm.py +++ b/python/BioSimSpace/Process/_openmm.py @@ -2149,7 +2149,7 @@ def _add_config_restraints(self): ) self.addToConfig( - "\n# Restrain the position of backbone atoms using zero-mass dummy atoms." + "\n# Restrain the position of atoms using zero-mass dummy atoms." ) self.addToConfig("restraint = HarmonicBondForce()") self.addToConfig("restraint.setUsesPeriodicBoundaryConditions(True)") diff --git a/python/BioSimSpace/Sandpit/Exscientia/Process/_openmm.py b/python/BioSimSpace/Sandpit/Exscientia/Process/_openmm.py index 35b830b65..ff22948c8 100644 --- a/python/BioSimSpace/Sandpit/Exscientia/Process/_openmm.py +++ b/python/BioSimSpace/Sandpit/Exscientia/Process/_openmm.py @@ -1971,7 +1971,7 @@ def _add_config_platform(self): ) def _add_config_restraints(self): - # Add backbone restraints. This uses the approach from: + # Add position restraints. This uses the approach from: # https://github.com/openmm/openmm/issues/2262#issuecomment-464157489 # Here zero-mass dummy atoms are bonded to the restrained atoms to avoid # issues with position rescaling during barostat updates. From 7cc2d8a43accab850e3297a3e0ca624925c5532c Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 21 Aug 2023 14:14:12 +0100 Subject: [PATCH 27/50] Add @annamherz's suggestions. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 1 + python/BioSimSpace/FreeEnergy/_utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index be86101c2..23a16b66a 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -32,6 +32,7 @@ import os as _os import pandas as _pd import pathlib as _pathlib +import re as _re import shutil as _shutil import subprocess as _subprocess import sys as _sys diff --git a/python/BioSimSpace/FreeEnergy/_utils.py b/python/BioSimSpace/FreeEnergy/_utils.py index 704b8afb8..ec79e3c45 100644 --- a/python/BioSimSpace/FreeEnergy/_utils.py +++ b/python/BioSimSpace/FreeEnergy/_utils.py @@ -38,4 +38,4 @@ def engines(): engines : [str] The list of supported engines. """ - return ["Somd", "Gromacs"] + return ["SOMD", "GROMACS"] From 594210f5cdacae0657106b954a1fcb8121eede96 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 21 Aug 2023 14:19:14 +0100 Subject: [PATCH 28/50] Relative class isn't private. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 23a16b66a..00bc0ac18 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -783,7 +783,7 @@ def _get_u_nk(files, temperatures, engine): u_nk : list(pandas.DataFrame) A list of dataframes containing the u_nk data for each lambda window. """ - return _Relative._get_data(files, temperatures, engine, "MBAR") + return Relative._get_data(files, temperatures, engine, "MBAR") @staticmethod def _get_dh_dl(files, temperatures, engine): @@ -808,7 +808,7 @@ def _get_dh_dl(files, temperatures, engine): dh_dl : list(pandas.DataFrame) A list of dataframes containing the u_nk data for each lambda window. """ - return _Relative._get_data(files, temperatures, engine, "TI") + return Relative._get_data(files, temperatures, engine, "TI") def _analyse(self, estimator="MBAR"): """ From e6c2ee9548d8fc8a5f290c0b37f2e1de5a84728c Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Tue, 22 Aug 2023 16:07:22 +0100 Subject: [PATCH 29/50] Allow support for different engines for analysis only. --- python/BioSimSpace/FreeEnergy/_relative.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 00bc0ac18..1111847ba 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -120,9 +120,12 @@ class Relative: """Class for configuring and running relative free-energy perturbation simulations.""" - # Create a list of supported molecular dynamics engines. + # Create a list of supported molecular dynamics engines. (For running simulations.) _engines = ["GROMACS", "SOMD"] + # Create a list of supported molecular dynamics engines. (For analysis.) + _engines_analysis = ["AMBER", "GROMACS", "SOMD"] + def __init__( self, system, @@ -723,9 +726,9 @@ def _get_data(files, temperatures, engine, estimator): if not isinstance(engine, str): raise TypeError("'engine' must be of type 'str'.") engine = engine.replace(" ", "").upper() - if not engine in Relative._engines: + if not engine in Relative._engines_analysis: raise ValueError( - f"Unsupported engine '{engine}'. Options are: {', '.join(Relative._engines)}" + f"Unsupported engine '{engine}'. Options are: {', '.join(Relative._engines_analysis)}" ) if not isinstance(estimator, str): @@ -1173,9 +1176,9 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) if not isinstance(engine, str): raise TypeError("'engine' must be of type 'str'.") - if not engine.replace(" ", "").upper() in Relative._engines: + if not engine.replace(" ", "").upper() in Relative._engines_analysis: raise ValueError( - f"Unsupported engine '{engine}'. Options are: {', '.join(Relative._engines)}" + f"Unsupported engine '{engine}'. Options are: {', '.join(Relative._engines_analysis)}" ) if not isinstance(estimator, str): From 0b97a4ac67adb62a34bb35dd053169b765899e5a Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Tue, 22 Aug 2023 16:22:26 +0100 Subject: [PATCH 30/50] Make sure np.matrix is used for all overlap objects. --- python/BioSimSpace/FreeEnergy/_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 1111847ba..3756cb90b 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1262,7 +1262,7 @@ def _analyse_internal(files, temperatures, lambdas, engine, estimator, **kwargs) ) if is_mbar: - return (data, alchem.overlap_matrix) + return (data, _np.matrix(alchem.overlap_matrix)) else: return (data, None) @@ -1694,7 +1694,7 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) # Get the next line. row = next(file) - return (data, overlap) + return (data, _np.matrix(overlap)) def _checkOverlap(self, estimator="MBAR", threshold=0.03): """ From 42ebe5a575b68ca84df72342d8c9b2f25ae52a53 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Tue, 22 Aug 2023 16:23:07 +0100 Subject: [PATCH 31/50] Use try/except on NumPy array conversion. --- python/BioSimSpace/Notebook/_plot.py | 11 +++++++++-- .../BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/python/BioSimSpace/Notebook/_plot.py b/python/BioSimSpace/Notebook/_plot.py index f2cd09522..fdcb470ef 100644 --- a/python/BioSimSpace/Notebook/_plot.py +++ b/python/BioSimSpace/Notebook/_plot.py @@ -546,10 +546,12 @@ def plotOverlapMatrix( overlap : List of List of float, or 2D numpy array of float The overlap matrix. + continuous_cbar : bool, optional, default=False If True, use a continuous colour bar. Otherwise, use a discrete set of values defined by the 'color_bar_cutoffs' argument to assign a colour to each element in the matrix. + color_bar_cutoffs : List of float, optional, default=[0.03, 0.1, 0.3] The cutoffs to use when assigning a colour to each element in the matrix. This is used for both the continuous and discrete color bars. @@ -575,8 +577,13 @@ def plotOverlapMatrix( "The 'overlap' matrix must be a list of list types, or a numpy array!" ) - # Convert to a numpy array - no issues if this is already an array. - overlap = _np.array(overlap) + # Try converting to a NumPy array. + try: + overlap = _np.array(overlap) + except: + raise TypeError( + "'overlap' must be of type 'np.matrix', 'np.ndarray', or a list of lists." + ) # Store the number of rows. num_rows = len(overlap) diff --git a/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py b/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py index f2cd09522..fdcb470ef 100644 --- a/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py +++ b/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py @@ -546,10 +546,12 @@ def plotOverlapMatrix( overlap : List of List of float, or 2D numpy array of float The overlap matrix. + continuous_cbar : bool, optional, default=False If True, use a continuous colour bar. Otherwise, use a discrete set of values defined by the 'color_bar_cutoffs' argument to assign a colour to each element in the matrix. + color_bar_cutoffs : List of float, optional, default=[0.03, 0.1, 0.3] The cutoffs to use when assigning a colour to each element in the matrix. This is used for both the continuous and discrete color bars. @@ -575,8 +577,13 @@ def plotOverlapMatrix( "The 'overlap' matrix must be a list of list types, or a numpy array!" ) - # Convert to a numpy array - no issues if this is already an array. - overlap = _np.array(overlap) + # Try converting to a NumPy array. + try: + overlap = _np.array(overlap) + except: + raise TypeError( + "'overlap' must be of type 'np.matrix', 'np.ndarray', or a list of lists." + ) # Store the number of rows. num_rows = len(overlap) From 9f522d7d57db488228e32f331099e313944ff7bd Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 1 Sep 2023 13:13:37 +0100 Subject: [PATCH 32/50] Add support for generating SOMD2 input. [ci skip] --- nodes/playground/prepareFEP.ipynb | 76 ++++++++++++++++++++----------- nodes/playground/prepareFEP.py | 74 +++++++++++++++++++----------- 2 files changed, 97 insertions(+), 53 deletions(-) diff --git a/nodes/playground/prepareFEP.ipynb b/nodes/playground/prepareFEP.ipynb index 41359c106..a9929c669 100644 --- a/nodes/playground/prepareFEP.ipynb +++ b/nodes/playground/prepareFEP.ipynb @@ -159,6 +159,13 @@ " BSS.Gateway.String(\n", " help=\"The root name for the files describing the perturbation input1->input2.\"\n", " ),\n", + ")\n", + "node.addInput(\n", + " \"somd2\",\n", + " BSS.Gateway.Boolean(\n", + " help=\"Whether to generate input for SOMD2.\",\n", + " default=False,\n", + " ),\n", ")" ] }, @@ -321,18 +328,28 @@ "source": [ "# Log the mapping used\n", "writeLog(lig1, lig2, mapping)\n", + "# Are we saving output for SOMD2?\n", + "is_somd2 = node.getInput(\"somd2\")\n", + "# File root for all output.\n", + "root = node.getInput(\"output\")\n", "BSS.IO.saveMolecules(\n", " \"merged_at_lam0.pdb\",\n", " merged,\n", " \"PDB\",\n", " {\"coordinates\": \"coordinates0\", \"bond\": \"bond0\", \"element\": \"element0\"},\n", ")\n", - "# Generate package specific input\n", - "protocol = BSS.Protocol.FreeEnergy(runtime=2 * BSS.Units.Time.femtosecond, num_lam=3)\n", - "process = BSS.Process.Somd(system1, protocol)\n", - "process.getOutput()\n", - "with zipfile.ZipFile(\"somd_output.zip\", \"r\") as zip_hnd:\n", - " zip_hnd.extractall(\".\")" + "if is_somd2:\n", + " BSS.Stream.save(system1, root)\n", + " stream_file = \"%s.bss\" % root\n", + "else:\n", + " # Generate package specific input\n", + " protocol = BSS.Protocol.FreeEnergy(\n", + " runtime=2 * BSS.Units.Time.femtosecond, num_lam=3\n", + " )\n", + " process = BSS.Process.Somd(system1, protocol)\n", + " process.getOutput()\n", + " with zipfile.ZipFile(\"somd_output.zip\", \"r\") as zip_hnd:\n", + " zip_hnd.extractall(\".\")" ] }, { @@ -341,12 +358,12 @@ "metadata": {}, "outputs": [], "source": [ - "root = node.getInput(\"output\")\n", - "mergedpdb = \"%s.mergeat0.pdb\" % root\n", - "pert = \"%s.pert\" % root\n", - "prm7 = \"%s.prm7\" % root\n", - "rst7 = \"%s.rst7\" % root\n", - "mapping_str = \"%s.mapping\" % root" + "if not is_somd2:\n", + " mergedpdb = \"%s.mergeat0.pdb\" % root\n", + " pert = \"%s.pert\" % root\n", + " prm7 = \"%s.prm7\" % root\n", + " rst7 = \"%s.rst7\" % root\n", + " mapping_str = \"%s.mapping\" % root" ] }, { @@ -355,18 +372,19 @@ "metadata": {}, "outputs": [], "source": [ - "os.replace(\"merged_at_lam0.pdb\", mergedpdb)\n", - "os.replace(\"somd.pert\", pert)\n", - "os.replace(\"somd.prm7\", prm7)\n", - "os.replace(\"somd.rst7\", rst7)\n", - "os.replace(\"somd.mapping\", mapping_str)\n", - "try:\n", - " os.remove(\"somd_output.zip\")\n", - " os.remove(\"somd.cfg\")\n", - " os.remove(\"somd.err\")\n", - " os.remove(\"somd.out\")\n", - "except Exception:\n", - " pass" + "if not is_somd2:\n", + " os.replace(\"merged_at_lam0.pdb\", mergedpdb)\n", + " os.replace(\"somd.pert\", pert)\n", + " os.replace(\"somd.prm7\", prm7)\n", + " os.replace(\"somd.rst7\", rst7)\n", + " os.replace(\"somd.mapping\", mapping_str)\n", + " try:\n", + " os.remove(\"somd_output.zip\")\n", + " os.remove(\"somd.cfg\")\n", + " os.remove(\"somd.err\")\n", + " os.remove(\"somd.out\")\n", + " except Exception:\n", + " pass" ] }, { @@ -375,7 +393,11 @@ "metadata": {}, "outputs": [], "source": [ - "node.setOutput(\"nodeoutput\", [mergedpdb, pert, prm7, rst7, mapping_str])" + "if is_somd2:\n", + " output = [stream_file]\n", + "else:\n", + " output = [mergedpdb, pert, prm7, rst7, mapping_str]\n", + "node.setOutput(\"nodeoutput\", output)" ] }, { @@ -390,7 +412,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -404,7 +426,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/nodes/playground/prepareFEP.py b/nodes/playground/prepareFEP.py index 762366daa..5561f7158 100644 --- a/nodes/playground/prepareFEP.py +++ b/nodes/playground/prepareFEP.py @@ -49,7 +49,7 @@ def writeLog(ligA, ligB, mapping): def loadMapping(mapping_file): - """Parse a text file that specifies mappings between atomic indices in input1 --> atoms in input2.""" + """Parse a text file that specifies mappings between atomic indices in input1 --> atoms in input2""" stream = open(mapping_file, "r") buffer = stream.readlines() stream.close() @@ -133,6 +133,13 @@ def loadMapping(mapping_file): help="The root name for the files describing the perturbation input1->input2." ), ) +node.addInput( + "somd2", + BSS.Gateway.Boolean( + help="Whether to generate input for SOMD2.", + default=False, + ), +) # In[ ]: @@ -258,52 +265,67 @@ def loadMapping(mapping_file): # Log the mapping used writeLog(lig1, lig2, mapping) +# Are we saving output for SOMD2? +is_somd2 = node.getInput("somd2") +# File root for all output. +root = node.getInput("output") BSS.IO.saveMolecules( "merged_at_lam0.pdb", merged, "PDB", {"coordinates": "coordinates0", "bond": "bond0", "element": "element0"}, ) -# Generate package specific input -protocol = BSS.Protocol.FreeEnergy(runtime=2 * BSS.Units.Time.femtosecond, num_lam=3) -process = BSS.Process.Somd(system1, protocol) -process.getOutput() -with zipfile.ZipFile("somd_output.zip", "r") as zip_hnd: - zip_hnd.extractall(".") +if is_somd2: + BSS.Stream.save(system1, root) + stream_file = "%s.bss" % root +else: + # Generate package specific input + protocol = BSS.Protocol.FreeEnergy( + runtime=2 * BSS.Units.Time.femtosecond, num_lam=3 + ) + process = BSS.Process.Somd(system1, protocol) + process.getOutput() + with zipfile.ZipFile("somd_output.zip", "r") as zip_hnd: + zip_hnd.extractall(".") # In[ ]: -root = node.getInput("output") -mergedpdb = "%s.mergeat0.pdb" % root -pert = "%s.pert" % root -prm7 = "%s.prm7" % root -rst7 = "%s.rst7" % root -mapping_str = "%s.mapping" % root +if not is_somd2: + mergedpdb = "%s.mergeat0.pdb" % root + pert = "%s.pert" % root + prm7 = "%s.prm7" % root + rst7 = "%s.rst7" % root + mapping_str = "%s.mapping" % root # In[ ]: -os.replace("merged_at_lam0.pdb", mergedpdb) -os.replace("somd.pert", pert) -os.replace("somd.prm7", prm7) -os.replace("somd.rst7", rst7) -os.replace("somd.mapping", mapping_str) -try: - os.remove("somd_output.zip") - os.remove("somd.cfg") - os.remove("somd.err") - os.remove("somd.out") -except Exception: - pass +if not is_somd2: + os.replace("merged_at_lam0.pdb", mergedpdb) + os.replace("somd.pert", pert) + os.replace("somd.prm7", prm7) + os.replace("somd.rst7", rst7) + os.replace("somd.mapping", mapping_str) + try: + os.remove("somd_output.zip") + os.remove("somd.cfg") + os.remove("somd.err") + os.remove("somd.out") + except Exception: + pass # In[ ]: -node.setOutput("nodeoutput", [mergedpdb, pert, prm7, rst7, mapping_str]) +if is_somd2: + output = [stream_file] +else: + output = [mergedpdb, pert, prm7, rst7, mapping_str] +node.setOutput("nodeoutput", output) # In[ ]: From 3c3129cb011ef338ec531c1c403f5ceebb01a048 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 1 Sep 2023 20:54:28 +0100 Subject: [PATCH 33/50] Simplify SOMD data to dataframe conversion. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 3756cb90b..7e712d18a 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -889,7 +889,7 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): is_mbar = False # For dhdl need to consider the temperature, as the gradient is in - # kcal/mol in the simfile.dat . + # kcal/mol in the simfile.dat. if not is_mbar: k_b = _R_kJmol * _kJ2kcal beta = 1 / (k_b * T) @@ -971,23 +971,13 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): # For MBAR, results in list of lists where each list is the 0 to 1 # window values that lambda value. For TI, it is a list of gradients # at that lambda. - results = [] if is_mbar: - # For the energies for each lambda window, append the kt to the - # data list of values for all lambda windows. - for t in time_rows: - row = file_df.loc[t][lambda_array].to_numpy() - E_ref = row[lambda_array.index(lambda_win)] - energies = [] - for lam in lambda_array: - E_ = row[lambda_array.index(lam)] - energies.append((E_ - E_ref)) - results.append(energies) + results = ( + file_df.iloc[:, 5:].subtract(file_df[lambda_win], axis=0).to_numpy() + ) else: - for t in time_rows: - gradient = file_df.loc[t]["gradient_kcal/mol"] - red_gradient = gradient * beta - results.append(red_gradient) + gradient = file_df["gradient_kcal/mol"].to_numpy() + results = gradient * beta # Turn into a dataframe that can be processed by alchemlyb. if is_mbar: From a5c5e1e384fc4ceed34bc1af496837a5a78f34f3 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 1 Sep 2023 21:50:49 +0100 Subject: [PATCH 34/50] Add initial support for SOMD2 FEP analysis. [ci skip] This seems to be working using the TI estimator, but not with MBAR. The error implies that the number of samples is inconsistent between lambda values, which doesn't appear to be the case. Hopefully a simple formatting error. --- python/BioSimSpace/FreeEnergy/_relative.py | 290 ++++++++++++++++++++- python/setup.py | 1 + requirements.txt | 1 + 3 files changed, 290 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 7e712d18a..27ff5f928 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -27,11 +27,13 @@ __all__ = ["Relative", "getData"] import copy as _copy +import json as _json import math as _math import numpy as _np import os as _os import pandas as _pd import pathlib as _pathlib +import pyarrow.parquet as _pq import re as _re import shutil as _shutil import subprocess as _subprocess @@ -124,7 +126,7 @@ class Relative: _engines = ["GROMACS", "SOMD"] # Create a list of supported molecular dynamics engines. (For analysis.) - _engines_analysis = ["AMBER", "GROMACS", "SOMD"] + _engines_analysis = ["AMBER", "GROMACS", "SOMD", "SOMD2"] def __init__( self, @@ -515,6 +517,7 @@ def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): function_glob_dict = { "SOMD": (Relative._analyse_somd, "**/simfile.dat"), + "SOMD2": (Relative._analyse_somd2, "**/*.parquet"), "GROMACS": (Relative._analyse_gromacs, "**/[!bar]*.xvg"), "AMBER": (Relative._analyse_amber, "**/*.out"), } @@ -531,6 +534,11 @@ def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): raise _AnalysisError( "SOMD cannot use 'TI' estimator with 'native' analysis method." ) + if data and engine == "SOMD2": + if method != "ALCHEMLYB": + raise _AnalysisError( + f"SOMD2 can only use the 'alchemlyb' analysis method." + ) if data and engine == "GROMACS" and method == "native": _warnings.warn( "Native GROMACS analysis cannot do MBAR/TI. BAR will be used." @@ -538,7 +546,9 @@ def analyse(work_dir, estimator="MBAR", method="alchemlyb", **kwargs): if data: return func(work_dir, estimator, method, **kwargs) - raise ValueError("Couldn't find any SOMD, GROMACS or AMBER free-energy output?") + raise ValueError( + "Couldn't find any SOMD, SOMD2, GROMACS or AMBER free-energy output?" + ) @staticmethod def checkOverlap(overlap, threshold=0.03): @@ -746,6 +756,7 @@ def _get_data(files, temperatures, engine, estimator): function_dict = { "SOMD": partial(Relative._somd_extract, estimator=estimator), + "SOMD2": partial(Relative._somd2_extract, estimator=estimator), "GROMACS": _gmx_extract_u_nk if is_mbar else _gmx_extract_dHdl, "AMBER": _amber_extract_u_nk if is_mbar else _amber_extract_dHdl, } @@ -1002,6 +1013,200 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): return df + @staticmethod + def _somd2_extract(parquet_file, T=None, estimator="MBAR"): + """ + Extract required data from a SOMD2 output file (parquet file). + + Parameters + ---------- + + parquet_file : str + Path to the parquet file. + + T : float + Temperature in Kelvin at which the simulations were performed; + needed to generated the reduced potential (in units of kT). + + estimator : str + The estimator that the returned data will be used with. This can + be either 'MBAR' or 'TI'. + + Returns + ------- + + data : pandas.DataFrame + Either: Reduced potential for each alchemical state (k) for each + frame (n) for MBAR, or dH/dl as a function of time for this lambda + window for TI. + """ + + if not isinstance(parquet_file, _pathlib.Path): + raise TypeError("'parquet_file' must be of type 'pathlib.Path'.") + if not _os.path.isfile(parquet_file): + raise ValueError("'parquet_file' doesn't exist!") + + if not isinstance(T, float): + raise TypeError("'T' must be of type 'float'.") + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + if estimator.replace(" ", "").upper() not in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + # Flag that we're using MBAR. + if estimator == "MBAR": + is_mbar = True + else: + is_mbar = False + + # Beta factor for computing reduced potentials and gradients. + k_b = _R_kJmol * _kJ2kcal + beta = 1 / (k_b * T) + + # Try to read the file. + try: + table = _pq.read_table(parquet_file) + except: + raise IOError(f"Could not read the SOMD2 parquet file: {parquet_file}") + + # Try to extract the metadata. + try: + metadata = _json.loads(table.schema.metadata["somd2".encode()]) + temperature = float(metadata["temperature"]) + except: + raise IOError( + f"Could not read the SOMD2 metadta from parquet file: {parquet_file}" + ) + + # Validate metadata required by all analysis methods. + try: + lam = float(metadata["lambda"]) + except: + raise ValueError("Parquet metadata does not contain lambda value.") + try: + lambda_array = metadata["lambda_array"] + except: + raise ValueError("Parquet metadata does not contain lambda array.") + + # Make sure that the temperature is correct. + if not T == temperature: + raise ValueError( + f"The temperature in the parquet metadata '{temperature:%.3f}' " + f"does not match the specified temperature '{T:%.3f}'." + ) + + # Convert to a pandas dataframe. + df = table.to_pandas() + + if is_mbar: + df = df[[str(i) for i in lambda_array]].copy() + lambdas = len(df.index) * [lam] + multiindex = _pd.MultiIndex.from_arrays( + [df.index, lambdas], names=["time", "lambdas"] + ) + df = _pd.concat( + [df.iloc[:, :2], beta * df.iloc[:, 2:].subtract(df[str(lam)], axis=0)], + axis=1, + ) + df.index = multiindex + + # Set the temperature and energy unit. + df.attrs["temperature"] = T + df.attrs["energy_unit"] = "kT" + + return df + + else: + columns_lambdas = df.columns[ + _pd.to_numeric(df.columns, errors="coerce").to_series().notnull() + ] + + if len(columns_lambdas) > 3 and lambda_array is None: + raise ValueError( + "More than 3 lambda values in the dataframe but no lambda array provided?" + ) + try: + lam_below = max( + [ + float(lambda_val) + for lambda_val in columns_lambdas + if float(lambda_val) < lam + ] + ) + except ValueError: + lam_below = None + try: + lam_above = min( + [ + float(lambda_val) + for lambda_val in columns_lambdas + if float(lambda_val) > lam + ] + ) + except ValueError: + lam_above = None + + # Compute gradient using finite differences. + if lam_below is None: + double_incr = (lam_above - lam) * 2 + grad = (df[str(lam_above)] - df[str(lam)]) * 2 / double_incr + back_m = _np.exp(beta * (df[str(lam_above)] - df[str(lam)])) + forward_m = _np.exp(-1 * beta * (df[str(lam_above)] - df[str(lam)])) + elif lam_above is None: + double_incr = (lam - lam_below) * 2 + grad = (df[str(lam)] - df[str(lam_below)]) * 2 / double_incr + back_m = _np.exp(-1 * beta * (df[str(lam_below)] - df[str(lam)])) + forward_m = _np.exp(beta * (df[str(lam_below)] - df[str(lam)])) + else: + double_incr = lam_above - lam_below + grad = (df[str(lam_above)] - df[str(lam_below)]) / double_incr + back_m = _np.exp(beta * (df[str(lam)] - df[str(lam_below)])) + forward_m = _np.exp(beta * (df[str(lam)] - df[str(lam_above)])) + + grad.name = "gradient" + back_m.name = "backward_mc" + forward_m.name = "forward_mc" + + if lambda_array is not None: + df[[str(i) for i in lambda_array]] = df[ + [str(i) for i in lambda_array] + ].apply(lambda x: x * -1 * beta) + + df = _pd.concat( + [ + df, + _pd.DataFrame(grad), + _pd.DataFrame(back_m), + _pd.DataFrame(forward_m), + ], + axis=1, + ) + + try: + time = list(df["time"]) + except KeyError: + time = list(df.index) + + lambdas = len(time) * [lam] + + # Create a multi-index from the two lists + multi_index = _pd.MultiIndex.from_tuples( + zip(time, lambdas), names=["time", "fep-lambdas"] + ) + + # Get the gradients. + grads = list(df["gradient"]) + + # Create a DataFrame with the multi-index + df = _pd.DataFrame({"fep": grads}, index=multi_index) + + # Set the temperature and energy unit. + df.attrs["temperature"] = T + df.attrs["energy_unit"] = "kT" + + return df + @staticmethod def _preprocess_data(data, estimator, **kwargs): """ @@ -1686,6 +1891,87 @@ def _analyse_somd(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs) return (data, _np.matrix(overlap)) + @staticmethod + def _analyse_somd2(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs): + """ + Analyse SOMD2 free energy data. + + Parameters + ---------- + + work_dir : str + The path to the working directory. + + estimator : str + The estimator ('MBAR' or 'TI') used. + + method : str + The method used to analyse the data. Options are "alchemlyb" or "native". + + Returns + ------- + + pmf : [(float, :class:`Energy `, :class:`Energy `)] + The potential of mean force (PMF). The data is a list of tuples, + where each tuple contains the lambda value, the PMF, and the + standard error. + + overlap or dHdl : numpy.matrix or alchemlyb.estimators.ti_.TI + For MBAR, this returns the overlap matrix for the overlap between + each lambda window. For TI, this returns None. + """ + + if not isinstance(work_dir, str): + raise TypeError("'work_dir' must be of type 'str'.") + if not _os.path.isdir(work_dir): + raise ValueError("'work_dir' doesn't exist!") + + if not isinstance(estimator, str): + raise TypeError("'estimator' must be of type 'str'.") + estimator = estimator.replace(" ", "").upper() + if estimator not in ["MBAR", "TI"]: + raise ValueError("'estimator' must be either 'MBAR' or 'TI'.") + + if not isinstance(method, str): + raise TypeError("'method' must be of type 'str'.") + method = method.replace(" ", "").upper() + if method != "ALCHEMLYB": + raise ValueError( + "Only 'alchemlyb' analysis 'method' is supported for SOMD2." + ) + + # Glob the data files. + glob_path = _pathlib.Path(work_dir) + files = sorted(glob_path.glob("**/*.parquet")) + + # Loop over each file and try to extract the metadata to work out + # the lambda value and temperature for each window. + + lambdas = [] + temperatures = [] + + for file in files: + try: + metadata = _json.loads( + _pq.read_metadata(file).metadata["somd2".encode()] + ) + lambdas.append(float(metadata["lambda"])) + temperatures.append(float(metadata["temperature"])) + except: + raise IOError(f"Unable to parse metadata from SOMD2 file: {file}") + + # Sort the lists based on the lambda values. + temperatures = [x for _, x in sorted(zip(lambdas, temperatures))] + lambdas = sorted(lambdas) + + # Check that the temperatures at the end states match. + if temperatures[0] != temperatures[-1]: + raise ValueError("The temperatures at the endstates don't match!") + + return Relative._analyse_internal( + files, temperatures, lambdas, "SOMD2", estimator, **kwargs + ) + def _checkOverlap(self, estimator="MBAR", threshold=0.03): """ Check the overlap of an FEP simulation leg. diff --git a/python/setup.py b/python/setup.py index ae3783e34..453781143 100644 --- a/python/setup.py +++ b/python/setup.py @@ -120,6 +120,7 @@ def clear_installed_list(): "openff-interchange-base", "openff-toolkit-base", "parmed", + "pyarrow", "py3dmol", "pydot", "pygtail", diff --git a/requirements.txt b/requirements.txt index 144346d7e..c949ad34f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ nglview openff-interchange-base openff-toolkit-base parmed +pyarrow py3dmol pydot pygtail From 6358a4b1df9bf9e019815c0609a81810f6c55f45 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Sat, 2 Sep 2023 12:14:32 +0100 Subject: [PATCH 35/50] Fix reduced potential calculation. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 27ff5f928..8bca3df14 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1105,11 +1105,8 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): multiindex = _pd.MultiIndex.from_arrays( [df.index, lambdas], names=["time", "lambdas"] ) - df = _pd.concat( - [df.iloc[:, :2], beta * df.iloc[:, 2:].subtract(df[str(lam)], axis=0)], - axis=1, - ) df.index = multiindex + df = beta * df.subtract(df[str(lam)], axis=0) # Set the temperature and energy unit. df.attrs["temperature"] = T From eba8da2f9af3eb21527700337ca18ed7b3400579 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Sat, 2 Sep 2023 13:30:00 +0100 Subject: [PATCH 36/50] Convert dataframe column index to float. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 8bca3df14..1e10cb6c9 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1100,7 +1100,7 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): df = table.to_pandas() if is_mbar: - df = df[[str(i) for i in lambda_array]].copy() + df = df[[str(x) for x in lambda_array]].copy() lambdas = len(df.index) * [lam] multiindex = _pd.MultiIndex.from_arrays( [df.index, lambdas], names=["time", "lambdas"] @@ -1112,6 +1112,9 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): df.attrs["temperature"] = T df.attrs["energy_unit"] = "kT" + # Convert column index to float. + df.columns = df.columns.astype(float) + return df else: From bd4a680163c44c6b6040c5e68d8e82aea5d6b954 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Sat, 2 Sep 2023 13:32:00 +0100 Subject: [PATCH 37/50] Drop NaN values from returned dataframes. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 1e10cb6c9..5061431c7 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1115,7 +1115,7 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): # Convert column index to float. df.columns = df.columns.astype(float) - return df + return df.dropna() else: columns_lambdas = df.columns[ @@ -1205,7 +1205,7 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): df.attrs["temperature"] = T df.attrs["energy_unit"] = "kT" - return df + return df.dropna() @staticmethod def _preprocess_data(data, estimator, **kwargs): From db24da9492afdc77ea884e3ad8fcde154796069e Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 14 Sep 2023 15:16:55 +0100 Subject: [PATCH 38/50] Allow user to compute delta G between single leg end states. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 95 +++++++++++++--------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 5061431c7..887ce7f95 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -604,10 +604,10 @@ def checkOverlap(overlap, threshold=0.03): return is_okay, num_low @staticmethod - def difference(pmf, pmf_ref): + def difference(pmf, pmf_ref=None): """ Compute the relative free-energy difference between two perturbation - legs. + legs, or between the end states of a single leg. Parameters ---------- @@ -631,7 +631,7 @@ def difference(pmf, pmf_ref): if not isinstance(pmf, list): raise TypeError("'pmf' must be of type 'list'.") - if not isinstance(pmf_ref, list): + if pmf_ref is not None and not isinstance(pmf_ref, list): raise TypeError("'pmf_ref' must be of type 'list'.") for rec in pmf: @@ -653,47 +653,62 @@ def difference(pmf, pmf_ref): "'pmf' must contain 'BioSimSpace.Types.Energy' types." ) - for rec in pmf_ref: - if not isinstance(rec, tuple): - raise TypeError( - "'pmf_ref' must contain tuples containing lambda " - "values and the associated free-energy and error." - ) - else: - if len(rec) != 3: - raise ValueError( - "Each tuple in 'pmf_ref' must contain three items: " - "a lambda value and the associated free energy " - "and error." + if pmf_ref is not None: + for rec in pmf_ref: + if not isinstance(rec, tuple): + raise TypeError( + "'pmf_ref' must contain tuples containing lambda " + "values and the associated free-energy and error." ) - for val in rec[1:]: - if not isinstance(val, _Types.Energy): - raise TypeError( - "'pmf_ref' must contain 'BioSimSpace.Types.Energy' types." + else: + if len(rec) != 3: + raise ValueError( + "Each tuple in 'pmf_ref' must contain three items: " + "a lambda value and the associated free energy " + "and error." ) + for val in rec[1:]: + if not isinstance(val, _Types.Energy): + raise TypeError( + "'pmf_ref' must contain 'BioSimSpace.Types.Energy' types." + ) - # Work out the difference in free energy. - free_energy = (pmf[-1][1] - pmf[0][1]) - (pmf_ref[-1][1] - pmf_ref[0][1]) + if pmf_ref is not None: + # Work out the difference in free energy. + free_energy = (pmf[-1][1] - pmf[0][1]) - (pmf_ref[-1][1] - pmf_ref[0][1]) - # Propagate the errors. (These add in quadrature.) + # Propagate the errors. (These add in quadrature.) - # Measure. - error0 = _math.sqrt( - (pmf[-1][2].value() * pmf[-1][2].value()) - + (pmf[0][2].value() * pmf[0][2].value()) - ) + # Measure. + error0 = _math.sqrt( + (pmf[-1][2].value() * pmf[-1][2].value()) + + (pmf[0][2].value() * pmf[0][2].value()) + ) - # Reference. - error1 = _math.sqrt( - (pmf_ref[-1][2].value() * pmf_ref[-1][2].value()) - + (pmf_ref[0][2].value() * pmf_ref[0][2].value()) - ) + # Reference. + error1 = _math.sqrt( + (pmf_ref[-1][2].value() * pmf_ref[-1][2].value()) + + (pmf_ref[0][2].value() * pmf_ref[0][2].value()) + ) - # Error for free-energy difference. - error = ( - _math.sqrt((error0 * error0) + (error1 * error1)) - * _Units.Energy.kcal_per_mol - ) + # Error for free-energy difference. + error = ( + _math.sqrt((error0 * error0) + (error1 * error1)) + * _Units.Energy.kcal_per_mol + ) + + else: + # Work out the difference in free energy. + free_energy = pmf[-1][1] - pmf[0][1] + + # Work out the error. + error = ( + _math.sqrt( + (pmf[-1][2].value() * pmf[-1][2].value()) + + (pmf[0][2].value() * pmf[0][2].value()) + ) + * _Units.Energy.kcal_per_mol + ) return (free_energy, error) @@ -2004,10 +2019,10 @@ def _checkOverlap(self, estimator="MBAR", threshold=0.03): else: raise ValueError("Overlap matrix isn't supported for this estimator.") - def _difference(self, pmf_ref): + def _difference(self, pmf_ref=None): """ Compute the relative free-energy difference between two perturbation - legs. + legs, or between the end states of a single leg. Parameters ---------- @@ -2028,7 +2043,7 @@ def _difference(self, pmf_ref): pmf, _ = self.analyse() # Now call the staticmethod passing in both PMFs. - return Relative.difference(pmf, pmf_ref) + return Relative.difference(pmf, pmf_ref=pmf_ref) def _initialise_runner(self, system): """ From 41a132a1ea3cb4d9652349e182bc395ccf25aa48 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 14 Sep 2023 15:20:26 +0100 Subject: [PATCH 39/50] Fix use of wrong error variable. [ci skip] --- python/BioSimSpace/Notebook/_plot.py | 2 +- python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/Notebook/_plot.py b/python/BioSimSpace/Notebook/_plot.py index fdcb470ef..7d6293f8e 100644 --- a/python/BioSimSpace/Notebook/_plot.py +++ b/python/BioSimSpace/Notebook/_plot.py @@ -234,7 +234,7 @@ def plot( if xerr is not None: xerr = [xerr[i] for i in idx] if yerr is not None: - yerr = [xerr[i] for i in idx] + yerr = [yerr[i] for i in idx] # y-dimension idx = [i for i, v in enumerate(y) if v is not None] diff --git a/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py b/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py index fdcb470ef..7d6293f8e 100644 --- a/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py +++ b/python/BioSimSpace/Sandpit/Exscientia/Notebook/_plot.py @@ -234,7 +234,7 @@ def plot( if xerr is not None: xerr = [xerr[i] for i in idx] if yerr is not None: - yerr = [xerr[i] for i in idx] + yerr = [yerr[i] for i in idx] # y-dimension idx = [i for i, v in enumerate(y) if v is not None] From 33573673c809d9d8fb7115c9d215b9113311ae25 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 18 Sep 2023 15:19:25 +0100 Subject: [PATCH 40/50] Simplify expresssion. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 887ce7f95..0b996abdd 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1167,11 +1167,11 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): double_incr = (lam_above - lam) * 2 grad = (df[str(lam_above)] - df[str(lam)]) * 2 / double_incr back_m = _np.exp(beta * (df[str(lam_above)] - df[str(lam)])) - forward_m = _np.exp(-1 * beta * (df[str(lam_above)] - df[str(lam)])) + forward_m = _np.exp(-beta * (df[str(lam_above)] - df[str(lam)])) elif lam_above is None: double_incr = (lam - lam_below) * 2 grad = (df[str(lam)] - df[str(lam_below)]) * 2 / double_incr - back_m = _np.exp(-1 * beta * (df[str(lam_below)] - df[str(lam)])) + back_m = _np.exp(-beta * (df[str(lam_below)] - df[str(lam)])) forward_m = _np.exp(beta * (df[str(lam_below)] - df[str(lam)])) else: double_incr = lam_above - lam_below @@ -1186,7 +1186,7 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): if lambda_array is not None: df[[str(i) for i in lambda_array]] = df[ [str(i) for i in lambda_array] - ].apply(lambda x: x * -1 * beta) + ].apply(lambda x: x * -beta) df = _pd.concat( [ From 12c28069fe096a518feef12b0df3b61d59d00dcd Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 18 Sep 2023 16:08:56 +0100 Subject: [PATCH 41/50] Fix gradient unit for SOMD TI analysis. --- python/BioSimSpace/FreeEnergy/_relative.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 0b996abdd..777c2dd62 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -979,8 +979,8 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): # TODO: get header from the file instead of like this. header = [ "step", - "potential_kcal/mol", - "gradient_kcal/mol", + "potential", + "gradient", "forward_Metropolis", "backward_Metropolis", ] @@ -1002,8 +1002,9 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): file_df.iloc[:, 5:].subtract(file_df[lambda_win], axis=0).to_numpy() ) else: - gradient = file_df["gradient_kcal/mol"].to_numpy() - results = gradient * beta + # This is actually in units of kT, but is reported incorrectly in + # the file originally written by SOMD. + results = file_df["gradient"].to_numpy() # Turn into a dataframe that can be processed by alchemlyb. if is_mbar: From 3786be32203aebc314f506d1baf6aaf1c7c5954b Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 18 Sep 2023 16:09:14 +0100 Subject: [PATCH 42/50] Remove redundant code and fix gradient units for SOMD2 TI analysis. --- python/BioSimSpace/FreeEnergy/_relative.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 777c2dd62..f938f4bb5 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1184,11 +1184,6 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): back_m.name = "backward_mc" forward_m.name = "forward_mc" - if lambda_array is not None: - df[[str(i) for i in lambda_array]] = df[ - [str(i) for i in lambda_array] - ].apply(lambda x: x * -beta) - df = _pd.concat( [ df, @@ -1211,8 +1206,8 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): zip(time, lambdas), names=["time", "fep-lambdas"] ) - # Get the gradients. - grads = list(df["gradient"]) + # Get the gradients in kT. + grads = list(beta * df["gradient"]) # Create a DataFrame with the multi-index df = _pd.DataFrame({"fep": grads}, index=multi_index) From 4a1f7eaa0c207570efe3fae1f5216132a45b1f9d Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Mon, 18 Sep 2023 16:13:25 +0100 Subject: [PATCH 43/50] Update expected TI result in test. --- tests/FreeEnergy/test_relative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/FreeEnergy/test_relative.py b/tests/FreeEnergy/test_relative.py index fc82f92d1..f93f49e61 100644 --- a/tests/FreeEnergy/test_relative.py +++ b/tests/FreeEnergy/test_relative.py @@ -54,7 +54,7 @@ def expected_results(): """A dictionary of expected FEP results.""" return { - "somd": {"mbar": -6.3519, "ti": -10.6027}, + "somd": {"mbar": -6.3519, "ti": -6.3209}, "gromacs": {"mbar": -6.0238, "ti": -8.4158}, } From 110fd0435bb37ddd00b05030696060c4596ef3c8 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Tue, 26 Sep 2023 13:17:51 +0100 Subject: [PATCH 44/50] Simplify SOMD2 analysis following to_alchemlyb Pandas conversion. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 86 +++++++--------------- 1 file changed, 27 insertions(+), 59 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index f938f4bb5..5a37312d8 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1012,7 +1012,8 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): results, columns=_np.array(lambda_array, dtype=_np.float64), index=_pd.MultiIndex.from_arrays( - [time, _np.repeat(lambda_win, len(time))], names=["time", "lambdas"] + [time, _np.repeat(lambda_win, len(time))], + names=["time", "fep-lambda"], ), ) else: @@ -1021,7 +1022,7 @@ def _somd_extract(simfile, T=None, estimator="MBAR"): columns=["fep"], index=_pd.MultiIndex.from_arrays( [time, _np.repeat(lambda_win, len(time))], - names=["time", "fep-lambdas"], + names=["time", "fep-lambda"], ), ) df.attrs["temperature"] = T @@ -1076,10 +1077,6 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): else: is_mbar = False - # Beta factor for computing reduced potentials and gradients. - k_b = _R_kJmol * _kJ2kcal - beta = 1 / (k_b * T) - # Try to read the file. try: table = _pq.read_table(parquet_file) @@ -1089,21 +1086,28 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): # Try to extract the metadata. try: metadata = _json.loads(table.schema.metadata["somd2".encode()]) - temperature = float(metadata["temperature"]) except: raise IOError( f"Could not read the SOMD2 metadta from parquet file: {parquet_file}" ) # Validate metadata required by all analysis methods. + try: + temperature = float(metadata["temperature"]) + except: + raise ValueError("Parquet metadata does not contain 'temperature'.") + try: + attrs = dict(metadata["attrs"]) + except: + raise ValueError("Parquet metadata does not contain 'attrs'.") try: lam = float(metadata["lambda"]) except: - raise ValueError("Parquet metadata does not contain lambda value.") + raise ValueError("Parquet metadata does not contain 'lambda'.") try: lambda_array = metadata["lambda_array"] except: - raise ValueError("Parquet metadata does not contain lambda array.") + raise ValueError("Parquet metadata does not contain 'lambda array'") # Make sure that the temperature is correct. if not T == temperature: @@ -1116,20 +1120,14 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): df = table.to_pandas() if is_mbar: - df = df[[str(x) for x in lambda_array]].copy() - lambdas = len(df.index) * [lam] - multiindex = _pd.MultiIndex.from_arrays( - [df.index, lambdas], names=["time", "lambdas"] - ) - df.index = multiindex - df = beta * df.subtract(df[str(lam)], axis=0) + # Extract the columns correspodning to the lambda array. + df = df[[str(x) for x in lambda_array]] - # Set the temperature and energy unit. - df.attrs["temperature"] = T - df.attrs["energy_unit"] = "kT" + # Subtract the potential at the simulated lambda. + df = df.subtract(df[str(lam)], axis=0) - # Convert column index to float. - df.columns = df.columns.astype(float) + # Apply the existing attributes. + df.attrs = attrs return df.dropna() @@ -1167,55 +1165,25 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): if lam_below is None: double_incr = (lam_above - lam) * 2 grad = (df[str(lam_above)] - df[str(lam)]) * 2 / double_incr - back_m = _np.exp(beta * (df[str(lam_above)] - df[str(lam)])) - forward_m = _np.exp(-beta * (df[str(lam_above)] - df[str(lam)])) elif lam_above is None: double_incr = (lam - lam_below) * 2 grad = (df[str(lam)] - df[str(lam_below)]) * 2 / double_incr - back_m = _np.exp(-beta * (df[str(lam_below)] - df[str(lam)])) - forward_m = _np.exp(beta * (df[str(lam_below)] - df[str(lam)])) else: double_incr = lam_above - lam_below grad = (df[str(lam_above)] - df[str(lam_below)]) / double_incr - back_m = _np.exp(beta * (df[str(lam)] - df[str(lam_below)])) - forward_m = _np.exp(beta * (df[str(lam)] - df[str(lam_above)])) - - grad.name = "gradient" - back_m.name = "backward_mc" - forward_m.name = "forward_mc" - - df = _pd.concat( - [ - df, - _pd.DataFrame(grad), - _pd.DataFrame(back_m), - _pd.DataFrame(forward_m), - ], - axis=1, - ) - try: - time = list(df["time"]) - except KeyError: - time = list(df.index) + # Create a DataFrame with the multi-index + df = _pd.DataFrame({"fep": grad}, index=df.index) - lambdas = len(time) * [lam] + # Set the original attributes. + df.attrs = attrs - # Create a multi-index from the two lists - multi_index = _pd.MultiIndex.from_tuples( - zip(time, lambdas), names=["time", "fep-lambdas"] + # Set the index, ensuring that fep-lambda is a float. + idx = df.index + df.index = _pd.MultiIndex( + [idx.levels[0], idx.levels[1].astype(float)], idx.codes, names=idx.names ) - # Get the gradients in kT. - grads = list(beta * df["gradient"]) - - # Create a DataFrame with the multi-index - df = _pd.DataFrame({"fep": grads}, index=multi_index) - - # Set the temperature and energy unit. - df.attrs["temperature"] = T - df.attrs["energy_unit"] = "kT" - return df.dropna() @staticmethod From f691d4c7df7340abe071616ed0792c39961329f2 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Tue, 26 Sep 2023 14:33:44 +0100 Subject: [PATCH 45/50] Simplify TI gradient calculation. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 55 ++++++++-------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 5a37312d8..0c16f2577 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1108,6 +1108,11 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): lambda_array = metadata["lambda_array"] except: raise ValueError("Parquet metadata does not contain 'lambda array'") + if not is_mbar: + try: + lambda_grad = metadata["lambda_grad"] + except: + raise ValueError("Parquet metadata does not contain 'lambda grad'") # Make sure that the temperature is correct. if not T == temperature: @@ -1132,43 +1137,23 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): return df.dropna() else: - columns_lambdas = df.columns[ - _pd.to_numeric(df.columns, errors="coerce").to_series().notnull() - ] + # Forward or backward difference. + if len(lambda_grad) == 1: + lam_delta = lambda_grad[0] - if len(columns_lambdas) > 3 and lambda_array is None: - raise ValueError( - "More than 3 lambda values in the dataframe but no lambda array provided?" - ) - try: - lam_below = max( - [ - float(lambda_val) - for lambda_val in columns_lambdas - if float(lambda_val) < lam - ] - ) - except ValueError: - lam_below = None - try: - lam_above = min( - [ - float(lambda_val) - for lambda_val in columns_lambdas - if float(lambda_val) > lam - ] - ) - except ValueError: - lam_above = None - - # Compute gradient using finite differences. - if lam_below is None: - double_incr = (lam_above - lam) * 2 - grad = (df[str(lam_above)] - df[str(lam)]) * 2 / double_incr - elif lam_above is None: - double_incr = (lam - lam_below) * 2 - grad = (df[str(lam)] - df[str(lam_below)]) * 2 / double_incr + # Forward difference. + if lam_delta > lam: + double_incr = (lam_delta - lam) * 2 + grad = (df[str(lam_delta)] - df[str(lam)]) * 2 / double_incr + + # Backward difference. + else: + double_incr = (lam - lam_delta) * 2 + grad = (df[str(lam)] - df[str(lam_delta)]) * 2 / double_incr + + # Central difference. else: + lam_below, lam_above = lambda_grad double_incr = lam_above - lam_below grad = (df[str(lam_above)] - df[str(lam_below)]) / double_incr From 8debf89720b326c8bf4cad6c96ac3d10c9772324 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Wed, 27 Sep 2023 09:27:24 +0100 Subject: [PATCH 46/50] Sire DataFrames now correctly use floats for fep-lamda and columns. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 0c16f2577..391fbaa2e 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1126,10 +1126,10 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): if is_mbar: # Extract the columns correspodning to the lambda array. - df = df[[str(x) for x in lambda_array]] + df = df[[x for x in lambda_array]] # Subtract the potential at the simulated lambda. - df = df.subtract(df[str(lam)], axis=0) + df = df.subtract(df[lam], axis=0) # Apply the existing attributes. df.attrs = attrs @@ -1144,18 +1144,18 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): # Forward difference. if lam_delta > lam: double_incr = (lam_delta - lam) * 2 - grad = (df[str(lam_delta)] - df[str(lam)]) * 2 / double_incr + grad = (df[lam_delta] - df[lam]) * 2 / double_incr # Backward difference. else: double_incr = (lam - lam_delta) * 2 - grad = (df[str(lam)] - df[str(lam_delta)]) * 2 / double_incr + grad = (df[lam] - df[lam_delta]) * 2 / double_incr # Central difference. else: lam_below, lam_above = lambda_grad double_incr = lam_above - lam_below - grad = (df[str(lam_above)] - df[str(lam_below)]) / double_incr + grad = (df[lam_above] - df[lam_below]) / double_incr # Create a DataFrame with the multi-index df = _pd.DataFrame({"fep": grad}, index=df.index) @@ -1163,12 +1163,6 @@ def _somd2_extract(parquet_file, T=None, estimator="MBAR"): # Set the original attributes. df.attrs = attrs - # Set the index, ensuring that fep-lambda is a float. - idx = df.index - df.index = _pd.MultiIndex( - [idx.levels[0], idx.levels[1].astype(float)], idx.codes, names=idx.names - ) - return df.dropna() @staticmethod From 015c6abb741faf4aa24bbea6b26362a4f9931626 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Fri, 29 Sep 2023 20:39:57 +0100 Subject: [PATCH 47/50] Remove redundant imports. [ci skip] --- python/BioSimSpace/Align/_align.py | 3 --- python/BioSimSpace/Sandpit/Exscientia/Align/_align.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/python/BioSimSpace/Align/_align.py b/python/BioSimSpace/Align/_align.py index aa1258bd3..443ed1d9e 100644 --- a/python/BioSimSpace/Align/_align.py +++ b/python/BioSimSpace/Align/_align.py @@ -36,9 +36,6 @@ import csv as _csv import os as _os import subprocess as _subprocess -import shutil as _shutil -import shlex as _shlex -import sys as _sys from .._Utils import _try_import, _have_imported, _assert_imported diff --git a/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py b/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py index 74c8a41e0..d3efaf0b6 100644 --- a/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py +++ b/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py @@ -36,9 +36,6 @@ import csv as _csv import os as _os import subprocess as _subprocess -import shutil as _shutil -import shlex as _shlex -import sys as _sys from .._Utils import _try_import, _have_imported, _assert_imported From 2782ab9771c2b7eb462392da437dc2fdfdb56f65 Mon Sep 17 00:00:00 2001 From: anna <83006586+annamherz@users.noreply.github.com> Date: Wed, 11 Oct 2023 13:32:48 +0000 Subject: [PATCH 48/50] review small fixes --- python/BioSimSpace/FreeEnergy/_relative.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 391fbaa2e..0638bfa6a 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1239,8 +1239,8 @@ def _preprocess_data(data, estimator, **kwargs): else: data = raw_data - # If using auto equilibration, do we want to remove burn in? - # Perform a statistical inefficiency check afterwards. + # The decorrelate function calls either autoequilibration or statistical_inefficiency in alchemlyb, which both subsample the data. + # As such, auto equilibration (remove_burnin) can only be carried out if statistical inefficency detection is also carried out. if stat_ineff: if estimator == "MBAR": decorrelated_data = [ @@ -1449,7 +1449,7 @@ def _analyse_amber(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs each lambda window. For TI, this returns None. """ - if type(work_dir) is not str: + if not isinstance(work_dir, str): raise TypeError("'work_dir' must be of type 'str'.") if not _os.path.isdir(work_dir): raise ValueError("'work_dir' doesn't exist!") @@ -1490,7 +1490,7 @@ def _analyse_amber(work_dir=None, estimator="MBAR", method="alchemlyb", **kwargs if not found_temperature: raise ValueError( - "The temperature was not detected in the AMBER output file." + f"The temperature was not detected in the AMBER output file: {file}." ) if temperatures[0] != temperatures[-1]: From 44cd871c25ba27804044330a1ae84e5da0667eab Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Wed, 11 Oct 2023 20:15:02 +0100 Subject: [PATCH 49/50] Formatting tweak. [ci skip] --- python/BioSimSpace/FreeEnergy/_relative.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/FreeEnergy/_relative.py b/python/BioSimSpace/FreeEnergy/_relative.py index 0638bfa6a..2b0f11096 100644 --- a/python/BioSimSpace/FreeEnergy/_relative.py +++ b/python/BioSimSpace/FreeEnergy/_relative.py @@ -1239,8 +1239,9 @@ def _preprocess_data(data, estimator, **kwargs): else: data = raw_data - # The decorrelate function calls either autoequilibration or statistical_inefficiency in alchemlyb, which both subsample the data. - # As such, auto equilibration (remove_burnin) can only be carried out if statistical inefficency detection is also carried out. + # The decorrelate function calls either autoequilibration or statistical_inefficiency + # in alchemlyb, which both subsample the data. As such, auto equilibration (remove_burnin) + # can only be carried out if statistical inefficency detection is also carried out. if stat_ineff: if estimator == "MBAR": decorrelated_data = [ From 7dd1dae65a41e23a1102903aff98f6230abacf48 Mon Sep 17 00:00:00 2001 From: Lester Hedges Date: Thu, 12 Oct 2023 14:41:04 +0100 Subject: [PATCH 50/50] Skip Sire MCS fallback on Windows since currently unsupported. --- python/BioSimSpace/Align/_align.py | 11 ++++++++++- python/BioSimSpace/Sandpit/Exscientia/Align/_align.py | 11 ++++++++++- tests/Align/test_align.py | 4 ++++ tests/Sandpit/Exscientia/Align/test_align.py | 4 ++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/python/BioSimSpace/Align/_align.py b/python/BioSimSpace/Align/_align.py index 443ed1d9e..cd0bb9a2c 100644 --- a/python/BioSimSpace/Align/_align.py +++ b/python/BioSimSpace/Align/_align.py @@ -36,6 +36,7 @@ import csv as _csv import os as _os import subprocess as _subprocess +import sys as _sys from .._Utils import _try_import, _have_imported, _assert_imported @@ -922,7 +923,7 @@ def matchAtoms( mcs_smarts = _Chem.MolFromSmarts(mcs.smartsString) except: - raise RuntimeError("RDKIT MCS mapping failed!") + raise RuntimeError("RDKit MCS mapping failed!") # Score the mappings and return them in sorted order (best to worst). mappings, scores = _score_rdkit_mappings( @@ -945,6 +946,14 @@ def matchAtoms( if prematch != {}: _warnings.warn("RDKit mapping didn't include prematch. Using Sire MCS.") + # Sire MCS isn't currently supported on Windows. + if _sys.platform == "win32": + _warnings.warn("Sire MCS is currently unsupported on Windows.") + if return_scores: + return {}, [] + else: + return {} + # Warn about unsupported options. if not complete_rings_only: _warnings.warn( diff --git a/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py b/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py index d3efaf0b6..1ae4ffa92 100644 --- a/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py +++ b/python/BioSimSpace/Sandpit/Exscientia/Align/_align.py @@ -36,6 +36,7 @@ import csv as _csv import os as _os import subprocess as _subprocess +import sys as _sys from .._Utils import _try_import, _have_imported, _assert_imported @@ -943,7 +944,7 @@ def matchAtoms( mcs_smarts = _Chem.MolFromSmarts(mcs.smartsString) except: - raise RuntimeError("RDKIT MCS mapping failed!") + raise RuntimeError("RDKit MCS mapping failed!") # Score the mappings and return them in sorted order (best to worst). mappings, scores = _score_rdkit_mappings( @@ -966,6 +967,14 @@ def matchAtoms( if prematch != {}: _warnings.warn("RDKit mapping didn't include prematch. Using Sire MCS.") + # Sire MCS isn't currently supported on Windows. + if _sys.platform == "win32": + _warnings.warn("Sire MCS is currently unsupported on Windows.") + if return_scores: + return {}, [] + else: + return {} + # Warn about unsupported options. if not complete_rings_only: _warnings.warn( diff --git a/tests/Align/test_align.py b/tests/Align/test_align.py index 6325bed94..ab022a8e4 100644 --- a/tests/Align/test_align.py +++ b/tests/Align/test_align.py @@ -1,4 +1,5 @@ import pytest +import sys from sire.legacy.MM import InternalFF, IntraCLJFF, IntraFF from sire.legacy.Mol import AtomIdx, Element, PartialMolecule @@ -39,6 +40,9 @@ def test_flex_align(system0, system1): # Parameterise the function with a set of valid atom pre-matches. +@pytest.mark.skipif( + sys.platform == "win32", reason="Sire MCS currently not supported on Windows" +) @pytest.mark.parametrize("prematch", [{3: 1}, {5: 9}, {4: 5}, {1: 0}]) def test_prematch(system0, system1, prematch): # Extract the molecules. diff --git a/tests/Sandpit/Exscientia/Align/test_align.py b/tests/Sandpit/Exscientia/Align/test_align.py index 5b1a3b408..d493b1624 100644 --- a/tests/Sandpit/Exscientia/Align/test_align.py +++ b/tests/Sandpit/Exscientia/Align/test_align.py @@ -1,4 +1,5 @@ import pytest +import sys from sire.legacy.MM import InternalFF, IntraCLJFF, IntraFF from sire.legacy.Mol import AtomIdx, Element, PartialMolecule @@ -39,6 +40,9 @@ def test_flex_align(system0, system1): # Parameterise the function with a set of valid atom pre-matches. +@pytest.mark.skipif( + sys.platform == "win32", reason="Sire MCS currently not supported on Windows" +) @pytest.mark.parametrize("prematch", [{3: 1}, {5: 9}, {4: 5}, {1: 0}]) def test_prematch(system0, system1, prematch): # Extract the molecules.