From ec7437762580104507ae9e5e2b7e1966dee9a1fe Mon Sep 17 00:00:00 2001 From: Ian Watson Date: Wed, 6 Nov 2024 16:14:52 -0500 Subject: [PATCH] rf models --- contrib/bin/.rf_evaluate.sh | 7 +++++++ contrib/bin/xgbd/rf_evaluate.rb | 19 ++++++++++--------- docs/QSAR_Models/README.md | 27 +++++++++++++++++++++------ 3 files changed, 38 insertions(+), 15 deletions(-) create mode 100755 contrib/bin/.rf_evaluate.sh diff --git a/contrib/bin/.rf_evaluate.sh b/contrib/bin/.rf_evaluate.sh new file mode 100755 index 00000000..c42b7704 --- /dev/null +++ b/contrib/bin/.rf_evaluate.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [[ ! -v LILLYMOL_HOME ]] ; then + export LILLYMOL_HOME=$(dirname $0)/../.. +fi + +exec python ${LILLYMOL_HOME}/contrib/bin/xgbd/rf_evaluate.py "$@" diff --git a/contrib/bin/xgbd/rf_evaluate.rb b/contrib/bin/xgbd/rf_evaluate.rb index 0ca68b43..00427099 100644 --- a/contrib/bin/xgbd/rf_evaluate.rb +++ b/contrib/bin/xgbd/rf_evaluate.rb @@ -3,13 +3,14 @@ # Evaluate an RF model from either smiles or a descriptor file. 
require 'set' +require 'tempfile' require 'google/protobuf' -c3tk_home = ENV['C3TK_HOME'] +c3tk_home = ENV['LILLYMOL_HOME'] raise 'C3TK_HOME not defined' unless c3tk_home -require "#{c3tk_home}/bin/ruby/lib/iwcmdline" -require "#{c3tk_home}/bin/py/pytk/xgbd/random_forest_model_pb" +require "#{c3tk_home}contrib/bin/lib/iwcmdline" +require "#{c3tk_home}/contrib/bin/xgbd/random_forest_model_pb" def usage msg = <<-END @@ -57,27 +58,27 @@ def rf_evaluate_smiles(fname, mdir, proto, cl) cmd << " -j #{j}" end - tmpfile = File.join(ENV['TMPDIR'], "rf_evaluate_smiles_#{Process.uid}.#{Process.pid}.dat") + tmpfile = Tempfile.new("rf_evaluate_smiles_#{Process.uid}.#{Process.pid}.dat") descriptors.each do |d| cmd << " -#{d}" end - cmd << " #{fname} > #{tmpfile}" + cmd << " #{fname} > #{tmpfile.path}" $stderr << "Executing #{cmd}\n" if cl.option_present('v') system(cmd) - unless File.size?(tmpfile) + unless File.size?(tmpfile.path) $stderr << "#{cmd} failed\n" return end - rf_evaluate_descriptors(tmpfile, mdir, proto, cl) + rf_evaluate_descriptors(tmpfile.path, mdir, proto, cl) - File.unlink(tmpfile) + tmpfile.unlink end def rf_evaluate_descriptors(fname, mdir, proto, cl) - cmd = "rf_evaluate.sh -mdir #{mdir} #{fname}" + cmd = ".rf_evaluate.sh -mdir #{mdir} #{fname}" $stderr << "Executing #{cmd}\n" if cl.option_present('v') system(cmd) diff --git a/docs/QSAR_Models/README.md b/docs/QSAR_Models/README.md index a88e084d..a283d184 100644 --- a/docs/QSAR_Models/README.md +++ b/docs/QSAR_Models/README.md @@ -7,7 +7,7 @@ or parallel computing environments is straightforward. Model evaluation speed can be very fast. The model types included here are all molecular descriptor models -that make use of the (make_descriptors)[../../Molecule_Tools/make_descriptors.md] +that make use of the [make_descriptors](../../Molecule_Tools/make_descriptors.md) script to convert molecules to descriptor forms. 
This script supports a variety of 2D and 3D descriptor sets, all computed using LillyMol executables. @@ -47,12 +47,12 @@ where id '1' has activity 2.6, etc... A conformant molecular descriptor file can be generated by make_descriptors. ## XGBOOST -In the (contrib/bin/xgbd)[../../contrib/bin/xgbd] directory there +In the [contrib/bin/xgbd](../../contrib/bin/xgbd) directory there are tools that enable building and scoring an XGBoost model built on molecular descriptors. ### Model Building -The first step is to build a model. A minimal model, using a quick to compute +The first step is to build a model. A minimal model, using a quick-to-compute set of descriptors and taking all default XGBoost parameters, can be done via ``` ${LILLYMOL_HOME}/contrib/bin/make_descriptors.rb -w train.smi > train.w @@ -75,7 +75,7 @@ learning process [xgboost](https://xgboost.readthedocs.io/en/stable/parameter.ht The defaults in the script have been found to be generally good when dealing with a variety of SAQR datasets. Many of these hyperparameters can be set either by command line options, or as -a textproto specifying values for the [proto](src/./xgboost/xgboost_model.proto) +a textproto file specifying values for the [proto](/src/xgboost/xgboost_model.proto) and providing that file to the script via the --proto option. ## Scoring @@ -92,7 +92,7 @@ used during training and will call make_descriptors in order to score the test set. Note too that here the underlying script is ruby, which calls a LillyMol -c++ executable for scoring. +C++ executable for scoring. One slightly unfortunate usability feature is that xgbd_make is a python script that uses absl for argument parsing, so -- type options are recognised. The @@ -108,7 +108,7 @@ generate_smiles ... | xgbd_evaluate.sh -mdir MODEL -smi - although this only works well for a single descriptor computation. There are options in some tools to allow pipelined evaluation which does enable -fully pipelined scoring. 
+fully pipelined scoring of arbitrarily large sets of molecules. Model performance can be evaluated via ``` @@ -117,3 +117,18 @@ iwstats -E test.activity -p 2 test.pred or using any tool that can assess model performance. +## Random Forest +Very similar to xgbd_make.sh and xgbd_evaluate.sh, the scripts rf_make.sh +and rf_evaluate.sh build and evaluate random forest models. The options +and functionality are very similar to the xgboost equivalents. Hyperparameters +can be specified via a [random_forest_model](/src/xgboost/random_forest_model.proto) +textproto. + +Generally we find that Random Forest models are slower to build, slower to score +and usually less accurate than xgboost models. + +## Other Model Types +Given the framework established with XGBoost and Random Forest models, it +is relatively straightforward to enable other model building algorithms. +Having a consistent interface makes integration with distributed evaluation +and processing straightforward.