From ec7437762580104507ae9e5e2b7e1966dee9a1fe Mon Sep 17 00:00:00 2001
From: Ian Watson <ianiwatson@gmail.com>
Date: Wed, 6 Nov 2024 16:14:52 -0500
Subject: [PATCH] rf models

---
 contrib/bin/.rf_evaluate.sh     |  7 +++++++
 contrib/bin/xgbd/rf_evaluate.rb | 19 ++++++++++---------
 docs/QSAR_Models/README.md      | 27 +++++++++++++++++++++------
 3 files changed, 38 insertions(+), 15 deletions(-)
 create mode 100755 contrib/bin/.rf_evaluate.sh

diff --git a/contrib/bin/.rf_evaluate.sh b/contrib/bin/.rf_evaluate.sh
new file mode 100755
index 00000000..c42b7704
--- /dev/null
+++ b/contrib/bin/.rf_evaluate.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Wrapper: run the Python random forest scorer, defaulting LILLYMOL_HOME if unset.
+if [[ ! -v LILLYMOL_HOME ]] ; then
+  export LILLYMOL_HOME="$(dirname "$0")/../.."
+fi
+# Paths are quoted so an install directory containing spaces is not word-split.
+exec python "${LILLYMOL_HOME}/contrib/bin/xgbd/rf_evaluate.py" "$@"
diff --git a/contrib/bin/xgbd/rf_evaluate.rb b/contrib/bin/xgbd/rf_evaluate.rb
index 0ca68b43..00427099 100644
--- a/contrib/bin/xgbd/rf_evaluate.rb
+++ b/contrib/bin/xgbd/rf_evaluate.rb
@@ -3,13 +3,14 @@
 # Evaluate an RF model from either smiles or a descriptor file.
 
 require 'set'
+require 'tempfile'
 require 'google/protobuf'
 
-c3tk_home = ENV['C3TK_HOME']
+c3tk_home = ENV['LILLYMOL_HOME']
 raise 'C3TK_HOME not defined' unless c3tk_home
 
-require "#{c3tk_home}/bin/ruby/lib/iwcmdline"
-require "#{c3tk_home}/bin/py/pytk/xgbd/random_forest_model_pb"
+require "#{c3tk_home}/contrib/bin/lib/iwcmdline"
+require "#{c3tk_home}/contrib/bin/xgbd/random_forest_model_pb"
 
 def usage
 msg = <<-END
@@ -57,27 +58,27 @@ def rf_evaluate_smiles(fname, mdir, proto, cl)
     cmd << " -j #{j}"
   end
 
-  tmpfile = File.join(ENV['TMPDIR'], "rf_evaluate_smiles_#{Process.uid}.#{Process.pid}.dat")
+  tmpfile = Tempfile.new("rf_evaluate_smiles_#{Process.uid}.#{Process.pid}.dat")
 
   descriptors.each do |d|
     cmd << " -#{d}"
   end
-  cmd << " #{fname} > #{tmpfile}"
+  cmd << " #{fname} > #{tmpfile.path}"
   $stderr << "Executing #{cmd}\n" if cl.option_present('v')
 
   system(cmd)
-  unless File.size?(tmpfile)
+  unless File.size?(tmpfile.path)
     $stderr << "#{cmd} failed\n"
     return
   end
 
-  rf_evaluate_descriptors(tmpfile, mdir, proto, cl)
+  rf_evaluate_descriptors(tmpfile.path, mdir, proto, cl)
 
-  File.unlink(tmpfile)
+  tmpfile.unlink
 end
 
 def rf_evaluate_descriptors(fname, mdir, proto, cl)
-  cmd = "rf_evaluate.sh -mdir #{mdir} #{fname}"
+  cmd = ".rf_evaluate.sh -mdir #{mdir} #{fname}"
 
   $stderr << "Executing #{cmd}\n" if cl.option_present('v')
   system(cmd)
diff --git a/docs/QSAR_Models/README.md b/docs/QSAR_Models/README.md
index a88e084d..a283d184 100644
--- a/docs/QSAR_Models/README.md
+++ b/docs/QSAR_Models/README.md
@@ -7,7 +7,7 @@ or parallel computing environments is straightforward. Model evaluation
 speed can be very fast.
 
 The model types included here are all molecular descriptor models
-that make use of the (make_descriptors)[../../Molecule_Tools/make_descriptors.md]
+that make use of the [make_descriptors](../../Molecule_Tools/make_descriptors.md)
 script to convert molecules to descriptor forms. This script supports a variety
 of 2D and 3D descriptor sets, all computed using LillyMol executables.
 
@@ -47,12 +47,12 @@ where id '1' has activity 2.6, etc...
 A conformant molecular descriptor file can be generated by make_descriptors.
 
 ## XGBOOST
-In the (contrib/bin/xgbd)[../../contrib/bin/xgbd] directory there
+In the [contrib/bin/xgbd](../../contrib/bin/xgbd) directory there
 are tools that enable building and scoring an XGBoost model built on molecular
 descriptors.
 
 ### Model Building
-The first step is to build a model. A minimal model, using a quick to compute
+The first step is to build a model. A minimal model, using a quick-to-compute
 set of descriptors and taking all default XGBoost parameters, can be done via
 ```
 ${LILLYMOL_HOME}/contrib/bin/make_descriptors.rb -w train.smi > train.w
@@ -75,7 +75,7 @@ learning process [xgboost](https://xgboost.readthedocs.io/en/stable/parameter.ht
 The defaults in the script have been found to be generally
 good when dealing with a variety of SAQR datasets.
 Many of these hyperparameters can be set either by command line options, or as
-a textproto specifying values for the [proto](src/./xgboost/xgboost_model.proto)
+a textproto file specifying values for the [proto](/src/xgboost/xgboost_model.proto)
 and providing that file to the script via the --proto option.
 
 ## Scoring
@@ -92,7 +92,7 @@ used during training and will call make_descriptors in order to score the
 test set.
 
 Note too that here the underlying script is ruby, which calls a LillyMol
-c++ executable for scoring.
+C++ executable for scoring.
 
 One slightly unfortunate usability feature is that xgbd_make is a python script
 that uses absl for argument parsing, so -- type options are recognised. The
@@ -108,7 +108,7 @@ generate_smiles ... | xgbd_evaluate.sh -mdir MODEL -smi -
 
 although this only works well for a single descriptor computation. There are
 options in some tools to allow pipelined evaluation which does enable
-fully pipelined scoring.
+fully pipelined scoring of arbitrarily large sets of molecules.
 
 Model performance can be evaluated via
 ```
@@ -117,3 +117,18 @@ iwstats -E test.activity -p 2 test.pred
 
 or using any tool that can assess model performance.
 
+## Random Forest
+Very similar to xgbd_make.sh and xgbd_evaluate.sh, the scripts rf_make.sh
+and rf_evaluate.sh build and evaluate random forest models. The options
+and functionality are very similar to the xgboost equivalents. Hyperparameters
+can be specified via a [random_forest_model](/src/xgboost/random_forest_model.proto)
+textproto.
+
+Generally we find that Random Forest models are slower to build, slower to score
+and usually less accurate than xgboost models.
+
+## Other Model Types
+Given the framework established with XGBoost and Random Forest models, it
+is relatively straightforward to enable other model building algorithms.
+Having a consistent interface makes integration with distributed evaluation
+and processing straightforward.