[performance] Introduce Fast variants for SingleRow predictors.
Although this already provides performance gains by itself for any caller, two new
functions were added to Java's SWIG interfaces to exploit both that and the
GetPrimitiveArrayCritical data fetches.
AlbertoEAF committed May 23, 2020
1 parent 8719fc6 commit d24eabe
Showing 3 changed files with 259 additions and 60 deletions.
96 changes: 87 additions & 9 deletions include/LightGBM/c_api.h
@@ -693,6 +693,14 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
int num_iteration,
int64_t* out_len);

/*!
* \brief Release FastConfig object.
*
* \param fastConfig Handle to the FastConfig object acquired with a `*FastInit()` method.
* \return 0 when it succeeds, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig);

/*!
* \brief Make prediction for a new dataset in CSR format.
* \note
@@ -778,6 +786,73 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
int64_t* out_len,
double* out_result);

/*!
* \brief Initialize and return a `FastConfigHandle` for use with `LGBM_BoosterPredictForCSRSingleRowFast`.
*
* Release the `FastConfig` by passing its handle to `LGBM_FastConfigFree` when no longer needed.
*
* \param handle Booster handle
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \param num_col Number of columns
* \param parameter Other parameters for prediction, e.g. early stopping for prediction
* \param[out] out_fastConfig FastConfig object with which you can call `LGBM_BoosterPredictForCSRSingleRowFast`
* \return 0 when it succeeds, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
const int data_type,
const int64_t num_col,
const char* parameter,
FastConfigHandle *out_fastConfig);

/*!
* \brief Faster variant of `LGBM_BoosterPredictForCSRSingleRow`.
*
* Score single rows after setup with `LGBM_BoosterPredictForCSRSingleRowFastInit`.
*
* Because the setup steps are removed from this call, extra optimizations are possible,
* such as initializing the configuration only once instead of once per call.
*
* \note
* The number of threads is configured only once, at `LGBM_BoosterPredictForCSRSingleRowFastInit`,
* instead of at each prediction.
* If another call changes the number of threads, you need to run the setup again,
* or that other number of threads will also be used for these calls.
*
* \note
* You should pre-allocate memory for ``out_result``:
* - for normal and raw score, its length is equal to ``num_class * num_data``;
* - for leaf index, its length is equal to ``num_class * num_data * num_iteration``;
* - for feature contributions, its length is equal to ``num_class * num_data * (num_feature + 1)``.
*
* \param fastConfig_handle FastConfig object handle returned by `LGBM_BoosterPredictForCSRSingleRowFastInit`
* \param indptr Pointer to row headers
* \param indptr_type Type of ``indptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``
* \param indices Pointer to column indices
* \param data Pointer to the data space
* \param nindptr Number of rows in the matrix + 1
* \param nelem Number of nonzero elements in the matrix
* \param predict_type What should be predicted
* - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed);
* - ``C_API_PREDICT_RAW_SCORE``: raw score;
* - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
* - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
* \param num_iteration Number of iterations for prediction, <= 0 means no limit
* \param[out] out_len Length of output result
* \param[out] out_result Pointer to array with predictions
* \return 0 when it succeeds, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int64_t nindptr,
int64_t nelem,
int predict_type,
int num_iteration,
int64_t* out_len,
double* out_result);
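
For illustration, a minimal caller-side sketch of this CSR fast path: it assumes a previously-created `BoosterHandle` for a single-output model (e.g. binary or regression), default prediction parameters, and no error checking; the helper `predict_csr_rows_fast` and its sample values are hypothetical.

#include <LightGBM/c_api.h>

/* Sketch only: set up once, score many single rows, then free the config. */
void predict_csr_rows_fast(BoosterHandle booster, int64_t num_features) {
  FastConfigHandle fast_config;
  LGBM_BoosterPredictForCSRSingleRowFastInit(booster, C_API_DTYPE_FLOAT64,
                                             num_features, "", &fast_config);

  /* One CSR row with two non-zero features, at columns 3 and 7. */
  int32_t indptr[2]  = {0, 2};   /* nindptr = 2, nelem = 2 */
  int32_t indices[2] = {3, 7};
  double  values[2]  = {1.5, -0.25};

  int64_t out_len = 0;
  double  out_result = 0.0;      /* one value per row for a single-output model */
  LGBM_BoosterPredictForCSRSingleRowFast(fast_config, indptr, C_API_DTYPE_INT32,
                                         indices, values, 2, 2,
                                         C_API_PREDICT_NORMAL, -1,
                                         &out_len, &out_result);

  /* ...update indices/values and call LGBM_BoosterPredictForCSRSingleRowFast again... */

  LGBM_FastConfigFree(fast_config);
}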

/*!
* \brief Make prediction for a new dataset in CSC format.
* \note
@@ -891,14 +966,6 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
int64_t* out_len,
double* out_result);

/*!
* \brief Release FastConfig object.
*
* \param fastConfig Handle to the FastConfig object acquired with a `*FastInit()` method.
* \return LIGHTGBM_C_EXPORT LGBM_FastConfigFree
*/
LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig);

/*!
* \brief Initialize and return a `FastConfigHandle` for use with `LGBM_BoosterPredictForMatSingleRowFast`.
*
@@ -918,7 +985,18 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle h
FastConfigHandle *out_fastConfig);

/*!
* \brief Score a single row after setup with `LGBM_BoosterPredictForMatSingleRowFastInit`.
* \brief Faster variant of `LGBM_BoosterPredictForMatSingleRow`.
*
* Score a single row after setup with `LGBM_BoosterPredictForMatSingleRowFastInit`.
*
* Because the setup steps are removed from this call, extra optimizations are possible,
* such as initializing the configuration only once instead of once per call.
*
* \note
* The number of threads is configured only once, at `LGBM_BoosterPredictForMatSingleRowFastInit`,
* instead of at each prediction.
* If another call changes the number of threads, you need to run the setup again,
* or that other number of threads will also be used for these calls.
*
* \param fastConfig_handle FastConfig object handle returned by `LGBM_BoosterPredictForMatSingleRowFastInit`
* \param data Single-row array data (must be in row-major form).
153 changes: 102 additions & 51 deletions src/c_api.cpp
@@ -1485,6 +1485,56 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
API_END();
}

/*!
* \brief Union to hold different int type values.
*
* Introduced with FastConfig to support multiple num_col types
* that show up in the rest of the C API prediction methods.
*/
union IntUnion {
int32_t int32;
int64_t int64;
};

/*!
* \brief Object to store resources meant for single-row Fast Predict methods.
*
* Meant to be used as a basic struct by the *Fast* predict methods only.
* It stores the configuration resources for reuse during prediction.
*
* Even the row function can be stored: the instance is always scored at the same
* memory address, so one just replaces the feature values at that address and
* scores again with the *Fast* methods.
*/
struct FastConfig {
FastConfig(Booster *const booster_ptr,
const char *parameter,
const int data_type_,
const int32_t num_cols) : booster(booster_ptr), data_type(data_type_) {
ncol.int32 = num_cols;
config.Set(Config::Str2Map(parameter));
}

FastConfig(Booster *const booster_ptr,
const char *parameter,
const int data_type_,
const int64_t num_cols) : booster(booster_ptr), data_type(data_type_) {
ncol.int64 = num_cols;
config.Set(Config::Str2Map(parameter));
}

Booster* const booster;
Config config;
const int data_type;
IntUnion ncol;
};

int LGBM_FastConfigFree(FastConfigHandle fastConfig) {
API_BEGIN();
delete reinterpret_cast<FastConfig*>(fastConfig);
API_END();
}

int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const void* indptr,
int indptr_type,
@@ -1551,6 +1601,51 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
API_END();
}

int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
const int data_type,
const int64_t num_col,
const char* parameter,
FastConfigHandle *out_fastConfig) {
API_BEGIN();
if (num_col <= 0) {
Log::Fatal("The number of columns should be greater than zero.");
} else if (num_col >= INT32_MAX) {
Log::Fatal("The number of columns should be smaller than INT32_MAX.");
}

auto fastConfig_ptr = std::unique_ptr<FastConfig>(new FastConfig(
reinterpret_cast<Booster*>(handle),
parameter,
data_type,
num_col));

if (fastConfig_ptr->config.num_threads > 0) {
omp_set_num_threads(fastConfig_ptr->config.num_threads);
}

*out_fastConfig = fastConfig_ptr.release();
API_END();
}

int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int64_t nindptr,
int64_t nelem,
int predict_type,
int num_iteration,
int64_t* out_len,
double* out_result) {
API_BEGIN();
FastConfig *fastConfig = reinterpret_cast<FastConfig*>(fastConfig_handle);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, fastConfig->data_type, nindptr, nelem);
fastConfig->booster->PredictSingleRow(num_iteration, predict_type, static_cast<int32_t>(fastConfig->ncol.int64),
get_row_fun, fastConfig->config, out_result, out_len);
API_END();
}


int LGBM_BoosterPredictForCSC(BoosterHandle handle,
const void* col_ptr,
@@ -1648,51 +1743,6 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
API_END();
}

/*!
* \brief Object to store resources meant for single-row Fast Predict methods.
*
* Meant to be used as a basic struct by the *Fast* predict methods only.
* It stores the configuration resources for reuse during prediction.
*
* Even the row function is stored. We score the instance at the same memory
* address all the time. One just replaces the feature values at that address
* and scores again with the *Fast* methods.
*/
struct FastConfig {
public:
FastConfig(Booster *const booster_ptr,
const char *parameter,
const int data_type,
const int32_t num_cols) : _booster(booster_ptr), _data_type(data_type), _ncol(num_cols) {
_config.Set(Config::Str2Map(parameter));
}

friend int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
const int data_type,
const int32_t ncol,
const char* parameter,
FastConfigHandle *out_fastConfig);

friend int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fast_config_handle,
const void* data,
const int predict_type,
const int num_iteration,
int64_t* out_len,
double* out_result);

private:
Booster* const _booster;
Config _config;
const int _data_type;
const int32_t _ncol;
};

int LGBM_FastConfigFree(FastConfigHandle fastConfig) {
API_BEGIN();
delete reinterpret_cast<FastConfig*>(fastConfig);
API_END();
}

int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
const int data_type,
const int32_t ncol,
@@ -1705,8 +1755,8 @@ int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
data_type,
ncol));

if (fastConfig_ptr->_config.num_threads > 0) {
omp_set_num_threads(fastConfig_ptr->_config.num_threads);
if (fastConfig_ptr->config.num_threads > 0) {
omp_set_num_threads(fastConfig_ptr->config.num_threads);
}

*out_fastConfig = fastConfig_ptr.release();
@@ -1721,10 +1771,11 @@ int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fastConfig_handle,
double* out_result) {
API_BEGIN();
FastConfig *fastConfig = reinterpret_cast<FastConfig*>(fastConfig_handle);
auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, fastConfig->_ncol, fastConfig->_data_type, 1); // Single row in row-major format.
fastConfig->_booster->PredictSingleRow(num_iteration, predict_type,
fastConfig->_ncol, get_row_fun, fastConfig->_config,
out_result, out_len);
// Single row in row-major format:
auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, fastConfig->ncol.int32, fastConfig->data_type, 1);
fastConfig->booster->PredictSingleRow(num_iteration, predict_type, fastConfig->ncol.int32,
get_row_fun, fastConfig->config,
out_result, out_len);
API_END();
}
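
Similarly, a minimal caller-side sketch of the dense single-row fast path implemented above, under the same assumptions (previously-created `BoosterHandle`, single-output model, no error checking); the helper `predict_dense_rows_fast` and its sample values are hypothetical.

#include <LightGBM/c_api.h>

/* Sketch only: the row buffer stays at one address; overwrite it and re-score. */
void predict_dense_rows_fast(BoosterHandle booster) {
  enum { kNumFeatures = 4 };
  FastConfigHandle fast_config;
  LGBM_BoosterPredictForMatSingleRowFastInit(booster, C_API_DTYPE_FLOAT64,
                                             kNumFeatures, "", &fast_config);

  double row[kNumFeatures] = {0.0, 1.5, -0.25, 3.0};
  int64_t out_len = 0;
  double out_result = 0.0;

  LGBM_BoosterPredictForMatSingleRowFast(fast_config, row,
                                         C_API_PREDICT_NORMAL, -1,
                                         &out_len, &out_result);

  /* ...overwrite entries of `row` and call LGBM_BoosterPredictForMatSingleRowFast again... */

  LGBM_FastConfigFree(fast_config);
}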

70 changes: 70 additions & 0 deletions swig/lightgbmlib.i
@@ -95,6 +95,32 @@
return ret;
}

/*! \brief Even faster variant of `LGBM_BoosterPredictForMatSingle`.
*
* Combines `LGBM_BoosterPredictForMatSingleRowFast`, which is faster
* than `LGBM_BoosterPredictForMatSingleRow`, with the trick used by
* `LGBM_BoosterPredictForMatSingle` of capturing the Java data array
* through `GetPrimitiveArrayCritical`, which can yield faster access
* to the array if the JVM passes the actual address to the C++ side
* instead of performing a copy.
*/
int LGBM_BoosterPredictForMatSingleRowFastCriticalSWIG(JNIEnv *jenv,
jdoubleArray data,
FastConfigHandle handle,
int predict_type,
int num_iteration,
int64_t* out_len,
double* out_result) {
double* data0 = (double*)jenv->GetPrimitiveArrayCritical(data, 0);

int ret = LGBM_BoosterPredictForMatSingleRowFast(handle, data0, predict_type,
num_iteration, out_len, out_result);

// JNI_ABORT: the array was only read, so release it without copying data back.
jenv->ReleasePrimitiveArrayCritical(data, data0, JNI_ABORT);

return ret;
}

int LGBM_BoosterPredictForCSRSingle(JNIEnv *jenv,
jintArray indices,
jdoubleArray values,
@@ -130,6 +156,50 @@
return ret;
}

/*! \brief Even faster variant of `LGBM_BoosterPredictForCSRSingle`.
*
* Combines `LGBM_BoosterPredictForCSRSingleRowFast`, which is faster
* than `LGBM_BoosterPredictForCSRSingleRow`, with the trick used by
* `LGBM_BoosterPredictForCSRSingle` of capturing the Java data array
* through `GetPrimitiveArrayCritical`, which can yield faster access
* to the array if the JVM passes the actual address to the C++ side
* instead of performing a copy.
*/
int LGBM_BoosterPredictForCSRSingleRowFastCriticalSWIG(JNIEnv *jenv,
jintArray indices,
jdoubleArray values,
int numNonZeros,
FastConfigHandle handle,
int indptr_type,
//int data_type,
int64_t nelem,
//int64_t num_col,
int predict_type,
int num_iteration,
//const char* parameter,
int64_t* out_len,
double* out_result) {
// Alternatives
// - GetIntArrayElements: performs copy
// - GetDirectBufferAddress: fails on wrapped array
// Some words of warning for GetPrimitiveArrayCritical
// https://stackoverflow.com/questions/23258357/whats-the-trade-off-between-using-getprimitivearraycritical-and-getprimitivety

jboolean isCopy;
int* indices0 = (int*)jenv->GetPrimitiveArrayCritical(indices, &isCopy);
double* values0 = (double*)jenv->GetPrimitiveArrayCritical(values, &isCopy);

// CSR row pointer for this single row: its non-zero entries span [0, numNonZeros).
int32_t ind[2] = { 0, numNonZeros };

int ret = LGBM_BoosterPredictForCSRSingleRowFast(handle, ind, indptr_type, indices0, values0, 2,
nelem, predict_type, num_iteration, out_len, out_result);

jenv->ReleasePrimitiveArrayCritical(values, values0, JNI_ABORT);
jenv->ReleasePrimitiveArrayCritical(indices, indices0, JNI_ABORT);

return ret;
}

#include <functional>
#include <vector>
