Skip to content

Commit

Permalink
[R-package] enable use of trees with linear models at leaves (fixes m…
Browse files Browse the repository at this point in the history
  • Loading branch information
jameslamb committed Dec 31, 2020
1 parent 967b45c commit 89474ed
Show file tree
Hide file tree
Showing 12 changed files with 314 additions and 13 deletions.
6 changes: 6 additions & 0 deletions R-package/configure
Original file line number Diff line number Diff line change
Expand Up @@ -1699,6 +1699,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
# LightGBM-specific flags
LGB_CPPFLAGS=""

#########
# Eigen #
#########

LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"

###############
# MM_PREFETCH #
###############
Expand Down
6 changes: 6 additions & 0 deletions R-package/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
# LightGBM-specific flags
LGB_CPPFLAGS=""

#########
# Eigen #
#########

LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"

###############
# MM_PREFETCH #
###############
Expand Down
6 changes: 6 additions & 0 deletions R-package/configure.win
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ CC=`"${R_EXE}" CMD config CC`
# LightGBM-specific flags
LGB_CPPFLAGS=""

#########
# Eigen #
#########

LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"

###############
# MM_PREFETCH #
###############
Expand Down
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ PKGROOT=.

LGB_CPPFLAGS = \
@LGB_CPPFLAGS@ \
-Wno-unknown-pragmas \
-DUSE_SOCKET \
-DLGB_R_BUILD

Expand Down
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ LGB_CPPFLAGS = \
-DLGB_R_BUILD

PKG_CPPFLAGS = \
-I$(PKGROOT) \
-I$(PKGROOT)/include \
$(LGB_CPPFLAGS)

Expand Down
240 changes: 240 additions & 0 deletions R-package/tests/testthat/test_basic.R
Original file line number Diff line number Diff line change
Expand Up @@ -1619,6 +1619,246 @@ test_that("early stopping works with lgb.cv()", {
)
})

context("linear learner")

# Verifies that enabling linear_tree improves fit on data whose label is an
# (almost) exactly linear function of the single feature: a tree with linear
# models at the leaves should reach a lower training MSE than constant leaves.
test_that("lgb.train() fit on linearly-related data improves when using linear learners", {
  set.seed(708L)
  # fresh Dataset each time: a constructed Dataset cannot change linear_tree,
  # so each booster gets its own un-constructed copy
  .new_dataset <- function() {
    X <- matrix(rnorm(100L), ncol = 1L)
    return(lgb.Dataset(
      data = X
      , label = 2 * X + runif(nrow(X), 0, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )

  # baseline: plain (piecewise-constant) trees
  dtrain <- .new_dataset()
  bst <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst))

  # same setup with linear models at the leaves
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))

  # linear leaves should strictly improve training MSE on linear data
  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  expect_true(bst_lin_last_mse < bst_last_mse)
})

# Once a Dataset handle has been constructed, linear_tree can no longer be
# toggled; training with linear_tree = TRUE on such a Dataset must error.
test_that("lgb.train() w/ linear learner fails already-constructed dataset with linear=false", {
  set.seed(708L)
  linear_params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
    , linear_tree = TRUE
  )

  feature_matrix <- matrix(rnorm(100L), ncol = 1L)
  dtrain <- lgb.Dataset(
    data = feature_matrix
    , label = rnorm(100L)
  )
  # construct the handle up-front so linear_tree can no longer be changed
  dtrain$construct()

  expect_error({
    bst_linear <- lgb.train(
      data = dtrain
      , nrounds = 10L
      , params = linear_params
      , valids = list("train" = dtrain)
    )
  }, regexp = "Cannot change linear_tree after constructed Dataset handle")
})

# Verifies that linear leaves still train successfully (and still beat
# constant leaves) when the feature column contains NA values.
test_that("lgb.train() works with linear learners even if Dataset has missing values", {
  set.seed(708L)
  # fresh Dataset each time: 100 draws with 10 of them replaced by NA
  .new_dataset <- function() {
    values <- rnorm(100L)
    values[sample(seq_along(values), size = 10L)] <- NA_real_
    X <- matrix(
      data = sample(values, size = 100L)
      , ncol = 1L
    )
    return(lgb.Dataset(
      data = X
      , label = 2 * X + runif(nrow(X), 0, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )

  # baseline: plain (piecewise-constant) trees
  dtrain <- .new_dataset()
  bst <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst))

  # same setup with linear models at the leaves
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))

  # linear leaves should still improve training MSE despite the NAs
  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  expect_true(bst_lin_last_mse < bst_last_mse)
})

# Verifies that linear leaves work together with bagging (row subsampling)
# on a Dataset that contains NA values.
test_that("lgb.train() works with linear learners, bagging, and a Dataset that has missing values", {
  set.seed(708L)
  # fresh Dataset each time: 100 draws with 10 of them replaced by NA
  .new_dataset <- function() {
    values <- rnorm(100L)
    values[sample(seq_along(values), size = 10L)] <- NA_real_
    X <- matrix(
      data = sample(values, size = 100L)
      , ncol = 1L
    )
    return(lgb.Dataset(
      data = X
      , label = 2 * X + runif(nrow(X), 0, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
    , bagging_freq = 1L
    , subsample = 0.8
  )

  # baseline: plain (piecewise-constant) trees with bagging
  dtrain <- .new_dataset()
  bst <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst))

  # same setup with linear models at the leaves
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))

  # linear leaves should still improve training MSE under bagging
  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  expect_true(bst_lin_last_mse < bst_last_mse)
})

# Degenerate-data regression test: a feature with exactly one non-NA value
# must not crash linear-leaf training (the leaf's linear fit is underdetermined).
test_that("lgb.train() works with linear learners and data where a feature has only 1 non-NA value", {
  set.seed(708L)
  .new_dataset <- function() {
    # all-NA column except a single observed value
    values <- rep(NA_real_, 100L)
    values[18L] <- rnorm(1L)
    X <- matrix(
      data = values
      , ncol = 1L
    )
    return(lgb.Dataset(
      data = X
      , label = 2 * X + runif(nrow(X), 0, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )

  # only checks that training completes without error; no MSE comparison
  # is meaningful on a feature with a single observed value
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))
})

# Verifies that linear leaves work on a Dataset with a categorical feature.
# NOTE: the parameter name was previously misspelled as "categorical_featurs",
# which LightGBM silently ignored, so the categorical path was never exercised.
test_that("lgb.train() works with linear learners when Dataset has categorical features", {
  set.seed(708L)
  .new_dataset <- function() {
    # column 1: continuous feature the label depends on
    # column 2: categorical feature with 4 levels, unrelated to the label
    X <- cbind(
      matrix(rnorm(100L), ncol = 1L)
      , matrix(sample(seq_len(4L), size = 100L, replace = TRUE), ncol = 1L)
    )
    return(lgb.Dataset(
      data = X
      , label = 2 * X[, 1L] + runif(nrow(X), 0, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
    , categorical_feature = 1L
  )

  # baseline: plain (piecewise-constant) trees
  dtrain <- .new_dataset()
  bst <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst))

  # same setup with linear models at the leaves
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))

  # linear leaves should improve training MSE on the linear component
  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  expect_true(bst_lin_last_mse < bst_last_mse)
})

context("interaction constraints")

test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {
Expand Down
Binary file added R-package/tests/testthat/testthat-problems.rds
Binary file not shown.
45 changes: 45 additions & 0 deletions build-cran-package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,51 @@ cp \
external_libs/fmt/include/fmt/*.h \
${TEMP_R_DIR}/src/include/LightGBM/fmt/

# including only specific files from Eigen, to keep the R package
# small and avoid redistributing code with licenses incompatible with
# LightGBM's license
EIGEN_R_DIR=${TEMP_R_DIR}/src/include/Eigen
mkdir -p ${EIGEN_R_DIR}

modules="Cholesky Core Dense Eigenvalues Geometry Householder Jacobi LU QR SVD"
for eigen_module in ${modules}; do
cp eigen/Eigen/${eigen_module} ${EIGEN_R_DIR}/${eigen_module}
if [[ ${eigen_module} != "Dense" ]]; then
mkdir -p ${EIGEN_R_DIR}/src/${eigen_module}/
cp -R eigen/Eigen/src/${eigen_module}/* ${EIGEN_R_DIR}/src/${eigen_module}/
fi
done

mkdir -p ${EIGEN_R_DIR}/src/misc
cp -R eigen/Eigen/src/misc/* ${EIGEN_R_DIR}/src/misc/

mkdir -p ${EIGEN_R_DIR}/src/plugins
cp -R eigen/Eigen/src/plugins/* ${EIGEN_R_DIR}/src/plugins/

# include compile/eigen/CMakeLists.txt
# include compile/eigen/Eigen/CMakeLists.txt
# include compile/eigen/Eigen/Cholesky
# include compile/eigen/Eigen/Core
# include compile/eigen/Eigen/Dense
# include compile/eigen/Eigen/Eigenvalues
# include compile/eigen/Eigen/Geometry
# include compile/eigen/Eigen/Householder
# include compile/eigen/Eigen/Jacobi
# include compile/eigen/Eigen/LU
# include compile/eigen/Eigen/QR
# include compile/eigen/Eigen/SVD
# recursive-include compile/eigen/Eigen/src/Cholesky *
# recursive-include compile/eigen/Eigen/src/Core *
# recursive-include compile/eigen/Eigen/src/Eigenvalues *
# recursive-include compile/eigen/Eigen/src/Geometry *
# recursive-include compile/eigen/Eigen/src/Householder *
# recursive-include compile/eigen/Eigen/src/Jacobi *
# recursive-include compile/eigen/Eigen/src/LU *
# recursive-include compile/eigen/Eigen/src/misc *
# recursive-include compile/eigen/Eigen/src/plugins *
# recursive-include compile/eigen/Eigen/src/QR *
# recursive-include compile/eigen/Eigen/src/SVD *

cd ${TEMP_R_DIR}

# Remove files not needed for CRAN
Expand Down
8 changes: 8 additions & 0 deletions build_r.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,14 @@ if (USING_GPU) {
.handle_result(result)
}

result <- file.copy(
from = "eigen/"
, to = file.path(sprintf("%s/", TEMP_SOURCE_DIR), "include")
, recursive = TRUE
, overwrite = TRUE
)
.handle_result(result)

result <- file.copy(
from = "CMakeLists.txt"
, to = file.path(TEMP_R_DIR, "inst", "bin/")
Expand Down
1 change: 0 additions & 1 deletion include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ struct Config {
// descl2 = missing values must be encoded as ``np.nan`` (Python) or ``NA`` (CLI), not ``0``
// descl2 = it is recommended to rescale data before training so that features have similar mean and standard deviation
// descl2 = **Note**: only works with CPU and ``serial`` tree learner
// descl2 = **Note**: not yet supported in R-package
// descl2 = **Note**: ``regression_l1`` objective is not supported with linear tree boosting
// descl2 = **Note**: setting ``linear_tree=true`` significantly increases the memory use of LightGBM
bool linear_tree = false;
Expand Down
Loading

0 comments on commit 89474ed

Please sign in to comment.