microsoft · StrikerRUS · Jan 18, 2021 · Dec 31, 2020 · Dec 31, 2020 · Dec 31, 2020
@@ -1699,6 +1699,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
 # LightGBM-specific flags
 LGB_CPPFLAGS=""
 
+#########
+# Eigen #
+#########
+
+LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"
+
 ###############
 # MM_PREFETCH #
 ###############

@@ -26,6 +26,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
 # LightGBM-specific flags
 LGB_CPPFLAGS=""
 
+#########
+# Eigen #
+#########
+
+LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"
+
 ###############
 # MM_PREFETCH #
 ###############

@@ -12,6 +12,12 @@ CC=`"${R_EXE}" CMD config CC`
 # LightGBM-specific flags
 LGB_CPPFLAGS=""
 
+#########
+# Eigen #
+#########
+
+LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"
+
 ###############
 # MM_PREFETCH #
 ###############

@@ -345,6 +345,45 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric
   expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)])
 })
 
+test_that("lgb.cv() fit on linearly-relatead data improves when using linear learners", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    X <- matrix(rnorm(1000L), ncol = 1L)
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  cv_bst <- lgb.cv(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , nfold = 5L
+  )
+  expect_is(cv_bst, "lgb.CVBooster")
+
+  dtrain <- .new_dataset()
+  cv_bst_linear <- lgb.cv(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , nfold = 5L
+  )
+  expect_is(cv_bst_linear, "lgb.CVBooster")
+
+  expect_true(cv_bst_linear$best_score < cv_bst$best_score)
+})
+
 context("lgb.train()")
 
 test_that("lgb.train() works as expected with multiple eval metrics", {
@@ -1631,6 +1670,246 @@ test_that("early stopping works with lgb.cv()", {
   )
 })
 
+context("linear learner")
+
+test_that("lgb.train() fit on linearly-relatead data improves when using linear learners", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    X <- matrix(rnorm(100L), ncol = 1L)
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
+test_that("lgb.train() w/ linear learner fails already-constructed dataset with linear=false", {
+  testthat::skip("Skipping this test because it causes issues for valgrind")
+  set.seed(708L)
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- lgb.Dataset(
+    data = matrix(rnorm(100L), ncol = 1L)
+    , label = rnorm(100L)
+  )
+  dtrain$construct()
+  expect_error({
+    bst_linear <- lgb.train(
+      data = dtrain
+      , nrounds = 10L
+      , params = modifyList(params, list(linear_tree = TRUE))
+      , valids = list("train" = dtrain)
+    )
+  }, regexp = "Cannot change linear_tree after constructed Dataset handle")
+})
+
+test_that("lgb.train() works with linear learners even if Dataset has missing values", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    values <- rnorm(100L)
+    values[sample(seq_len(length(values)), size = 10L)] <- NA_real_
+    X <- matrix(
+      data = sample(values, size = 100L)
+      , ncol = 1L
+    )
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
+test_that("lgb.train() works with linear learners, bagging, and a Dataset that has missing values", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    values <- rnorm(100L)
+    values[sample(seq_len(length(values)), size = 10L)] <- NA_real_
+    X <- matrix(
+      data = sample(values, size = 100L)
+      , ncol = 1L
+    )
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+    , bagging_freq = 1L
+    , subsample = 0.8
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
+test_that("lgb.train() works with linear learners and data where a feature has only 1 non-NA value", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    values <- rep(NA_real_, 100L)
+    values[18L] <- rnorm(1L)
+    X <- matrix(
+      data = values
+      , ncol = 1L
+    )
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+})
+
+test_that("lgb.train() works with linear learners when Dataset has categorical features", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    X <- matrix(numeric(200L), nrow = 100L, ncol = 2L)
+    X[, 1L] <- rnorm(100L)
+    X[, 2L] <- sample(seq_len(4L), size = 100L, replace = TRUE)
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+    , categorical_featurs = 1L
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
 context("interaction constraints")
 
 test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {

@@ -37,6 +37,27 @@ cp \
     external_libs/fmt/include/fmt/*.h \
     ${TEMP_R_DIR}/src/include/LightGBM/fmt/
 
+# including only specific files from Eigen, to keep the R package
+# small and avoid redistributing code with licenses incompatible with
+# LightGBM's license
+EIGEN_R_DIR=${TEMP_R_DIR}/src/include/Eigen
+mkdir -p ${EIGEN_R_DIR}
+
+modules="Cholesky Core Dense Eigenvalues Geometry Householder Jacobi LU QR SVD"
+for eigen_module in ${modules}; do
+    cp eigen/Eigen/${eigen_module} ${EIGEN_R_DIR}/${eigen_module}
+    if [ ${eigen_module} != "Dense" ]; then
+        mkdir -p ${EIGEN_R_DIR}/src/${eigen_module}/
+        cp -R eigen/Eigen/src/${eigen_module}/* ${EIGEN_R_DIR}/src/${eigen_module}/
+    fi
+done
+
+mkdir -p ${EIGEN_R_DIR}/src/misc
+cp -R eigen/Eigen/src/misc/* ${EIGEN_R_DIR}/src/misc/
+
+mkdir -p ${EIGEN_R_DIR}/src/plugins
+cp -R eigen/Eigen/src/plugins/* ${EIGEN_R_DIR}/src/plugins/
+
 cd ${TEMP_R_DIR}
 
     # Remove files not needed for CRAN
@@ -69,6 +90,9 @@ cd ${TEMP_R_DIR}
     for file in $(find . -name '*.h' -o -name '*.hpp' -o -name '*.cpp'); do
       sed \
         -i.bak \
+        -e 's/^.*#pragma clang diagnostic.*$//' \
+        -e 's/^.*#pragma diag_suppress.*$//' \
+        -e 's/^.*#pragma GCC diagnostic.*$//' \
         -e 's/^.*#pragma region.*$//' \
         -e 's/^.*#pragma endregion.*$//' \
         -e 's/^.*#pragma warning.*$//' \