From f9c087958f271bcd14a2bc5c13398475803adf64 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 29 Jul 2020 15:16:07 +0800 Subject: [PATCH 1/3] Check ge. --- demo/regression/machine.conf | 20 +++++++++----------- src/learner.cc | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/demo/regression/machine.conf b/demo/regression/machine.conf index 14fc0f94ef10..4ba8437d5ae0 100644 --- a/demo/regression/machine.conf +++ b/demo/regression/machine.conf @@ -7,24 +7,22 @@ objective = reg:squarederror # Tree Booster Parameters # step size shrinkage -eta = 1.0 +eta = 1.0 # minimum loss reduction required to make a further partition -gamma = 1.0 +gamma = 1.0 # minimum sum of instance weight(hessian) needed in a child -min_child_weight = 1 +min_child_weight = 1 # maximum depth of a tree -max_depth = 3 +max_depth = 3 # Task parameters # the number of round to do boosting num_round = 2 # 0 means do not save any model except the final round model -save_period = 0 +save_period = 0 # The path of training data -data = "machine.txt.train" +data = "machine.txt.train" # The path of validation data, used to monitor training process, here [test] sets name of the validation set -eval[test] = "machine.txt.test" -# The path of test data -test:data = "machine.txt.test" - - +eval[test] = "machine.txt.test" +# The path of test data +test:data = "machine.txt.test" diff --git a/src/learner.cc b/src/learner.cc index b5151a13ae70..9aa7a05586f3 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1092,7 +1092,7 @@ class LearnerImpl : public LearnerIO { tparam_.dsplit == DataSplitMode::kAuto; }; if (row_based_split()) { - CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) + CHECK_GE(learner_model_param_.num_feature, p_fmat->Info().num_col_) << "Number of columns does not match number of features in booster."; } } From c8dd5af32e41fce4943ff3bb205ead132d440ea2 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 29 Jul 2020 15:39:38 +0800 Subject: [PATCH 2/3] Add tests. --- tests/cpp/predictor/test_cpu_predictor.cc | 4 +++ tests/cpp/predictor/test_gpu_predictor.cu | 3 ++ tests/cpp/predictor/test_predictor.cc | 40 +++++++++++++++++++++++ tests/cpp/predictor/test_predictor.h | 2 ++ tests/python/test_cli.py | 11 +++---- tests/python/test_demos.py | 15 +++++++++ tests/python/testing.py | 5 +++ 7 files changed, 74 insertions(+), 6 deletions(-) diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index cc34b1399ab9..ea8114579494 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -165,4 +165,8 @@ TEST(CpuPredictor, InplacePredict) { TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1); } } + +TEST(CpuPredictor, LesserFeatures) { + TestPredictionWithLesserFeatures("cpu_predictor"); +} } // namespace xgboost diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index f6fd3d7f0d53..b14b731142c2 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -160,5 +160,8 @@ TEST(GPUPredictor, MGPU_InplacePredict) { // NOLINT dmlc::Error); } +TEST(GpuPredictor, LesserFeatures) { + TestPredictionWithLesserFeatures("gpu_predictor"); +} } // namespace predictor } // namespace xgboost diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 7cff5095711b..3005d585f5a6 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -134,4 +134,44 @@ void TestInplacePrediction(dmlc::any x, std::string predictor, learner->SetParam("gpu_id", "-1"); learner->Configure(); } + +void TestPredictionWithLesserFeatures(std::string predictor_name) { + size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4; + auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true); + auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false); + std::unique_ptr learner{Learner::Create({m_train})}; + + for (size_t i = 0; i < kIters; ++i) { + learner->UpdateOneIter(i, m_train); + } + + HostDeviceVector prediction; + learner->SetParam("predictor", predictor_name); + learner->Configure(); + Json config{Object()}; + learner->SaveConfig(&config); + ASSERT_EQ(get(config["learner"]["gradient_booster"]["gbtree_train_param"]["predictor"]), predictor_name); + + learner->Predict(m_test, false, &prediction); + ASSERT_EQ(prediction.Size(), kRows); + + auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false); + ASSERT_THROW({learner->Predict(m_invalid, false, &prediction);}, dmlc::Error); + +#if defined(XGBOOST_USE_CUDA) + HostDeviceVector from_cpu; + learner->SetParam("predictor", "cpu_predictor"); + learner->Predict(m_test, false, &from_cpu); + + HostDeviceVector from_cuda; + learner->SetParam("predictor", "gpu_predictor"); + learner->Predict(m_test, false, &from_cuda); + + auto const& h_cpu = from_cpu.ConstHostVector(); + auto const& h_gpu = from_cuda.ConstHostVector(); + for (size_t i = 0; i < h_cpu.size(); ++i) { + ASSERT_NEAR(h_cpu[i], h_gpu[i], kRtEps); + } +#endif // defined(XGBOOST_USE_CUDA) +} } // namespace xgboost diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index 43c9c950d585..b6a3180111f2 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -59,6 +59,8 @@ void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method, void TestInplacePrediction(dmlc::any x, std::string predictor, bst_row_t rows, bst_feature_t cols, int32_t device = -1); + +void TestPredictionWithLesserFeatures(std::string preditor_name); } // namespace xgboost #endif // XGBOOST_TEST_PREDICTOR_H_ diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py index f953840b1950..3ff37ea521c8 100644 --- a/tests/python/test_cli.py +++ b/tests/python/test_cli.py @@ -6,6 +6,7 @@ import subprocess import numpy import json +import testing as tm class TestCLI(unittest.TestCase): @@ -28,22 +29,20 @@ class TestCLI(unittest.TestCase): eval[test] = {data_path} ''' - curdir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) - project_root = os.path.normpath( - os.path.join(curdir, os.path.pardir, os.path.pardir)) + PROJECT_ROOT = tm.PROJECT_ROOT def get_exe(self): if platform.system() == 'Windows': exe = 'xgboost.exe' else: exe = 'xgboost' - exe = os.path.join(self.project_root, exe) + exe = os.path.join(self.PROJECT_ROOT, exe) assert os.path.exists(exe) return exe def test_cli_model(self): data_path = "{root}/demo/data/agaricus.txt.train?format=libsvm".format( - root=self.project_root) + root=self.PROJECT_ROOT) exe = self.get_exe() seed = 1994 @@ -128,7 +127,7 @@ def test_cli_help(self): def test_cli_model_json(self): exe = self.get_exe() data_path = "{root}/demo/data/agaricus.txt.train?format=libsvm".format( - root=self.project_root) + root=self.PROJECT_ROOT) seed = 1994 with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index 99f3681392de..8b6535dbff45 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -117,3 +117,18 @@ def test_aft_demo(): # gamma regression is not tested as it requires running a R script first. # aft viz is not tested due to ploting is not controled # aft tunning is not tested due to extra dependency. + + +def test_cli_regression_demo(): + reg_dir = os.path.join(DEMO_DIR, 'regression') + script = os.path.join(reg_dir, 'mapfeat.py') + cmd = ['python', script] + subprocess.check_call(cmd, cwd=reg_dir) + + script = os.path.join(reg_dir, 'mknfold.py') + cmd = ['python', script, 'machine.txt', '1'] + subprocess.check_call(cmd, cwd=reg_dir) + + exe = os.path.join(tm.PROJECT_ROOT, 'xgboost') + conf = os.path.join(reg_dir, 'machine.conf') + subprocess.check_call([exe, conf], cwd=reg_dir) diff --git a/tests/python/testing.py b/tests/python/testing.py index 7ae202f01df5..c3f78f78e966 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -216,3 +216,8 @@ def _dataset_and_weight(draw): def non_increasing(L, tolerance=1e-4): return all((y - x) < tolerance for x, y in zip(L, L[1:])) + + +CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) +PROJECT_ROOT = os.path.normpath( + os.path.join(CURDIR, os.path.pardir, os.path.pardir)) From 1eda3ef99f7fbe511f2b2deca73f73fef22d9dd6 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 29 Jul 2020 18:03:56 +0800 Subject: [PATCH 3/3] Restrict it. --- jvm-packages/.gitignore | 1 - .../spark/FeatureSizeValidatingSuite.scala | 9 +++++--- src/learner.cc | 21 ++++++++++++------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/jvm-packages/.gitignore b/jvm-packages/.gitignore index becd9e300ce9..6d3f7b7cbf4b 100644 --- a/jvm-packages/.gitignore +++ b/jvm-packages/.gitignore @@ -1,3 +1,2 @@ tracker.py build.sh - diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala index f82927234490..0bbfb9f789a7 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala @@ -21,19 +21,23 @@ import org.apache.spark.Partitioner import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite +import org.apache.spark.sql.functions._ import scala.util.Random class FeatureSizeValidatingSuite extends FunSuite with PerTest { - test("transform throwing exception if feature size of dataset is different with model's") { + test("transform throwing exception if feature size of dataset is greater than model's") { val modelPath = getClass.getResource("/model/0.82/model").getPath val model = XGBoostClassificationModel.read.load(modelPath) val r = new Random(0) // 0.82/model was trained with 251 features. and transform will throw exception // if feature size of data is not equal to 251 - val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))). + var df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))). toDF("feature", "label") + for (x <- 1 to 252) { + df = df.withColumn(s"feature_${x}", lit(1)) + } val assembler = new VectorAssembler() .setInputCols(df.columns.filter(!_.contains("label"))) .setOutputCol("features") @@ -67,5 +71,4 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest { xgb.fit(repartitioned) } } - } diff --git a/src/learner.cc b/src/learner.cc index 9aa7a05586f3..928f5a88ffcc 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -946,7 +946,7 @@ class LearnerImpl : public LearnerIO { common::GlobalRandom().seed(generic_parameters_.seed * kRandSeedMagic + iter); } this->CheckDataSplitMode(); - this->ValidateDMatrix(train.get()); + this->ValidateDMatrix(train.get(), true); auto& predt = this->cache_.Cache(train, generic_parameters_.gpu_id); @@ -972,7 +972,7 @@ class LearnerImpl : public LearnerIO { common::GlobalRandom().seed(generic_parameters_.seed * kRandSeedMagic + iter); } this->CheckDataSplitMode(); - this->ValidateDMatrix(train.get()); + this->ValidateDMatrix(train.get(), true); this->cache_.Cache(train, generic_parameters_.gpu_id); gbm_->DoBoost(train.get(), in_gpair, &cache_.Entry(train.get())); @@ -994,7 +994,7 @@ class LearnerImpl : public LearnerIO { for (size_t i = 0; i < data_sets.size(); ++i) { std::shared_ptr m = data_sets[i]; auto &predt = this->cache_.Cache(m, generic_parameters_.gpu_id); - this->ValidateDMatrix(m.get()); + this->ValidateDMatrix(m.get(), false); this->PredictRaw(m.get(), &predt, false); auto &out = output_predictions_.Cache(m, generic_parameters_.gpu_id).predictions; @@ -1079,11 +1079,11 @@ class LearnerImpl : public LearnerIO { bool training, unsigned ntree_limit = 0) const { CHECK(gbm_ != nullptr) << "Predict must happen after Load or configuration"; - this->ValidateDMatrix(data); + this->ValidateDMatrix(data, false); gbm_->PredictBatch(data, out_preds, training, ntree_limit); } - void ValidateDMatrix(DMatrix* p_fmat) const { + void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const { MetaInfo const& info = p_fmat->Info(); info.Validate(generic_parameters_.gpu_id); @@ -1092,8 +1092,15 @@ class LearnerImpl : public LearnerIO { tparam_.dsplit == DataSplitMode::kAuto; }; if (row_based_split()) { - CHECK_GE(learner_model_param_.num_feature, p_fmat->Info().num_col_) - << "Number of columns does not match number of features in booster."; + if (is_training) { + CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) + << "Number of columns does not match number of features in " + "booster."; + } else { + CHECK_GE(learner_model_param_.num_feature, p_fmat->Info().num_col_) + << "Number of columns does not match number of features in " + "booster."; + } } }