diff --git a/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoost.java b/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoost.java
index cc596e3d1af6..e8c507eb4110 100755
--- a/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoost.java
+++ b/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoost.java
@@ -231,6 +231,13 @@ public class XGBoost extends ModelBuilder
@@ ... @@ public static Map createParamsMap(XGBoostParameters p, int nClas
       p._n_estimators = p._ntrees;
     }
     if (p._eta != 0.3) {
-      LOG.info("Using user-provided parameter eta instead of learn_rate.");
       params.put("eta", p._eta);
       p._learn_rate = p._eta;
     } else {
@@ -321,7 +320,6 @@ public static Map createParamsMap(XGBoostParameters p, int nClas
       params.put("silent", p._quiet_mode);
     }
     if (p._subsample != 1.0) {
-      LOG.info("Using user-provided parameter subsample instead of sample_rate.");
       params.put("subsample", p._subsample);
       p._sample_rate = p._subsample;
     } else {
@@ -329,7 +327,6 @@ public static Map createParamsMap(XGBoostParameters p, int nClas
       p._subsample = p._sample_rate;
     }
     if (p._colsample_bytree != 1.0) {
-      LOG.info("Using user-provided parameter colsample_bytree instead of col_sample_rate_per_tree.");
       params.put("colsample_bytree", p._colsample_bytree);
       p._col_sample_rate_per_tree = p._colsample_bytree;
     } else {
@@ -337,7 +334,6 @@ public static Map createParamsMap(XGBoostParameters p, int nClas
       p._colsample_bytree = p._col_sample_rate_per_tree;
     }
     if (p._colsample_bylevel != 1.0) {
-      LOG.info("Using user-provided parameter colsample_bylevel instead of col_sample_rate.");
       params.put("colsample_bylevel", p._colsample_bylevel);
       p._col_sample_rate = p._colsample_bylevel;
     } else {
@@ -348,7 +344,6 @@ public static Map createParamsMap(XGBoostParameters p, int nClas
       params.put("colsample_bynode", p._colsample_bynode);
     }
     if (p._max_delta_step != 0) {
-      LOG.info("Using user-provided parameter max_delta_step instead of max_abs_leafnode_pred.");
       params.put("max_delta_step", p._max_delta_step);
       p._max_abs_leafnode_pred = p._max_delta_step;
     } else {
diff --git a/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java b/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java
index 26ba66459c3d..194ea8e04cd8 100755
--- a/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java
+++ b/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java
@@ -2623,4 +2623,140 @@ public void testScalePosWeight() {
     }
   }
 
+  @Test
+  public void testColSampleRate() {
+    Scope.enter();
+    try {
+      XGBoostModel model1, model2;
+      Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");
+
+      train.remove("Site").remove();
+      train.remove("Method").remove();
+      train.toCategoricalCol("Angaus");
+      Scope.track(train);
+
+      XGBoostModel.XGBoostParameters parms = new XGBoostModel.XGBoostParameters();
+      parms._train = train._key;
+      parms._valid = train._key;
+      parms._response_column = "Angaus";
+      parms._distribution = multinomial;
+      parms._ntrees = 5;
+      parms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
+      parms._seed = 42;
+      parms._col_sample_rate = 0.9;
+      model1 = new hex.tree.xgboost.XGBoost(parms).trainModel().get();
+      Scope.track_generic(model1);
+
+      XGBoostModel.XGBoostParameters parms2 = new XGBoostModel.XGBoostParameters();
+      parms2._train = train._key;
+      parms2._valid = train._key;
+      parms2._response_column = "Angaus";
+      parms2._distribution = multinomial;
+      parms2._ntrees = 5;
+      parms2._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
+      parms2._seed = 42;
+      parms2._col_sample_rate = 0.1;
+      model2 = new hex.tree.xgboost.XGBoost(parms2).trainModel().get();
+      Scope.track_generic(model2);
+      assertNotEquals(model1._output._training_metrics.rmse(), model2._output._training_metrics.rmse(), 0);
+    } finally {
+      Scope.exit();
+    }
+  }
+
+  @Test
+  public void testColSampleRateSameValue() {
+    Scope.enter();
+    try {
+      Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");
+
+      train.remove("Site").remove();
+      train.remove("Method").remove();
+      train.toCategoricalCol("Angaus");
+      Scope.track(train);
+
+      XGBoostModel model1, model2;
+      XGBoostModel.XGBoostParameters parms = new XGBoostModel.XGBoostParameters();
+      parms._train = train._key;
+      parms._valid = train._key;
+      parms._response_column = "Angaus";
+      parms._distribution = multinomial;
+      parms._ntrees = 5;
+      parms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
+      parms._seed = 42;
+      parms._col_sample_rate = 0.9;
+      model1 = new hex.tree.xgboost.XGBoost(parms).trainModel().get();
+      Scope.track_generic(model1);
+
+      parms._colsample_bylevel = 0.9;
+      model2 = new hex.tree.xgboost.XGBoost(parms).trainModel().get();
+      Scope.track_generic(model2);
+      assertEquals(model1._output._training_metrics.rmse(), model2._output._training_metrics.rmse(), 0);
+    } finally {
+      Scope.exit();
+    }
+  }
+
+  @Test
+  public void testColSampleRateAndAlias() {
+    Scope.enter();
+    try {
+      XGBoostModel model1;
+      Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");
+
+      // Fix training set
+      train.remove("Site").remove();
+      train.remove("Method").remove();
+      train.toCategoricalCol("Angaus");
+      Scope.track(train);
+
+      XGBoostModel.XGBoostParameters parms = new XGBoostModel.XGBoostParameters();
+      parms._train = train._key;
+      parms._valid = train._key;
+      parms._response_column = "Angaus";
+      parms._distribution = multinomial;
+      parms._ntrees = 5;
+      parms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
+      parms._seed = 42;
+      parms._col_sample_rate = 0.9;
+      parms._colsample_bylevel = 0.3;
+      model1 = new hex.tree.xgboost.XGBoost(parms).trainModel().get();
+      Scope.track_generic(model1);
+      fail("Model training should fail.");
+    } catch(H2OModelBuilderIllegalArgumentException ex){
+      assertTrue(ex.getMessage().contains("col_sample_rate and its alias colsample_bylevel are both set"));
+    } finally {
+      Scope.exit();
+    }
+  }
+
+  @Test
+  public void testColSampleRateAndAliasSame() {
+    Scope.enter();
+    try {
+      XGBoostModel model1;
+      Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");
+
+      train.remove("Site").remove();
+      train.remove("Method").remove();
+      train.toCategoricalCol("Angaus");
+      Scope.track(train);
+
+      XGBoostModel.XGBoostParameters parms = new XGBoostModel.XGBoostParameters();
+      parms._train = train._key;
+      parms._valid = train._key;
+      parms._response_column = "Angaus";
+      parms._distribution = multinomial;
+      parms._ntrees = 5;
+      parms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
+      parms._seed = 42;
+      parms._col_sample_rate = 0.9;
+      parms._colsample_bylevel = 0.9;
+      model1 = new hex.tree.xgboost.XGBoost(parms).trainModel().get();
+      Scope.track_generic(model1);
+      assertEquals(model1._parms._col_sample_rate, model1._parms._colsample_bylevel, 0);
+    } finally {
+      Scope.exit();
+    }
+  }
 }
diff --git a/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/remote/SteamExecutorStarterTest.java b/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/remote/SteamExecutorStarterTest.java
index c2f815380717..8ec157322f36 100644
--- a/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/remote/SteamExecutorStarterTest.java
+++ b/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/remote/SteamExecutorStarterTest.java
@@ -245,8 +245,12 @@ public void testSteamClusterStop() throws Exception {
         expectAndCheckStopResponse(steam, "03_stop_req_02", true);
 
         // building another model
-        params._ntrees = 5;
-        Job model2 = new XGBoost(params).trainModel();
+        XGBoostModel.XGBoostParameters params2 = new XGBoostModel.XGBoostParameters();
+        params2._train = train._key;
+        params2._ntrees = 5;
+        params2._response_column = "AGE";
+        params2._ignored_columns = new String[]{"ID"};
+        Job model2 = new XGBoost(params2).trainModel();
 
         // will request external cluster start again
         Map startReq2 = steam.waitToReceiveMessage("start request");
diff --git a/h2o-py/tests/testdir_algos/xgboost/pyunit_synonym_params_xgboost.py b/h2o-py/tests/testdir_algos/xgboost/pyunit_synonym_params_xgboost.py
index 48f9e9cbe106..919cd1dcf629 100644
--- a/h2o-py/tests/testdir_algos/xgboost/pyunit_synonym_params_xgboost.py
+++ b/h2o-py/tests/testdir_algos/xgboost/pyunit_synonym_params_xgboost.py
@@ -3,6 +3,7 @@
 sys.path.insert(1,"../../../")
 from tests import pyunit_utils
 from h2o.estimators.xgboost import *
+from h2o.exceptions import H2OResponseError
 
 
 def xgboost_synonym_params():
@@ -24,23 +25,27 @@ def xgboost_synonym_params():
         vals = a[2]
         print("check parity of %s and %s via %s" % (p1, p2, vals))
         # check default values end up to same value
-        model.train(x=x, y=y, training_frame=df )
+        model.train(x=x, y=y, training_frame=df)
         assert model.parms[p1]['actual_value'] == model.parms[p2]['actual_value']
-        # changing p2 modifies both
+        # changing p2 and p1 is default - should not fail
         setattr(model, p2, vals[0])
-        model.train(x=x, y=y, training_frame=df )
+        model.train(x=x, y=y, training_frame=df)
         assert model.parms[p1]['actual_value'] == vals[0]
         assert model.parms[p1]['actual_value'], model.parms[p2]['actual_value']
-        # changing p1 modifies both
+        # changing p1 and p2 is not default - should fail
         setattr(model, p1, vals[1])
-        model.train(x=x, y=y, training_frame=df )
-        assert model.parms[p1]['actual_value'] == vals[1]
-        assert model.parms[p1]['actual_value'], model.parms[p2]['actual_value']
-        # changing p2 has no effect since p1 has precedence
+        try:
+            model.train(x=x, y=y, training_frame=df)
+        except H2OResponseError as e:
+            assert "ERRR on field: _"+p2 in str(e), p2+" and its alias "+p1+" are both set"
+        setattr(model, p2, vals[1])
+        # changing p2 since p1 has precedence and is not default - should fail
         setattr(model, p2, vals[2])
-        model.train(x=x, y=y, training_frame=df)
-        assert model.parms[p1]['actual_value'] == vals[1]
-        assert model.parms[p1]['actual_value'], model.parms[p2]['actual_value']
+        try:
+            model.train(x=x, y=y, training_frame=df)
+        except H2OResponseError as e:
+            assert "ERRR on field: _"+p2 in str(e), p2+" and its alias "+p1+" are both set"
+        setattr(model, p2, vals[1])
 
 
 if __name__ == "__main__":
diff --git a/h2o-py/tests/testdir_algos/xgboost/pyunit_xgboost_colsample_bylevel.py b/h2o-py/tests/testdir_algos/xgboost/pyunit_xgboost_colsample_bylevel.py
new file mode 100644
index 000000000000..d0b08d32e3da
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/xgboost/pyunit_xgboost_colsample_bylevel.py
@@ -0,0 +1,63 @@
+from h2o.estimators.xgboost import *
+from tests import pyunit_utils
+from h2o.exceptions import H2OResponseError
+
+
+def test_param_and_alias_are_same(data, x_names, y):
+    assert H2OXGBoostEstimator.available() is True
+
+    num_round = 5
+    params = {
+        'tree_method': 'hist',
+        'ntrees': num_round,
+        'backend': 'cpu',
+        'save_matrix_directory': "/home/mori/Documents/h2o/code/test/xgboost_data/",
+        'seed': 42,
+        'colsample_bylevel': 0.9,
+        'col_sample_rate': 0.9
+    }
+
+    # train h2o XGBoost models
+    h2o_model = H2OXGBoostEstimator(**params)
+    h2o_model.train(x=x_names, y=y, training_frame=data)
+
+    assert h2o_model is not None, "Training should not fail."
+
+
+def test_param_and_alias_are_not_same(data, x_names, y):
+    assert H2OXGBoostEstimator.available() is True
+
+    num_round = 5
+    params = {
+        'tree_method': 'hist',
+        'ntrees': num_round,
+        'backend': 'cpu',
+        'save_matrix_directory': "/home/mori/Documents/h2o/code/test/xgboost_data/",
+        'seed': 42,
+        'colsample_bylevel': 0.9,
+        'col_sample_rate': 0.3
+    }
+
+    # train h2o XGBoost models
+    h2o_model = H2OXGBoostEstimator(**params)
+    try:
+        h2o_model.train(x=x_names, y=y, training_frame=data)
+        assert False, "Training should fail."
+    except H2OResponseError as e:
+        assert "ERRR on field: _col_sample_rate" in str(e), \
+            "col_sample_rate and its alias colsample_bylevel are both set"
+
+
+def test_alias():
+    data = h2o.import_file(path="../../../../smalldata/gbm_test/ecology_model.csv")
+    y = "Angaus"
+    data[y] = data[y].asfactor()
+    x_names = data.col_names.remove(y)
+    test_param_and_alias_are_same(data, x_names, y)
+    test_param_and_alias_are_not_same(data, x_names, y)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_alias)
+else:
+    test_alias()