Skip to content

Commit

Permalink
PUBDEV-8266 XGBoost alias rewriting (#5641)
Browse files Browse the repository at this point in the history
PUBDEV-8266 fix alias rewriting
  • Loading branch information
maurever committed Oct 4, 2021
1 parent e23293e commit 716859a
Show file tree
Hide file tree
Showing 6 changed files with 237 additions and 18 deletions.
16 changes: 16 additions & 0 deletions h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoost.java
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,13 @@ public class XGBoost extends ModelBuilder<XGBoostModel,XGBoostModel.XGBoostParam
checkPositiveRate("colsample_bylevel", _parms._colsample_bylevel);
checkPositiveRate("colsample_bynode", _parms._colsample_bynode);
checkPositiveRate("colsample_bytree", _parms._colsample_bytree);
// Validate every H2O-style parameter against its XGBoost-native alias. The last argument is the
// default value that checkColumnAlias uses to decide whether the user touched the parameter.
checkColumnAlias("col_sample_rate", _parms._col_sample_rate, "colsample_bylevel", _parms._colsample_bylevel, 1);
checkColumnAlias("col_sample_rate_per_tree", _parms._col_sample_rate_per_tree, "colsample_bytree", _parms._colsample_bytree, 1);
checkColumnAlias("sample_rate", _parms._sample_rate, "subsample", _parms._subsample, 1);
// The alias of learn_rate is eta (not subsample); the name is used verbatim in error/warning messages.
checkColumnAlias("learn_rate", _parms._learn_rate, "eta", _parms._eta, 0.3);
checkColumnAlias("max_abs_leafnode_pred", _parms._max_abs_leafnode_pred, "max_delta_step", _parms._max_delta_step, 0);
checkColumnAlias("ntrees", _parms._ntrees, "n_estimators", _parms._n_estimators, 0);


if (_parms._scale_pos_weight != 1) {
if (_nclass != 2)
Expand Down Expand Up @@ -268,6 +275,15 @@ private void checkPositiveRate(String paramName, double rateValue) {
error("_" + paramName, paramName + " must be between 0 (exclusive) and 1 (inclusive)");
}

/**
 * Checks a parameter against its alias and reports conflicting settings.
 *
 * <p>An error is registered on the parameter when both the parameter and its alias differ from
 * the default value and also differ from each other. A warning is registered whenever the alias
 * differs from the default value, to tell the user that the alias value is the one being used.
 *
 * @param paramName    name of the parameter (without the leading underscore)
 * @param paramValue   current value of the parameter
 * @param aliasName    name of the alias parameter, used in the messages
 * @param aliasValue   current value of the alias parameter
 * @param defaultValue value treated as "not set by the user" for both the parameter and the alias
 */
private void checkColumnAlias(String paramName, double paramValue, String aliasName, double aliasValue, double defaultValue) {
  if (paramValue != defaultValue && aliasValue != defaultValue && paramValue != aliasValue) {
    error("_" + paramName, paramName + " and its alias " + aliasName + " are both set to different value than default value. Set " + aliasName + " to default value (" + defaultValue + "), to use " + paramName + " actual value.");
  }
  if (aliasValue != defaultValue) {
    // The message used to end with a stray escaped quote (.\"), which leaked a dangling '"' to users.
    warn("_" + paramName, "Using user-provided parameter " + aliasName + " instead of " + paramName + ".");
  }
}

@Override
protected void checkEarlyStoppingReproducibility() {
if (_parms._score_tree_interval == 0 && !_parms._score_each_iteration) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,6 @@ public static Map<String, Object> createParamsMap(XGBoostParameters p, int nClas
p._n_estimators = p._ntrees;
}
if (p._eta != 0.3) {
LOG.info("Using user-provided parameter eta instead of learn_rate.");
params.put("eta", p._eta);
p._learn_rate = p._eta;
} else {
Expand All @@ -321,23 +320,20 @@ public static Map<String, Object> createParamsMap(XGBoostParameters p, int nClas
params.put("silent", p._quiet_mode);
}
if (p._subsample != 1.0) {
LOG.info("Using user-provided parameter subsample instead of sample_rate.");
params.put("subsample", p._subsample);
p._sample_rate = p._subsample;
} else {
params.put("subsample", p._sample_rate);
p._subsample = p._sample_rate;
}
if (p._colsample_bytree != 1.0) {
LOG.info("Using user-provided parameter colsample_bytree instead of col_sample_rate_per_tree.");
params.put("colsample_bytree", p._colsample_bytree);
p._col_sample_rate_per_tree = p._colsample_bytree;
} else {
params.put("colsample_bytree", p._col_sample_rate_per_tree);
p._colsample_bytree = p._col_sample_rate_per_tree;
}
if (p._colsample_bylevel != 1.0) {
LOG.info("Using user-provided parameter colsample_bylevel instead of col_sample_rate.");
params.put("colsample_bylevel", p._colsample_bylevel);
p._col_sample_rate = p._colsample_bylevel;
} else {
Expand All @@ -348,7 +344,6 @@ public static Map<String, Object> createParamsMap(XGBoostParameters p, int nClas
params.put("colsample_bynode", p._colsample_bynode);
}
if (p._max_delta_step != 0) {
LOG.info("Using user-provided parameter max_delta_step instead of max_abs_leafnode_pred.");
params.put("max_delta_step", p._max_delta_step);
p._max_abs_leafnode_pred = p._max_delta_step;
} else {
Expand Down
136 changes: 136 additions & 0 deletions h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -2623,4 +2623,140 @@ public void testScalePosWeight() {
}
}

/**
 * Trains two models that differ only in {@code _col_sample_rate} (0.9 vs 0.1) and expects
 * their training RMSE to differ.
 */
@Test
public void testColSampleRate() {
  Scope.enter();
  try {
    Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");

    train.remove("Site").remove();
    train.remove("Method").remove();
    train.toCategoricalCol("Angaus");
    Scope.track(train);

    // One model per sampling rate; everything else is configured identically.
    final double[] colSampleRates = {0.9, 0.1};
    final XGBoostModel[] models = new XGBoostModel[colSampleRates.length];
    for (int i = 0; i < colSampleRates.length; i++) {
      XGBoostModel.XGBoostParameters modelParms = new XGBoostModel.XGBoostParameters();
      modelParms._train = train._key;
      modelParms._valid = train._key;
      modelParms._response_column = "Angaus";
      modelParms._distribution = multinomial;
      modelParms._ntrees = 5;
      modelParms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
      modelParms._seed = 42;
      modelParms._col_sample_rate = colSampleRates[i];
      models[i] = new hex.tree.xgboost.XGBoost(modelParms).trainModel().get();
      Scope.track_generic(models[i]);
    }

    assertNotEquals(models[0]._output._training_metrics.rmse(), models[1]._output._training_metrics.rmse(), 0);
  } finally {
    Scope.exit();
  }
}

/**
 * Trains once with only {@code _col_sample_rate} set, then again after also setting the alias
 * {@code _colsample_bylevel} to the same value, and expects identical training RMSE.
 */
@Test
public void testColSampleRateSameValue() {
  Scope.enter();
  try {
    Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");

    train.remove("Site").remove();
    train.remove("Method").remove();
    train.toCategoricalCol("Angaus");
    Scope.track(train);

    XGBoostModel.XGBoostParameters sharedParms = new XGBoostModel.XGBoostParameters();
    sharedParms._train = train._key;
    sharedParms._valid = train._key;
    sharedParms._response_column = "Angaus";
    sharedParms._distribution = multinomial;
    sharedParms._ntrees = 5;
    sharedParms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
    sharedParms._seed = 42;
    sharedParms._col_sample_rate = 0.9;

    // First run: only the H2O-style parameter is set.
    XGBoostModel paramOnlyModel = new hex.tree.xgboost.XGBoost(sharedParms).trainModel().get();
    Scope.track_generic(paramOnlyModel);

    // Second run: the same parameters object, now with the alias set to the same value.
    sharedParms._colsample_bylevel = 0.9;
    XGBoostModel paramAndAliasModel = new hex.tree.xgboost.XGBoost(sharedParms).trainModel().get();
    Scope.track_generic(paramAndAliasModel);

    double paramOnlyRmse = paramOnlyModel._output._training_metrics.rmse();
    double paramAndAliasRmse = paramAndAliasModel._output._training_metrics.rmse();
    assertEquals(paramOnlyRmse, paramAndAliasRmse, 0);
  } finally {
    Scope.exit();
  }
}

/**
 * Sets {@code _col_sample_rate} and its alias {@code _colsample_bylevel} to different values
 * and expects model building to be rejected with the alias-conflict message.
 */
@Test
public void testColSampleRateAndAlias() {
  Scope.enter();
  try {
    Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");

    // Fix training set
    train.remove("Site").remove();
    train.remove("Method").remove();
    train.toCategoricalCol("Angaus");
    Scope.track(train);

    XGBoostModel.XGBoostParameters conflictingParms = new XGBoostModel.XGBoostParameters();
    conflictingParms._train = train._key;
    conflictingParms._valid = train._key;
    conflictingParms._response_column = "Angaus";
    conflictingParms._distribution = multinomial;
    conflictingParms._ntrees = 5;
    conflictingParms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
    conflictingParms._seed = 42;
    // Parameter and alias are set to two different values.
    conflictingParms._col_sample_rate = 0.9;
    conflictingParms._colsample_bylevel = 0.3;

    // Track the model in case training unexpectedly succeeds, then fail the test.
    Scope.track_generic(new hex.tree.xgboost.XGBoost(conflictingParms).trainModel().get());
    fail("Model training should fail.");
  } catch (H2OModelBuilderIllegalArgumentException expected) {
    assertTrue(expected.getMessage().contains("col_sample_rate and its alias colsample_bylevel are both set"));
  } finally {
    Scope.exit();
  }
}

/**
 * Sets {@code _col_sample_rate} and its alias {@code _colsample_bylevel} to the same value and
 * expects training to succeed with both parameters equal on the trained model.
 */
@Test
public void testColSampleRateAndAliasSame() {
  Scope.enter();
  try {
    Frame train = parseTestFile("smalldata/gbm_test/ecology_model.csv");

    train.remove("Site").remove();
    train.remove("Method").remove();
    train.toCategoricalCol("Angaus");
    Scope.track(train);

    XGBoostModel.XGBoostParameters matchingParms = new XGBoostModel.XGBoostParameters();
    matchingParms._train = train._key;
    matchingParms._valid = train._key;
    matchingParms._response_column = "Angaus";
    matchingParms._distribution = multinomial;
    matchingParms._ntrees = 5;
    matchingParms._tree_method = XGBoostModel.XGBoostParameters.TreeMethod.hist;
    matchingParms._seed = 42;
    // Parameter and alias carry the same value.
    matchingParms._col_sample_rate = 0.9;
    matchingParms._colsample_bylevel = 0.9;

    XGBoostModel trainedModel = new hex.tree.xgboost.XGBoost(matchingParms).trainModel().get();
    Scope.track_generic(trainedModel);
    assertEquals(trainedModel._parms._col_sample_rate, trainedModel._parms._colsample_bylevel, 0);
  } finally {
    Scope.exit();
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,12 @@ public void testSteamClusterStop() throws Exception {
expectAndCheckStopResponse(steam, "03_stop_req_02", true);

// building another model
params._ntrees = 5;
Job<XGBoostModel> model2 = new XGBoost(params).trainModel();
XGBoostModel.XGBoostParameters params2 = new XGBoostModel.XGBoostParameters();
params2._train = train._key;
params2._ntrees = 5;
params2._response_column = "AGE";
params2._ignored_columns = new String[]{"ID"};
Job<XGBoostModel> model2 = new XGBoost(params2).trainModel();

// will request external cluster start again
Map<String, String> startReq2 = steam.waitToReceiveMessage("start request");
Expand Down
27 changes: 16 additions & 11 deletions h2o-py/tests/testdir_algos/xgboost/pyunit_synonym_params_xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
sys.path.insert(1,"../../../")
from tests import pyunit_utils
from h2o.estimators.xgboost import *
from h2o.exceptions import H2OResponseError


def xgboost_synonym_params():
Expand All @@ -24,23 +25,27 @@ def xgboost_synonym_params():
vals = a[2]
print("check parity of %s and %s via %s" % (p1, p2, vals))
# check default values end up to same value
model.train(x=x, y=y, training_frame=df )
model.train(x=x, y=y, training_frame=df)
assert model.parms[p1]['actual_value'] == model.parms[p2]['actual_value']
# changing p2 modifies both
# changing p2 and p1 is default - should not fail
setattr(model, p2, vals[0])
model.train(x=x, y=y, training_frame=df )
model.train(x=x, y=y, training_frame=df)
assert model.parms[p1]['actual_value'] == vals[0]
assert model.parms[p1]['actual_value'], model.parms[p2]['actual_value']
# changing p1 modifies both
# changing p1 and p2 is not default - should fail
setattr(model, p1, vals[1])
model.train(x=x, y=y, training_frame=df )
assert model.parms[p1]['actual_value'] == vals[1]
assert model.parms[p1]['actual_value'], model.parms[p2]['actual_value']
# changing p2 has no effect since p1 has precedence
try:
model.train(x=x, y=y, training_frame=df)
except H2OResponseError as e:
assert "ERRR on field: _"+p2 in str(e), p2+" and its alias "+p1+" are both set"
setattr(model, p2, vals[1])
# changing p2 since p1 has precedence and is not default - should fail
setattr(model, p2, vals[2])
model.train(x=x, y=y, training_frame=df)
assert model.parms[p1]['actual_value'] == vals[1]
assert model.parms[p1]['actual_value'], model.parms[p2]['actual_value']
try:
model.train(x=x, y=y, training_frame=df)
except H2OResponseError as e:
assert "ERRR on field: _"+p2 in str(e), p2+" and its alias "+p1+" are both set"
setattr(model, p2, vals[1])


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from h2o.estimators.xgboost import *
from tests import pyunit_utils
from h2o.exceptions import H2OResponseError


def test_param_and_alias_are_same(data, x_names, y):
    """Train with ``col_sample_rate`` and its alias ``colsample_bylevel`` set to the same value.

    Training is expected to complete without raising.

    :param data: training frame
    :param x_names: predictor column names passed to ``train`` as ``x``
    :param y: response column name
    """
    assert H2OXGBoostEstimator.available() is True

    num_round = 5
    # No 'save_matrix_directory' here: a hard-coded developer-local path makes the test
    # non-portable and is not needed to exercise the alias check.
    params = {
        'tree_method': 'hist',
        'ntrees': num_round,
        'backend': 'cpu',
        'seed': 42,
        'colsample_bylevel': 0.9,
        'col_sample_rate': 0.9
    }

    # train h2o XGBoost models
    h2o_model = H2OXGBoostEstimator(**params)
    h2o_model.train(x=x_names, y=y, training_frame=data)

    assert h2o_model is not None, "Training should not fail."


def test_param_and_alias_are_not_same(data, x_names, y):
    """Train with ``col_sample_rate`` and its alias ``colsample_bylevel`` set to different values.

    Training is expected to raise ``H2OResponseError`` reporting an error on ``_col_sample_rate``.

    :param data: training frame
    :param x_names: predictor column names passed to ``train`` as ``x``
    :param y: response column name
    """
    assert H2OXGBoostEstimator.available() is True

    num_round = 5
    # No 'save_matrix_directory' here: a hard-coded developer-local path makes the test
    # non-portable and is not needed to exercise the alias check.
    params = {
        'tree_method': 'hist',
        'ntrees': num_round,
        'backend': 'cpu',
        'seed': 42,
        'colsample_bylevel': 0.9,
        'col_sample_rate': 0.3
    }

    # train h2o XGBoost models
    h2o_model = H2OXGBoostEstimator(**params)
    try:
        h2o_model.train(x=x_names, y=y, training_frame=data)
        # AssertionError is not an H2OResponseError, so it propagates past the except below.
        assert False, "Training should fail."
    except H2OResponseError as e:
        assert "ERRR on field: _col_sample_rate" in str(e), \
            "col_sample_rate and its alias colsample_bylevel are both set"


def test_alias():
    """Load the ecology dataset and run both alias checks against it."""
    data = h2o.import_file(path="../../../../smalldata/gbm_test/ecology_model.csv")
    y = "Angaus"
    data[y] = data[y].asfactor()
    # list.remove() works in place and returns None, so the original assignment always
    # produced x_names = None. Build the predictor list explicitly instead.
    x_names = [name for name in data.col_names if name != y]
    test_param_and_alias_are_same(data, x_names, y)
    test_param_and_alias_are_not_same(data, x_names, y)


# Executed as a script: run through the pyunit standalone harness.
# Otherwise (e.g. loaded by a test runner): invoke the test directly.
if __name__ == "__main__":
    pyunit_utils.standalone_test(test_alias)
else:
    test_alias()

0 comments on commit 716859a

Please sign in to comment.