diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 39bf667031c1..925a15631fd1 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -100,6 +100,9 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
+    importance_type: string, default "gain"
+        The feature importance type for the feature_importances_ property: either "gain",
+        "weight", "cover", "total_gain" or "total_cover".
     \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object. Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
@@ -133,7 +136,8 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
+                 base_score=0.5, random_state=0, seed=None, missing=None,
+                 importance_type="gain", **kwargs):
         if not SKLEARN_INSTALLED:
             raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
@@ -159,6 +163,7 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
         self.random_state = random_state
         self.nthread = nthread
         self.n_jobs = n_jobs
+        self.importance_type = importance_type

     def __setstate__(self, state):
         # backward compatibility code
@@ -517,8 +522,8 @@ def feature_importances_(self):
             raise AttributeError('Feature importance is not defined for Booster type {}'
                                  .format(self.booster))
         b = self.get_booster()
-        fs = b.get_fscore()
-        all_features = [fs.get(f, 0.) for f in b.feature_names]
+        score = b.get_score(importance_type=self.importance_type)
+        all_features = [score.get(f, 0.) for f in b.feature_names]
         all_features = np.array(all_features, dtype=np.float32)
         return all_features / all_features.sum()
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 38e19922a86c..56243511d2d6 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -104,14 +104,14 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)


-def test_feature_importances():
+def test_feature_importances_weight():
     tm._skip_if_no_sklearn()
     from sklearn.datasets import load_digits

     digits = load_digits(2)
     y = digits['target']
     X = digits['data']
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)

     exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333,
                     0., 0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
@@ -127,10 +127,39 @@ def test_feature_importances():
     import pandas as pd
     y = pd.Series(digits['target'])
     X = pd.DataFrame(digits['data'])
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+
+def test_feature_importances_gain():
+    tm._skip_if_no_sklearn()
+    from sklearn.datasets import load_digits
+
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+
+    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00326159, 0., 0., 0.,
+                    0., 0., 0., 0., 0., 0.00297238, 0.00988034, 0., 0., 0., 0.,
+                    0., 0., 0.03512521, 0.41123885, 0., 0., 0., 0., 0.01326332,
+                    0.00160674, 0., 0.4206952, 0., 0., 0., 0., 0.00616747, 0.01237546,
+                    0., 0., 0., 0., 0., 0., 0., 0.08240705, 0., 0., 0., 0.,
+                    0., 0., 0., 0.00100649, 0., 0., 0., 0., 0.], dtype=np.float32)
+
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    # numeric columns
+    import pandas as pd
+    y = pd.Series(digits['target'])
+    X = pd.DataFrame(digits['data'])
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
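
For context, a minimal usage sketch of the new parameter (not part of the patch; it mirrors the digits setup used in the tests above). With this change, `feature_importances_` is backed by `Booster.get_score(importance_type=...)` instead of `get_fscore()`, so the reported importances depend on the configured type: `"gain"` becomes the default, and `"weight"` reproduces the old `get_fscore()` behaviour.

```python
# Usage sketch for the new importance_type parameter (illustrative only).
import xgboost as xgb
from sklearn.datasets import load_digits

digits = load_digits(2)
X, y = digits['data'], digits['target']

# "gain" is the new default importance type.
clf_gain = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
print(clf_gain.feature_importances_)

# "weight" (split count) reproduces the pre-patch get_fscore() behaviour.
clf_weight = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
print(clf_weight.feature_importances_)
```

Note that the property normalizes the scores so they sum to 1, which makes the values comparable across features within one model but not across importance types.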