Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] bring pandas support to the sklearn wrapper back #904

Merged
merged 3 commits into from
Sep 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

import numpy as np
import warnings
try:
import pandas as pd
_IS_PANDAS_INSTALLED = True
except ImportError:
_IS_PANDAS_INSTALLED = False

from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, _LGBMClassifierBase, LGBMDeprecated,
Expand Down Expand Up @@ -332,7 +337,7 @@ def fit(self, X, y,
categorical_feature : list of strings or int, or 'auto', optional (default="auto")
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify feature_name as well).
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
callbacks : list of callback functions or None, optional (default=None)
List of callback functions that are applied at each iteration.
Expand Down Expand Up @@ -407,8 +412,10 @@ def fit(self, X, y,
feval = None
params['metric'] = eval_metric

X, y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
_LGBMCheckConsistentLength(X, y, sample_weight)
if not _IS_PANDAS_INSTALLED or not isinstance(X, pd.DataFrame):
X, y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
_LGBMCheckConsistentLength(X, y, sample_weight)

self._n_features = X.shape[1]

def _construct_dataset(X, y, sample_weight, init_score, group, params):
Expand Down Expand Up @@ -482,7 +489,8 @@ def predict(self, X, raw_score=False, num_iteration=0):
"""
if self._n_features is None:
raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.")
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
if not _IS_PANDAS_INSTALLED or not isinstance(X, pd.DataFrame):
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
n_features = X.shape[1]
if self._n_features != n_features:
raise ValueError("Number of features of the model must "
Expand All @@ -508,7 +516,8 @@ def apply(self, X, num_iteration=0):
"""
if self._n_features is None:
raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.")
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
if not _IS_PANDAS_INSTALLED or not isinstance(X, pd.DataFrame):
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
n_features = X.shape[1]
if self._n_features != n_features:
raise ValueError("Number of features of the model must "
Expand Down Expand Up @@ -686,7 +695,8 @@ def predict_proba(self, X, raw_score=False, num_iteration=0):
"""
if self._n_features is None:
raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.")
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
if not _IS_PANDAS_INSTALLED or not isinstance(X, pd.DataFrame):
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
n_features = X.shape[1]
if self._n_features != n_features:
raise ValueError("Number of features of the model must "
Expand Down
3 changes: 1 addition & 2 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_rf(self):
self.assertLess(ret, 0.25)
self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)

def test_regreesion(self):
def test_regression(self):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = {
Expand Down Expand Up @@ -444,7 +444,6 @@ def test_pandas_categorical(self):
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A', 'B', 'C', 'D'])
pred3 = list(gbm3.predict(X_test))
lgb_train = lgb.Dataset(X, y)
gbm3.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = list(gbm4.predict(X_test))
Expand Down
38 changes: 37 additions & 1 deletion tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
sklearn_at_least_019 = True
except ImportError:
sklearn_at_least_019 = False
try:
import pandas as pd
IS_PANDAS_INSTALLED = True
except ImportError:
IS_PANDAS_INSTALLED = False


def multi_error(y_true, y_pred):
Expand All @@ -40,7 +45,7 @@ def test_binary(self):
self.assertLess(ret, 0.15)
self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)

def test_regreesion(self):
def test_regression(self):
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
Expand Down Expand Up @@ -194,3 +199,34 @@ def test_sklearn_integration(self):
check(name, estimator)
except SkipTest as message:
warnings.warn(message, SkipTestWarning)

@unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
def test_pandas_categorical(self):
    """Check that LGBMClassifier accepts pandas DataFrames with categorical columns.

    All four columns of the train and test frames are cast to the pandas
    ``category`` dtype. The classifier is then fitted four times, with
    ``categorical_feature`` left at its default, given as a column index,
    given as a single column name, and given as all column names. The
    predicted labels of the four models must agree.

    The booster of the last model is also saved to disk, reloaded through
    ``lgb.Booster``, and its predictions are compared with the second
    column of ``predict_proba`` from the first model.
    """
    # Local imports keep this method self-contained; the model file is written
    # to a private temporary directory instead of the working directory so the
    # test leaves no artefact behind and cannot collide with other tests that
    # use the same 'categorical.model' filename.
    import os
    import shutil
    import tempfile

    X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                      "B": np.random.permutation([1, 2, 3] * 100),  # int
                      "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                      "D": np.random.permutation([True, False] * 150)})  # bool
    y = np.random.permutation([0, 1] * 150)
    # Column "A" of the test frame contains 'e', a value absent from the training frame.
    X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                           "B": np.random.permutation([1, 3] * 30),
                           "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                           "D": np.random.permutation([True, False] * 30)})
    for col in ["A", "B", "C", "D"]:
        X[col] = X[col].astype('category')
        X_test[col] = X_test[col].astype('category')

    # Same data, four different ways of specifying the categorical features.
    gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
    pred0 = list(gbm0.predict(X_test))
    gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
    pred1 = list(gbm1.predict(X_test))
    gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
    pred2 = list(gbm2.predict(X_test))
    gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
    pred3 = list(gbm3.predict(X_test))

    # Round-trip the underlying booster through a model file.
    tmp_dir = tempfile.mkdtemp()
    try:
        model_path = os.path.join(tmp_dir, 'categorical.model')
        gbm3.booster_.save_model(model_path)
        gbm4 = lgb.Booster(model_file=model_path)
        pred4 = list(gbm4.predict(X_test))
    finally:
        # Best-effort cleanup: a failure to delete must not mask a test failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
    np.testing.assert_almost_equal(pred0, pred1)
    np.testing.assert_almost_equal(pred0, pred2)
    np.testing.assert_almost_equal(pred0, pred3)
    np.testing.assert_almost_equal(pred_prob, pred4)