From d387d6c03f9342e58547b671f12008353ba30096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20MZ?= Date: Wed, 31 Mar 2021 21:37:51 -0600 Subject: [PATCH 01/13] initial implementation of init_score for multiclass classification --- python-package/lightgbm/basic.py | 46 ++++++++++++++++++++++--- tests/python_package_test/test_basic.py | 30 +++++++++++++++- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 0bdf5057b833..879cbe3add87 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -134,8 +134,8 @@ def is_numpy_column_array(data): return len(shape) == 2 and shape[1] == 1 -def cast_numpy_1d_array_to_dtype(array, dtype): - """Cast numpy 1d array to given dtype.""" +def cast_numpy_array_to_dtype(array, dtype): + """Cast numpy array to given dtype.""" if array.dtype == dtype: return array return array.astype(dtype=dtype, copy=False) @@ -149,11 +149,11 @@ def is_1d_list(data): def list_to_1d_numpy(data, dtype=np.float32, name='list'): """Convert data to numpy 1-D array.""" if is_numpy_1d_array(data): - return cast_numpy_1d_array_to_dtype(data, dtype) + return cast_numpy_array_to_dtype(data, dtype) elif is_numpy_column_array(data): _log_warning('Converting column-vector to 1d array') array = data.ravel() - return cast_numpy_1d_array_to_dtype(array, dtype) + return cast_numpy_array_to_dtype(array, dtype) elif is_1d_list(data): return np.array(data, dtype=dtype, copy=False) elif isinstance(data, pd_Series): @@ -165,6 +165,28 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name)) +def is_numpy_2d_array(data): + return isinstance(data, np.ndarray) and len(data.shape) == 2 and data.shape[1] > 1 + + +def is_2d_list(data): + return isinstance(data, list) and is_1d_list(data[0]) + + +def data_to_2d_numpy(data, dtype=np.float32, name='list'): + """Convert data to numpy 2-D array.""" + if is_numpy_2d_array(data): + return cast_numpy_array_to_dtype(data, dtype) + if is_2d_list(data): + return np.array(data, dtype=dtype) + if isinstance(data, pd_DataFrame): + if _get_bad_pandas_dtypes(data.dtypes): + raise ValueError('Series.dtypes must be int, float or bool') + return cast_numpy_array_to_dtype(data.values, dtype) + raise TypeError("Wrong type({0}) for {1}.\n" + "It should be list, numpy 2-D array or pandas DataFrame".format(type(data).__name__, name)) + + def cfloat32_array_to_numpy(cptr, length): """Convert a ctypes float pointer array to a numpy array.""" if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): @@ -1840,7 +1862,21 @@ def set_init_score(self, init_score): """ self.init_score = init_score if self.handle is not None and init_score is not None: - init_score = list_to_1d_numpy(init_score, np.float64, name='init_score') + try: + init_score = list_to_1d_numpy(init_score, np.float64, name='init_score') + except TypeError as err: + if self.params.get('num_class', 0) > 1: + init_score = data_to_2d_numpy(init_score, np.float64, name='init_score') + expected_samples, expected_classes = self.num_data(), self.params['num_class'] + n_samples, n_classes = init_score.shape + if n_samples != expected_samples or n_classes != expected_classes: + raise ValueError( + f'Expected init_score to be of shape ({expected_samples}, {expected_classes}). ' + f'Got ({n_samples}, {n_classes}).') + init_score = init_score.ravel() + self.init_score = init_score + else: + raise err self.set_field('init_score', init_score) self.init_score = self.get_field('init_score') # original values can be modified at cpp side return self diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 92c75d8879ee..f8e46e2f7ede 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -1,5 +1,6 @@ # coding: utf-8 import os +from lightgbm.basic import LightGBMError import numpy as np import pytest @@ -8,7 +9,7 @@ from sklearn.model_selection import train_test_split import lightgbm as lgb -from lightgbm.compat import PANDAS_INSTALLED, pd_Series +from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series from .utils import load_breast_cancer @@ -406,3 +407,30 @@ def test_list_to_1d_numpy(y, dtype): result = lgb.basic.list_to_1d_numpy(y, dtype=dtype) assert result.size == 10 assert result.dtype == dtype + + +@pytest.mark.parametrize( + 'init_score', + [ + np.random.rand(10, 3), + np.random.rand(10, 2), + pd_DataFrame(np.random.rand(10, 3)), + pd_DataFrame(np.random.rand(10, 2)), + [[1, 1, 1] for _ in range(10)], + [[1, 1] for _ in range(10)], + ] +) +def test_init_score_for_multiclass_classification(init_score): + data = np.random.rand(10, 2) + label = np.random.randint(low=0, high=3, size=10) + clf = lgb.LGBMClassifier() + is_wrong_2d_numpy_pandas = isinstance(init_score, (np.ndarray, pd_DataFrame)) and init_score.shape[1] == 2 + is_wrong_2d_list = isinstance(init_score, list) and len(init_score[0]) == 2 + shape_msg = 'Expected init_score to be of shape (10, 3). Got (10, 2).' + if is_wrong_2d_numpy_pandas or is_wrong_2d_list: + with pytest.raises(ValueError) as exc_info: + clf.fit(data, label, init_score=init_score) + assert exc_info.value.args[0] == shape_msg + return + clf.fit(data, label, init_score=init_score) + assert clf.fitted_ From 75bb7eff380e1ebf2b234a19b061cdcfcfd16370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20MZ?= Date: Sat, 3 Apr 2021 16:45:13 -0600 Subject: [PATCH 02/13] check for 1d or 2d collection in init_score --- python-package/lightgbm/basic.py | 45 +++++++++++++++++-------- tests/python_package_test/test_basic.py | 39 +++++++++------------ 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 879cbe3add87..b12c473bcf7d 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -146,6 +146,16 @@ def is_1d_list(data): return isinstance(data, list) and (not data or is_numeric(data[0])) +def is_1d_collection(data): + """Check whether data is a 1-D collection.""" + return ( + is_numpy_1d_array(data) + or is_numpy_column_array(data) + or is_1d_list(data) + or isinstance(data, pd_Series) + ) + + def list_to_1d_numpy(data, dtype=np.float32, name='list'): """Convert data to numpy 1-D array.""" if is_numpy_1d_array(data): @@ -166,13 +176,24 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): def is_numpy_2d_array(data): + """Check whether data is a numpy 2-D array.""" return isinstance(data, np.ndarray) and len(data.shape) == 2 and data.shape[1] > 1 def is_2d_list(data): + """Check whether data is a 2-D list.""" return isinstance(data, list) and is_1d_list(data[0]) +def is_2d_collection(data): + """Check whether data is a 2-D collection.""" + return ( + is_numpy_2d_array(data) + or is_2d_list(data) + or isinstance(data, pd_DataFrame) + ) + + def data_to_2d_numpy(data, dtype=np.float32, name='list'): """Convert data to numpy 2-D array.""" if is_numpy_2d_array(data): @@ -1862,21 +1883,17 @@ def set_init_score(self, init_score): """ self.init_score = init_score if self.handle is not None and init_score is not None: - try: + if is_1d_collection(init_score): init_score = list_to_1d_numpy(init_score, np.float64, name='init_score') - except TypeError as err: - if self.params.get('num_class', 0) > 1: - init_score = data_to_2d_numpy(init_score, np.float64, name='init_score') - expected_samples, expected_classes = self.num_data(), self.params['num_class'] - n_samples, n_classes = init_score.shape - if n_samples != expected_samples or n_classes != expected_classes: - raise ValueError( - f'Expected init_score to be of shape ({expected_samples}, {expected_classes}). ' - f'Got ({n_samples}, {n_classes}).') - init_score = init_score.ravel() - self.init_score = init_score - else: - raise err + elif is_2d_collection(init_score): + init_score = data_to_2d_numpy(init_score, np.float64, name='init_score') + init_score = init_score.ravel() + self.init_score = init_score + else: + raise TypeError( + 'init_score must be list, numpy 1-D array or pandas Series.\n' + 'In multiclass classification init_score must be list, numpy 2-D array or pandas DataFrame.' + ) self.set_field('init_score', init_score) self.init_score = self.get_field('init_score') # original values can be modified at cpp side return self diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index f8e46e2f7ede..ad8e01a49a75 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -1,6 +1,6 @@ # coding: utf-8 import os -from lightgbm.basic import LightGBMError +from lightgbm.basic import Dataset import numpy as np import pytest @@ -409,28 +409,19 @@ def test_list_to_1d_numpy(y, dtype): assert result.dtype == dtype -@pytest.mark.parametrize( - 'init_score', - [ - np.random.rand(10, 3), - np.random.rand(10, 2), - pd_DataFrame(np.random.rand(10, 3)), - pd_DataFrame(np.random.rand(10, 2)), - [[1, 1, 1] for _ in range(10)], - [[1, 1] for _ in range(10)], - ] -) -def test_init_score_for_multiclass_classification(init_score): +@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list']) +def test_init_score_for_multiclass_classification(init_score_type): + if init_score_type == 'array': + init_score = np.random.rand(10, 3) + elif init_score_type == 'dataframe': + if not PANDAS_INSTALLED: + pytest.skip('Pandas is not installed.') + init_score = pd_DataFrame(np.random.rand(10, 3)) + else: + init_score = [[1, 1, 1] for _ in range(10)] data = np.random.rand(10, 2) label = np.random.randint(low=0, high=3, size=10) - clf = lgb.LGBMClassifier() - is_wrong_2d_numpy_pandas = isinstance(init_score, (np.ndarray, pd_DataFrame)) and init_score.shape[1] == 2 - is_wrong_2d_list = isinstance(init_score, list) and len(init_score[0]) == 2 - shape_msg = 'Expected init_score to be of shape (10, 3). Got (10, 2).' - if is_wrong_2d_numpy_pandas or is_wrong_2d_list: - with pytest.raises(ValueError) as exc_info: - clf.fit(data, label, init_score=init_score) - assert exc_info.value.args[0] == shape_msg - return - clf.fit(data, label, init_score=init_score) - assert clf.fitted_ + ds = Dataset(data, label, init_score=init_score) + ds.construct() + init_score_array = lgb.basic.data_to_2d_numpy(init_score, dtype=np.float64) + np.testing.assert_equal(ds.init_score, init_score_array.ravel()) From cd9a41cac46fabc7b297645632a4c66ef74959cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20MZ?= Date: Sat, 3 Apr 2021 16:48:02 -0600 Subject: [PATCH 03/13] remove dataset import --- tests/python_package_test/test_basic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index ad8e01a49a75..b5f132e4d3d4 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -1,6 +1,5 @@ # coding: utf-8 import os -from lightgbm.basic import Dataset import numpy as np import pytest @@ -421,7 +420,7 @@ def test_init_score_for_multiclass_classification(init_score_type): init_score = [[1, 1, 1] for _ in range(10)] data = np.random.rand(10, 2) label = np.random.randint(low=0, high=3, size=10) - ds = Dataset(data, label, init_score=init_score) + ds = lgb.basic.Dataset(data, label, init_score=init_score) ds.construct() init_score_array = lgb.basic.data_to_2d_numpy(init_score, dtype=np.float64) np.testing.assert_equal(ds.init_score, init_score_array.ravel()) From 564429549486144a4001d507615860d674ccce8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20MZ?= Date: Wed, 23 Jun 2021 18:52:21 -0500 Subject: [PATCH 04/13] initial comments --- python-package/lightgbm/basic.py | 14 +++++++------- tests/python_package_test/test_basic.py | 14 ++++++-------- tests/python_package_test/test_dask.py | 9 ++++----- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b12c473bcf7d..d5406a619dcb 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -202,10 +202,10 @@ def data_to_2d_numpy(data, dtype=np.float32, name='list'): return np.array(data, dtype=dtype) if isinstance(data, pd_DataFrame): if _get_bad_pandas_dtypes(data.dtypes): - raise ValueError('Series.dtypes must be int, float or bool') + raise ValueError('DataFrame.dtypes must be int, float or bool') return cast_numpy_array_to_dtype(data.values, dtype) - raise TypeError("Wrong type({0}) for {1}.\n" - "It should be list, numpy 2-D array or pandas DataFrame".format(type(data).__name__, name)) + raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" + "It should be list of lists, numpy 2-D array or pandas DataFrame") def cfloat32_array_to_numpy(cptr, length): @@ -1873,7 +1873,8 @@ def set_init_score(self, init_score): Parameters ---------- - init_score : list, numpy 1-D array, pandas Series or None + init_score : list, numpy 1-D array, pandas Series or None. + For multiclass classification can also be list of lists, numpy 2-D array or pandas DataFrame. Init score for Booster. Returns @@ -1887,12 +1888,11 @@ def set_init_score(self, init_score): init_score = list_to_1d_numpy(init_score, np.float64, name='init_score') elif is_2d_collection(init_score): init_score = data_to_2d_numpy(init_score, np.float64, name='init_score') - init_score = init_score.ravel() - self.init_score = init_score + init_score = init_score.ravel(order='F') else: raise TypeError( 'init_score must be list, numpy 1-D array or pandas Series.\n' - 'In multiclass classification init_score must be list, numpy 2-D array or pandas DataFrame.' + 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' ) self.set_field('init_score', init_score) self.init_score = self.get_field('init_score') # original values can be modified at cpp side diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index b5f132e4d3d4..f4ffaf0d1d5e 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -410,17 +410,15 @@ def test_list_to_1d_numpy(y, dtype): @pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list']) def test_init_score_for_multiclass_classification(init_score_type): + init_score = [[0, 1, 2] for _ in range(10)] if init_score_type == 'array': - init_score = np.random.rand(10, 3) + init_score = np.array(init_score) elif init_score_type == 'dataframe': if not PANDAS_INSTALLED: pytest.skip('Pandas is not installed.') - init_score = pd_DataFrame(np.random.rand(10, 3)) - else: - init_score = [[1, 1, 1] for _ in range(10)] + init_score = pd_DataFrame(init_score) data = np.random.rand(10, 2) - label = np.random.randint(low=0, high=3, size=10) - ds = lgb.basic.Dataset(data, label, init_score=init_score) + ds = lgb.basic.Dataset(data, init_score=init_score) ds.construct() - init_score_array = lgb.basic.data_to_2d_numpy(init_score, dtype=np.float64) - np.testing.assert_equal(ds.init_score, init_score_array.ravel()) + expected_init_score = np.hstack([np.repeat(i, 10) for i in range(3)]) + np.testing.assert_equal(ds.init_score, expected_init_score) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 1ed7284ce305..1eb24a227a7c 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1261,17 +1261,16 @@ def test_init_score(task, output, client): 'time_out': 5 } init_score = random.random() - # init_scores must be a 1D array, even for multiclass classification - # where you need to provide 1 score per class for each row in X - # https://github.com/microsoft/LightGBM/issues/4046 size_factor = 1 if task == 'multiclass-classification': size_factor = 3 # number of classes if output.startswith('dataframe'): - init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor)) + # init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor)) + init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size)) else: - init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor)) + # init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor)) + init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) # value of the root node is 0 when init_score is set From e11a44c4232aa46237f7f9e8be1cca94c9f48528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20MZ?= Date: Fri, 25 Jun 2021 18:58:05 -0500 Subject: [PATCH 05/13] update dask test and docstrings --- python-package/lightgbm/basic.py | 4 +++- tests/python_package_test/test_dask.py | 7 ++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 8ebf5a41a033..07bd089e42c3 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -182,7 +182,7 @@ def is_numpy_2d_array(data): def is_2d_list(data): """Check whether data is a 2-D list.""" - return isinstance(data, list) and is_1d_list(data[0]) + return isinstance(data, list) and len(data) > 0 and is_1d_list(data[0]) def is_2d_collection(data): @@ -1114,6 +1114,7 @@ def __init__(self, data, label=None, reference=None, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) + For multiclass classification can also be list of lists, numpy 2-D array or pandas DataFrame. Init score for Dataset. silent : bool, optional (default=False) Whether to print messages during construction. @@ -1531,6 +1532,7 @@ def create_valid(self, data, label=None, weight=None, group=None, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) + For multiclass classification can also be list of lists, numpy 2-D array or pandas DataFrame. Init score for Dataset. silent : bool, optional (default=False) Whether to print messages during construction. diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 10c0bc93ed51..78a47598647c 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1277,17 +1277,14 @@ def test_init_score(task, output, cluster): 'time_out': 5 } init_score = random.random() - # init_scores must be a 1D array, even for multiclass classification - # where you need to provide 1 score per class for each row in X - # https://github.com/microsoft/LightGBM/issues/4046 size_factor = 1 if task == 'multiclass-classification': size_factor = 3 # number of classes if output.startswith('dataframe'): - init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor)) + init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size)) else: - init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor)) + init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) # value of the root node is 0 when init_score is set From a6b174408165e34ddd5c8dabc173acbfe5b69548 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Wed, 7 Jul 2021 22:10:36 -0500 Subject: [PATCH 06/13] update docstrings --- python-package/lightgbm/basic.py | 9 +++------ python-package/lightgbm/sklearn.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 07bd089e42c3..774395e92e86 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1113,8 +1113,7 @@ def __init__(self, data, label=None, reference=None, sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) - For multiclass classification can also be list of lists, numpy 2-D array or pandas DataFrame. + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None) Init score for Dataset. silent : bool, optional (default=False) Whether to print messages during construction. @@ -1531,8 +1530,7 @@ def create_valid(self, data, label=None, weight=None, group=None, sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) - For multiclass classification can also be list of lists, numpy 2-D array or pandas DataFrame. + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None) Init score for Dataset. silent : bool, optional (default=False) Whether to print messages during construction. @@ -1868,8 +1866,7 @@ def set_init_score(self, init_score): Parameters ---------- - init_score : list, numpy 1-D array, pandas Series or None. - For multiclass classification can also be list of lists, numpy 2-D array or pandas DataFrame. + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None) Init score for Booster. Returns diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index cb7dd95a92f0..7519022167b4 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -717,7 +717,7 @@ def _get_meta_data(collection, name, i): X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", y_shape="array-like of shape = [n_samples]", sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", - init_score_shape="array-like of shape = [n_samples] or None, optional (default=None)", + init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", group_shape="array-like or None, optional (default=None)" ) + "\n\n" + _lgbmmodel_doc_custom_eval_note From ad4495942d486435a7bfa2a5e382dc661c2e6f7a Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Mon, 12 Jul 2021 20:21:18 -0500 Subject: [PATCH 07/13] move logic to set_field. reshape back on get_field --- python-package/lightgbm/basic.py | 50 +++++++++++++++---------- tests/python_package_test/test_basic.py | 4 +- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 774395e92e86..b893baf07b56 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1637,8 +1637,8 @@ def set_field(self, field_name, data): ---------- field_name : string The field name of the information. - data : list, numpy 1-D array, pandas Series or None - The array of data to be set. + data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None + The data to be set. Returns ------- @@ -1656,12 +1656,19 @@ def set_field(self, field_name, data): ctypes.c_int(0), ctypes.c_int(FIELD_TYPE_MAPPER[field_name]))) return self - dtype = np.float32 if field_name == 'group': - dtype = np.int32 + data = list_to_1d_numpy(data, np.int32, name='group') elif field_name == 'init_score': - dtype = np.float64 - data = list_to_1d_numpy(data, dtype, name=field_name) + if is_1d_collection(data): + data = list_to_1d_numpy(data, np.float64, name='init_score') + elif is_2d_collection(data): + data = data_to_2d_numpy(data, np.float64, name='init_score') + data = data.ravel(order='F') + else: + raise TypeError( + 'init_score must be list, numpy 1-D array or pandas Series.\n' + 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' + ) if data.dtype == np.float32 or data.dtype == np.float64: ptr_data, type_data, _ = c_float_array(data) elif data.dtype == np.int32: @@ -1708,13 +1715,26 @@ def get_field(self, field_name): if tmp_out_len.value == 0: return None if out_type.value == C_API_DTYPE_INT32: - return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value) + arr = cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value) elif out_type.value == C_API_DTYPE_FLOAT32: - return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) + arr = cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) elif out_type.value == C_API_DTYPE_FLOAT64: - return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value) + arr = cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value) else: raise TypeError("Unknown type") + if field_name == 'init_score': + ptr_num_data = ctypes.c_int(0) + _safe_call( + _LIB.LGBM_DatasetGetNumData( + self.handle, + ctypes.byref(ptr_num_data), + ) + ) + num_data = ptr_num_data.value + num_classes = arr.size // num_data + if num_classes > 1: + arr = arr.reshape((num_data, num_classes), order='F') + return arr def set_categorical_feature(self, categorical_feature): """Set categorical features. @@ -1866,7 +1886,7 @@ def set_init_score(self, init_score): Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None Init score for Booster. Returns @@ -1876,16 +1896,6 @@ def set_init_score(self, init_score): """ self.init_score = init_score if self.handle is not None and init_score is not None: - if is_1d_collection(init_score): - init_score = list_to_1d_numpy(init_score, np.float64, name='init_score') - elif is_2d_collection(init_score): - init_score = data_to_2d_numpy(init_score, np.float64, name='init_score') - init_score = init_score.ravel(order='F') - else: - raise TypeError( - 'init_score must be list, numpy 1-D array or pandas Series.\n' - 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' - ) self.set_field('init_score', init_score) self.init_score = self.get_field('init_score') # original values can be modified at cpp side return self diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d96036a6f735..5d907702a0de 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -425,5 +425,5 @@ def test_init_score_for_multiclass_classification(init_score_type): data = np.random.rand(10, 2) ds = lgb.basic.Dataset(data, init_score=init_score) ds.construct() - expected_init_score = np.hstack([np.repeat(i, 10) for i in range(3)]) - np.testing.assert_equal(ds.init_score, expected_init_score) + np.testing.assert_equal(ds.init_score, init_score) + np.testing.assert_equal(ds.get_field('init_score'), init_score) From 2c7ef3c64ee61cc2b00506a34fce2ac2db971774 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Tue, 24 Aug 2021 22:00:32 -0500 Subject: [PATCH 08/13] add type hints and update docstrings for dask. fix Dataset.set_field --- python-package/lightgbm/basic.py | 22 +++++++++------------ python-package/lightgbm/dask.py | 26 ++++++++++++------------- tests/python_package_test/test_basic.py | 7 +++---- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index bdc502a30028..fdc2df2d92c5 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1951,19 +1951,22 @@ def set_field(self, field_name, data): ctypes.c_int(0), ctypes.c_int(FIELD_TYPE_MAPPER[field_name]))) return self - if field_name == 'group': - data = list_to_1d_numpy(data, np.int32, name='group') - elif field_name == 'init_score': + if field_name == 'init_score': + dtype = np.float64 if is_1d_collection(data): - data = list_to_1d_numpy(data, np.float64, name='init_score') + data = list_to_1d_numpy(data, dtype, name=field_name) elif is_2d_collection(data): - data = data_to_2d_numpy(data, np.float64, name='init_score') + data = data_to_2d_numpy(data, dtype, name=field_name) data = data.ravel(order='F') else: raise TypeError( 'init_score must be list, numpy 1-D array or pandas Series.\n' 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' ) + else: + dtype = np.int32 if field_name == 'group' else np.float32 + data = list_to_1d_numpy(data, dtype, name=field_name) + if data.dtype == np.float32 or data.dtype == np.float64: ptr_data, type_data, _ = c_float_array(data) elif data.dtype == np.int32: @@ -2018,14 +2021,7 @@ def get_field(self, field_name): else: raise TypeError("Unknown type") if field_name == 'init_score': - ptr_num_data = ctypes.c_int(0) - _safe_call( - _LIB.LGBM_DatasetGetNumData( - self.handle, - ctypes.byref(ptr_num_data), - ) - ) - num_data = ptr_num_data.value + num_data = self.num_data() num_classes = arr.size // num_data if num_classes > 1: arr = arr.reshape((num_data, num_classes), order='F') diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index bbd2ce4e07e8..38124b87dc88 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -375,7 +375,7 @@ def _train( params: Dict[str, Any], model_factory: Type[LGBMModel], sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, group: Optional[_DaskVectorLike] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, @@ -403,7 +403,7 @@ def _train( Class of the local underlying model. sample_weight : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) Weights of training data. - init_score : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) + init_score : Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None) Init score of training data. group : Dask Array or Dask Series or None, optional (default=None) Group/query data. @@ -422,7 +422,7 @@ def _train( Weights for each validation set in eval_set. eval_class_weight : list of dict or str, or None, optional (default=None) Class weights, one dict or str for each validation set in eval_set. - eval_init_score : list of Dask Arrays, Dask Series or None, optional (default=None) + eval_init_score : list of Dask Arrays, Dask Series, Dask DataFrames (for multi-class task) or None, optional (default=None) Initial model score for each validation set in eval_set. eval_group : list of Dask Arrays, Dask Series or None, optional (default=None) Group/query for each validation set in eval_set. @@ -1014,7 +1014,7 @@ def _lgb_dask_fit( X: _DaskMatrixLike, y: _DaskCollection, sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, group: Optional[_DaskVectorLike] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, @@ -1152,7 +1152,7 @@ def fit( X: _DaskMatrixLike, y: _DaskCollection, sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, eval_sample_weight: Optional[List[_DaskCollection]] = None, @@ -1185,10 +1185,10 @@ def fit( X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", - init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", - eval_init_score_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", + eval_init_score_shape="list of Dask Arrays, Dask Series, Dask DataFrames or None, optional (default=None)", eval_group_shape="list of Dask Arrays or Dask Series or None, optional (default=None)" ) @@ -1331,7 +1331,7 @@ def fit( X: _DaskMatrixLike, y: _DaskCollection, sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, eval_sample_weight: Optional[List[_DaskCollection]] = None, @@ -1362,10 +1362,10 @@ def fit( X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", - init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", - eval_init_score_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", + eval_init_score_shape="list of Dask Arrays, Dask Series, Dask DataFrames or None, optional (default=None)", eval_group_shape="list of Dask Arrays or Dask Series or None, optional (default=None)" ) @@ -1492,7 +1492,7 @@ def fit( X: _DaskMatrixLike, y: _DaskCollection, sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, group: Optional[_DaskVectorLike] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, @@ -1529,10 +1529,10 @@ def fit( X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", - init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", - eval_init_score_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", + eval_init_score_shape="list of Dask Arrays, Dask Series, Dask DataFrames or None, optional (default=None)", eval_group_shape="list of Dask Arrays or Dask Series or None, optional (default=None)" ) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d8cff28a506c..199cbf644e39 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -519,7 +519,7 @@ def test_list_to_1d_numpy(y, dtype): @pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list']) def test_init_score_for_multiclass_classification(init_score_type): - init_score = [[0, 1, 2] for _ in range(10)] + init_score = [[i * 10 + j for j in range(3)] for i in range(10)] if init_score_type == 'array': init_score = np.array(init_score) elif init_score_type == 'dataframe': @@ -527,7 +527,6 @@ def test_init_score_for_multiclass_classification(init_score_type): pytest.skip('Pandas is not installed.') init_score = pd_DataFrame(init_score) data = np.random.rand(10, 2) - ds = lgb.basic.Dataset(data, init_score=init_score) - ds.construct() - np.testing.assert_equal(ds.init_score, init_score) + ds = lgb.basic.Dataset(data, init_score=init_score).construct() np.testing.assert_equal(ds.get_field('init_score'), init_score) + np.testing.assert_equal(ds.init_score, init_score) From 1676e558407bf3ea57905772db302926f4d3c496 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Wed, 1 Sep 2021 20:32:15 -0500 Subject: [PATCH 09/13] revert wrong docstrings and type hints --- python-package/lightgbm/dask.py | 8 ++++---- tests/python_package_test/test_basic.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c07ab733ee68..8f55b97a1f2f 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1338,7 +1338,7 @@ def fit( X: _DaskMatrixLike, y: _DaskCollection, sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskCollection] = None, + init_score: Optional[_DaskVectorLike] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, eval_sample_weight: Optional[List[_DaskCollection]] = None, @@ -1369,7 +1369,7 @@ def fit( X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", - init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", eval_init_score_shape="list of Dask Arrays, Dask Series, Dask DataFrames or None, optional (default=None)", @@ -1499,7 +1499,7 @@ def fit( X: _DaskMatrixLike, y: _DaskCollection, sample_weight: Optional[_DaskVectorLike] = None, - init_score: Optional[_DaskCollection] = None, + init_score: Optional[_DaskVectorLike] = None, group: Optional[_DaskVectorLike] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_names: Optional[List[str]] = None, @@ -1536,7 +1536,7 @@ def fit( X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", - init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Arrays or Dask Series or None, optional (default=None)", eval_init_score_shape="list of Dask Arrays, Dask Series, Dask DataFrames or None, optional (default=None)", diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index be937e8b50df..6117eca13489 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -550,6 +550,6 @@ def test_init_score_for_multiclass_classification(init_score_type): pytest.skip('Pandas is not installed.') init_score = pd_DataFrame(init_score) data = np.random.rand(10, 2) - ds = lgb.basic.Dataset(data, init_score=init_score).construct() + ds = lgb.Dataset(data, init_score=init_score).construct() np.testing.assert_equal(ds.get_field('init_score'), init_score) np.testing.assert_equal(ds.init_score, init_score) From 9bb44548abadc4c05385cc7bf5b5e0eaab661936 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Wed, 1 Sep 2021 20:57:58 -0500 Subject: [PATCH 10/13] add extra comma for consistency --- python-package/lightgbm/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c8638fb3ceff..d9917bd5bd00 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -420,7 +420,7 @@ def _train( Class of the local underlying model. sample_weight : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) Weights of training data. - init_score : Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None) + init_score : Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None) Init score of training data. group : Dask Array or Dask Series or None, optional (default=None) Group/query data. @@ -1192,7 +1192,7 @@ def fit( X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", - init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multiclass-task), or None, optional (default=None)", From a3e890d608c72c1c30437b4b1363edc0800c92d3 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Sat, 4 Sep 2021 21:22:10 -0500 Subject: [PATCH 11/13] prefix private functions with underscore add type hints to new functions make commas consistent in dask and basic --- python-package/lightgbm/basic.py | 34 ++++++++++++++++---------------- python-package/lightgbm/dask.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 76c7ac768c81..45c21aa6b3a5 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -161,7 +161,7 @@ def is_1d_list(data): return isinstance(data, list) and (not data or is_numeric(data[0])) -def is_1d_collection(data): +def _is_1d_collection(data: Any) -> bool: """Check whether data is a 1-D collection.""" return ( is_numpy_1d_array(data) @@ -190,30 +190,30 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): "It should be list, numpy 1-D array or pandas Series") -def is_numpy_2d_array(data): +def _is_numpy_2d_array(data: Any) -> bool: """Check whether data is a numpy 2-D array.""" return isinstance(data, np.ndarray) and len(data.shape) == 2 and data.shape[1] > 1 -def is_2d_list(data): +def _is_2d_list(data: Any) -> bool: """Check whether data is a 2-D list.""" return isinstance(data, list) and len(data) > 0 and is_1d_list(data[0]) -def is_2d_collection(data): +def _is_2d_collection(data: Any) -> bool: """Check whether data is a 2-D collection.""" return ( - is_numpy_2d_array(data) - or is_2d_list(data) - or isinstance(data, pd_DataFrame) + _is_numpy_2d_array(data) + or _is_2d_list(data) + or (isinstance(data, pd_DataFrame) and data.shape[1] > 1) ) -def data_to_2d_numpy(data, dtype=np.float32, name='list'): +def _data_to_2d_numpy(data: Any, dtype: type = np.float32, name: str='list') -> np.ndarray: """Convert data to numpy 2-D array.""" - if is_numpy_2d_array(data): + if _is_numpy_2d_array(data): return cast_numpy_array_to_dtype(data, dtype) - if is_2d_list(data): + if _is_2d_list(data): return np.array(data, dtype=dtype) if isinstance(data, pd_DataFrame): if _get_bad_pandas_dtypes(data.dtypes): @@ -1188,7 +1188,7 @@ def __init__(self, data, label=None, reference=None, sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) Init score for Dataset. silent : bool, optional (default=False) Whether to print messages during construction. @@ -1830,7 +1830,7 @@ def create_valid(self, data, label=None, weight=None, group=None, sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) Init score for Dataset. silent : bool, optional (default=False) Whether to print messages during construction. @@ -1937,7 +1937,7 @@ def set_field(self, field_name, data): ---------- field_name : str The field name of the information. - data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None + data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None The data to be set. Returns @@ -1958,10 +1958,10 @@ def set_field(self, field_name, data): return self if field_name == 'init_score': dtype = np.float64 - if is_1d_collection(data): + if _is_1d_collection(data): data = list_to_1d_numpy(data, dtype, name=field_name) - elif is_2d_collection(data): - data = data_to_2d_numpy(data, dtype, name=field_name) + elif _is_2d_collection(data): + data = _data_to_2d_numpy(data, dtype, name=field_name) data = data.ravel(order='F') else: raise TypeError( @@ -2182,7 +2182,7 @@ def set_init_score(self, init_score): Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None Init score for Booster. Returns diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index d9917bd5bd00..f0a4d5b0a556 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1195,7 +1195,7 @@ def fit( init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", - eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multiclass-task), or None, optional (default=None)", + eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None)", eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" ) From 94aa6a94a708b04f5bf9935d161a2b6349c0468a Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Sat, 4 Sep 2021 21:30:00 -0500 Subject: [PATCH 12/13] add missing spaces after type hint --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 45c21aa6b3a5..aed363ba2907 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -209,7 +209,7 @@ def _is_2d_collection(data: Any) -> bool: ) -def _data_to_2d_numpy(data: Any, dtype: type = np.float32, name: str='list') -> np.ndarray: +def _data_to_2d_numpy(data: Any, dtype: type = np.float32, name: str = 'list') -> np.ndarray: """Convert data to numpy 2-D array.""" if _is_numpy_2d_array(data): return cast_numpy_array_to_dtype(data, dtype) From 9fb7f2bd49d268bed6528fc71fa2b0a98dbc7c73 Mon Sep 17 00:00:00 2001 From: Jose Morales Date: Sun, 5 Sep 2021 11:28:07 -0500 Subject: [PATCH 13/13] remove shape condition for dataframe in is_2d_collection --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index aed363ba2907..5994915511db 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -205,7 +205,7 @@ def _is_2d_collection(data: Any) -> bool: return ( _is_numpy_2d_array(data) or _is_2d_list(data) - or (isinstance(data, pd_DataFrame) and data.shape[1] > 1) + or isinstance(data, pd_DataFrame) )