Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package] Support 2d collections as input for init_score in multiclass classification task #4150

Merged
merged 19 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 60 additions & 8 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ def is_numpy_column_array(data):
return len(shape) == 2 and shape[1] == 1


def cast_numpy_1d_array_to_dtype(array, dtype):
"""Cast numpy 1d array to given dtype."""
def cast_numpy_array_to_dtype(array: np.ndarray, dtype) -> np.ndarray:
    """Cast numpy array to given dtype.

    Parameters
    ----------
    array : numpy array
        Array to cast.
    dtype : numpy dtype or type
        Target dtype.

    Returns
    -------
    result : numpy array
        The original array unchanged when it already has ``dtype``;
        otherwise the array cast to ``dtype`` (``copy=False`` avoids a
        copy only when the cast itself does not require one).
    """
    if array.dtype == dtype:
        return array
    return array.astype(dtype=dtype, copy=False)
Expand All @@ -146,14 +146,24 @@ def is_1d_list(data):
return isinstance(data, list) and (not data or is_numeric(data[0]))


def is_1d_collection(data) -> bool:
    """Check whether data is a 1-D collection.

    Accepts numpy 1-D arrays, numpy column vectors (shape ``(n, 1)``),
    1-D lists and pandas Series — i.e. everything that
    ``list_to_1d_numpy`` can convert.
    """
    return (
        is_numpy_1d_array(data)
        or is_numpy_column_array(data)
        or is_1d_list(data)
        or isinstance(data, pd_Series)
    )


def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""Convert data to numpy 1-D array."""
if is_numpy_1d_array(data):
return cast_numpy_1d_array_to_dtype(data, dtype)
return cast_numpy_array_to_dtype(data, dtype)
elif is_numpy_column_array(data):
_log_warning('Converting column-vector to 1d array')
array = data.ravel()
return cast_numpy_1d_array_to_dtype(array, dtype)
return cast_numpy_array_to_dtype(array, dtype)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, pd_Series):
Expand All @@ -165,6 +175,39 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"It should be list, numpy 1-D array or pandas Series")


def is_numpy_2d_array(data) -> bool:
    """Check whether data is a numpy 2-D array.

    A column vector (shape ``(n, 1)``) is deliberately NOT treated as 2-D
    here (``shape[1]`` must be > 1); it is handled by the 1-D helpers
    instead.
    """
    return isinstance(data, np.ndarray) and len(data.shape) == 2 and data.shape[1] > 1


def is_2d_list(data) -> bool:
    """Check whether data is a 2-D list.

    A 2-D list is a non-empty list whose first element is itself a 1-D
    list; only the first row is inspected.
    """
    return isinstance(data, list) and len(data) > 0 and is_1d_list(data[0])
jameslamb marked this conversation as resolved.
Show resolved Hide resolved


def is_2d_collection(data) -> bool:
    """Check whether data is a 2-D collection.

    Accepts numpy 2-D arrays, lists of lists and pandas DataFrames —
    i.e. everything that ``data_to_2d_numpy`` can convert.
    """
    return (
        is_numpy_2d_array(data)
        or is_2d_list(data)
        or isinstance(data, pd_DataFrame)
    )


def data_to_2d_numpy(data, dtype=np.float32, name='list') -> np.ndarray:
    """Convert data to numpy 2-D array.

    Parameters
    ----------
    data : list of lists, numpy 2-D array or pandas DataFrame
        Data to convert.
    dtype : numpy dtype, optional (default=np.float32)
        Target dtype of the returned array.
    name : str, optional (default='list')
        Name of the data used in error messages.

    Returns
    -------
    result : numpy 2-D array
        Converted data.

    Raises
    ------
    ValueError
        If a DataFrame contains columns with unsupported dtypes.
    TypeError
        If data is of an unsupported type.
    """
    if is_numpy_2d_array(data):
        return cast_numpy_array_to_dtype(data, dtype)
    if is_2d_list(data):
        return np.array(data, dtype=dtype)
    if isinstance(data, pd_DataFrame):
        # reject non-numeric columns up front to produce a clear error
        if _get_bad_pandas_dtypes(data.dtypes):
            raise ValueError('DataFrame.dtypes must be int, float or bool')
        return cast_numpy_array_to_dtype(data.values, dtype)
    raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n"
                    "It should be list of lists, numpy 2-D array or pandas DataFrame")


def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
Expand Down Expand Up @@ -1070,7 +1113,7 @@ def __init__(self, data, label=None, reference=None,
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@StrikerRUS do you think this should have a comma before or None? I did include it in the dask docstrings but I just realized this doesn't have it. I'll make them consistent but would like to know what you think is the correct one.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like how you did it. I believe a comma before `or None` prevents users from thinking that it is possible to include Nones into a list: #4557 (comment). However, I'm not sure whether it is grammatically correct or not. @jameslamb should know for sure 🙂

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally prefer `a, b, c, or None, optional (default=None)` (with the `,` before `or None`), but both are equally valid and I don't think you need to change anything

Init score for Dataset.
silent : bool, optional (default=False)
Whether to print messages during construction.
Expand Down Expand Up @@ -1487,7 +1530,7 @@ def create_valid(self, data, label=None, weight=None, group=None,
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None, optional (default=None)
Init score for Dataset.
silent : bool, optional (default=False)
Whether to print messages during construction.
def set_init_score(self, init_score):
    """Set init score of Booster to start from.

    Parameters
    ----------
    init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task) or None
        Init score for Booster.

    Returns
    -------
    self : Dataset
        Dataset with set init_score.
    """
    self.init_score = init_score
    if self.handle is not None and init_score is not None:
        if is_1d_collection(init_score):
            init_score = list_to_1d_numpy(init_score, np.float64, name='init_score')
        elif is_2d_collection(init_score):
            init_score = data_to_2d_numpy(init_score, np.float64, name='init_score')
            # flatten column-major: all rows' scores for class 0 first,
            # then class 1, ... — the layout the C++ side expects
            init_score = init_score.ravel(order='F')
        else:
            raise TypeError(
                'init_score must be list, numpy 1-D array or pandas Series.\n'
                'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.'
            )
        self.set_field('init_score', init_score)
        self.init_score = self.get_field('init_score')  # original values can be modified at cpp side
    return self
Expand Down
2 changes: 1 addition & 1 deletion python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,7 @@ def _get_meta_data(collection, name, i):
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
y_shape="array-like of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="array-like or None, optional (default=None)"
) + "\n\n" + _lgbmmodel_doc_custom_eval_note

Expand Down
18 changes: 17 additions & 1 deletion tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.model_selection import train_test_split
jmoralez marked this conversation as resolved.
Show resolved Hide resolved

jmoralez marked this conversation as resolved.
Show resolved Hide resolved
import lightgbm as lgb
jmoralez marked this conversation as resolved.
Show resolved Hide resolved
from lightgbm.compat import PANDAS_INSTALLED, pd_Series
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series
jmoralez marked this conversation as resolved.
Show resolved Hide resolved

jmoralez marked this conversation as resolved.
Show resolved Hide resolved
from .utils import load_breast_cancer
StrikerRUS marked this conversation as resolved.
Show resolved Hide resolved

Expand Down Expand Up @@ -411,3 +411,19 @@ def test_list_to_1d_numpy(y, dtype):
result = lgb.basic.list_to_1d_numpy(y, dtype=dtype)
assert result.size == 10
assert result.dtype == dtype


@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list'])
def test_init_score_for_multiclass_classification(init_score_type):
    # one score per class for each of the 10 rows: every row is [0, 1, 2]
    init_score = [list(range(3)) for _ in range(10)]
    if init_score_type == 'array':
        init_score = np.array(init_score)
    elif init_score_type == 'dataframe':
        if not PANDAS_INSTALLED:
            pytest.skip('Pandas is not installed.')
        init_score = pd_DataFrame(init_score)
    data = np.random.rand(10, 2)
    ds = lgb.basic.Dataset(data, init_score=init_score)
    ds.construct()
    # the Dataset flattens column-major: ten 0s, then ten 1s, then ten 2s
    expected_init_score = np.repeat(np.arange(3), 10)
    np.testing.assert_equal(ds.init_score, expected_init_score)
7 changes: 2 additions & 5 deletions tests/python_package_test/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,17 +1277,14 @@ def test_init_score(task, output, cluster):
'time_out': 5
}
init_score = random.random()
# init_scores must be a 1D array, even for multiclass classification
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we need updates in type hints and docstrings for Dask module.

init_score: Optional[_DaskVectorLike] = None,

eval_init_score: Optional[List[_DaskCollection]] = None,

init_score : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)
Init score of training data.

eval_init_score : list of Dask Arrays, Dask Series or None, optional (default=None)
Initial model score for each validation set in eval_set.

init_score: Optional[_DaskVectorLike] = None,

eval_init_score: Optional[List[_DaskCollection]] = None,

init_score: Optional[_DaskVectorLike] = None,

eval_init_score: Optional[List[_DaskCollection]] = None,

init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)",

eval_init_score_shape="list of Dask Arrays or Dask Series or None, optional (default=None)",

init_score: Optional[_DaskVectorLike] = None,

eval_init_score: Optional[List[_DaskCollection]] = None,

init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)",

eval_init_score_shape="list of Dask Arrays or Dask Series or None, optional (default=None)",

init_score: Optional[_DaskVectorLike] = None,

eval_init_score: Optional[List[_DaskCollection]] = None,

init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)",

eval_init_score_shape="list of Dask Arrays or Dask Series or None, optional (default=None)",

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 2c7ef3c. I think the docstrings maybe ended up a bit too verbose, let me know what you think.

Copy link
Collaborator

@StrikerRUS StrikerRUS Sep 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jmoralez Thanks a lot! I like your explicit wordings.
I'm so sorry, I merged #4558 and introduced conflicts.

Also, I just understood... There shouldn't be any updates for DaskLGBMRegressor's and DaskLGBMRanker's docstrings and type annotations, (for multi-class task) is not applicable there.

# where you need to provide 1 score per class for each row in X
# https://github.com/microsoft/LightGBM/issues/4046
size_factor = 1
if task == 'multiclass-classification':
size_factor = 3 # number of classes

if output.startswith('dataframe'):
init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor))
init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size))
else:
init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor))
init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set
Expand Down