[python-package] [dask] Add DaskLGBMRanker #3708

Merged · 20 commits · Jan 22, 2021
Changes from 5 commits
2 changes: 1 addition & 1 deletion .ci/test.sh
@@ -52,7 +52,7 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "lint" ]]; then
         "r-lintr>=2.0"
     pip install --user cpplint
     echo "Linting Python code"
-    pycodestyle --ignore=E501,W503 --exclude=./compute,./eigen,./.nuget,./external_libs . || exit -1
+    pycodestyle --ignore=E402,E501,W503 --exclude=./compute,./eigen,./.nuget,./external_libs . || exit -1
     pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|^eigen|external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
     echo "Linting R code"
     Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
82 changes: 61 additions & 21 deletions python-package/lightgbm/dask.py
@@ -16,7 +16,7 @@
 from dask.distributed import default_client, get_worker, wait
 
 from .basic import _LIB, _safe_call
-from .sklearn import LGBMClassifier, LGBMRegressor
+from .sklearn import LGBMClassifier, LGBMRegressor, LGBMRanker
 
 import scipy.sparse as ss
@@ -68,15 +68,22 @@ def _train_part(params, model_factory, list_of_parts, worker_addresses, return_m
     network_params = _build_network_params(worker_addresses, get_worker().address, local_listen_port, time_out)
     params.update(network_params)
 
+    is_ranker = model_factory.__qualname__ == 'LGBMRanker'
+
     # Concatenate many parts into one
-    parts = tuple(zip(*list_of_parts))
-    data = _concat(parts[0])
-    label = _concat(parts[1])
-    weight = _concat(parts[2]) if len(parts) == 3 else None
+    data = _concat([d['X'] for d in list_of_parts])
+    label = _concat([d['y'] for d in list_of_parts])
+    weight = _concat([d['weight'] for d in list_of_parts]) if 'weight' in list_of_parts[0] else None
 
     try:
         model = model_factory(**params)
-        model.fit(data, label, sample_weight=weight, **kwargs)
+
+        if is_ranker:
+            group = _concat([d['group'] for d in list_of_parts])
+            model.fit(data, y=label, sample_weight=weight, group=group, **kwargs)
+        else:
+            model.fit(data, y=label, sample_weight=weight, **kwargs)
+
     finally:
         _safe_call(_LIB.LGBM_NetworkFree())

@@ -91,7 +98,7 @@ def _split_to_parts(data, is_matrix):
     return parts
 
 
-def _train(client, data, label, params, model_factory, weight=None, **kwargs):
+def _train(client, data, label, params, model_factory, sample_weight=None, group=None, **kwargs):
     """Inner train routine.
 
     Parameters
@@ -102,20 +109,30 @@ def _train(client, data, label, params, model_factory, weight=None, **kwargs):
     y : dask array of shape = [n_samples]
         The target values (class labels in classification, real numbers in regression).
     params : dict
-    model_factory : lightgbm.LGBMClassifier or lightgbm.LGBMRegressor class
+    model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
     sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
         Weights of training data.
+    group : array-like
+        Group/query data, used for ranking task. sum(group) = n_samples.
     """
-    # Split arrays/dataframes into parts. Arrange parts into tuples to enforce co-locality
+    # Split arrays/dataframes into parts. Arrange parts into dicts to enforce co-locality
     data_parts = _split_to_parts(data, is_matrix=True)
     label_parts = _split_to_parts(label, is_matrix=False)
-    if weight is None:
-        parts = list(map(delayed, zip(data_parts, label_parts)))
-    else:
-        weight_parts = _split_to_parts(weight, is_matrix=False)
-        parts = list(map(delayed, zip(data_parts, label_parts, weight_parts)))
+    parts = [{'X': x, 'y': y} for (x, y) in zip(data_parts, label_parts)]
+
+    # append weight, group vectors to part dicts when needed.
+    if sample_weight is not None:
+        weight_parts = _split_to_parts(sample_weight, is_matrix=False)
+        for i, d in enumerate(parts):
+            parts[i] = {**d, 'weight': weight_parts[i]}
+
+    if group is not None:
+        group_parts = _split_to_parts(group, is_matrix=False)
+        for i, d in enumerate(parts):
+            parts[i] = {**d, 'group': group_parts[i]}
 
     # Start computation in the background
+    parts = list(map(delayed, parts))
     parts = client.compute(parts)
     wait(parts)
@@ -179,7 +196,7 @@ def _predict(model, data, proba=False, dtype=np.float32, **kwargs):
 
     Parameters
     ----------
-    model :
+    model : local lightgbm.LGBM[Classifier/Regressor/Ranker]
     data : dask array of shape = [n_samples, n_features]
         Input feature matrix.
     proba : bool
@@ -202,13 +219,13 @@ def _predict(model, data, proba=False, dtype=np.float32, **kwargs):

 class _LGBMModel:
 
-    def _fit(self, model_factory, X, y=None, sample_weight=None, client=None, **kwargs):
+    def _fit(self, model_factory, X, y=None, sample_weight=None, group=None, client=None, **kwargs):
         """Docstring is inherited from the LGBMModel."""
         if client is None:
             client = default_client()
 
         params = self.get_params(True)
-        model = _train(client, X, y, params, model_factory, sample_weight, **kwargs)
+        model = _train(client, X, y, params, model_factory, sample_weight, group, **kwargs)
 
         self.set_params(**model.get_params())
         self._copy_extra_params(model, self)
@@ -233,8 +250,8 @@ class DaskLGBMClassifier(_LGBMModel, LGBMClassifier):
"""Distributed version of lightgbm.LGBMClassifier."""

def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
"""Docstring is inherited from the LGBMModel."""
return self._fit(LGBMClassifier, X, y, sample_weight, client, **kwargs)
"""Docstring is inherited from the lightgbm.LGBMClassifier.fit."""
return self._fit(LGBMClassifier, X=X, y=y, sample_weight=sample_weight, client=client, **kwargs)
fit.__doc__ = LGBMClassifier.fit.__doc__

def predict(self, X, **kwargs):
@@ -262,7 +279,7 @@ class DaskLGBMRegressor(_LGBMModel, LGBMRegressor):

     def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
         """Docstring is inherited from the lightgbm.LGBMRegressor.fit."""
-        return self._fit(LGBMRegressor, X, y, sample_weight, client, **kwargs)
+        return self._fit(LGBMRegressor, X=X, y=y, sample_weight=sample_weight, client=client, **kwargs)
     fit.__doc__ = LGBMRegressor.fit.__doc__
 
     def predict(self, X, **kwargs):
@@ -278,3 +295,26 @@ def to_local(self):
         model : lightgbm.LGBMRegressor
         """
         return self._to_local(LGBMRegressor)


+class DaskLGBMRanker(_LGBMModel, LGBMRanker):
+    """Docstring is inherited from the lightgbm.LGBMRanker."""
+
+    def fit(self, X, y=None, sample_weight=None, group=None, client=None, **kwargs):
StrikerRUS (Collaborator):

@jameslamb What about init_score? Is it supported, or should we add a feature request for it?

    init_score : array-like of shape = [n_samples] or None, optional (default=None)
        Init score of training data.

jameslamb (Collaborator):

We should have a feature request. I'll write it up and add a link here.

@ffineis, could you add init_score=None here between sample_weight and group, so the order matches the sklearn interface for LGBMRanker (sample_weight=None, init_score=None, group=None)? That way, if people have existing sklearn code with positional arguments to fit(), they won't accidentally have their init_score interpreted as group.

And can you then just add a check like this?

    if init_score is not None:
        raise RuntimeError("init_score is not currently supported in lightgbm.dask")

@StrikerRUS, speaking of positional arguments, I'll open another issue where we can discuss how to handle the client argument. But let's leave that out of this PR.
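Sketched concretely, the requested signature and guard would look roughly like this (a hypothetical rendering of the suggestion above, not the committed code):

    def fit(self, X, y=None, sample_weight=None, init_score=None, group=None, client=None, **kwargs):
        """Docstring is inherited from the lightgbm.LGBMRanker.fit."""
        if init_score is not None:
            raise RuntimeError("init_score is not currently supported in lightgbm.dask")
        return self._fit(LGBMRanker, X=X, y=y, sample_weight=sample_weight, group=group, client=client, **kwargs)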

StrikerRUS (Collaborator):

@jameslamb Yes, sure! I agree with all of your points.

jameslamb (Collaborator):

init_score: #3807

client placement: #3808

ffineis (Contributor, PR author):

Yep, no prob

"""Docstring is inherited from the lightgbm.LGBMRanker.fit."""
return self._fit(LGBMRanker, X=X, y=y, sample_weight=sample_weight, group=group, client=client, **kwargs)
fit.__doc__ = LGBMRanker.fit.__doc__

def predict(self, X, **kwargs):
"""Docstring is inherited from the lightgbm.LGBMRanker.predict."""
return _predict(self.to_local(), X, **kwargs)
predict.__doc__ = LGBMRanker.predict.__doc__

def to_local(self):
"""Create regular version of lightgbm.LGBMRanker from the distributed version.

Returns
-------
model : lightgbm.LGBMRanker
"""
return self._to_local(LGBMRanker)
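For context, a minimal usage sketch of the new estimator (illustrative only: the data, chunking, and parameters here are hypothetical, and a running dask.distributed cluster is assumed):

    import dask.array as da
    import numpy as np
    from distributed import Client

    import lightgbm.dask as dlgbm

    client = Client()  # assumes a dask.distributed scheduler is reachable

    # five query groups of 20 rows each; one dask chunk per group
    X = da.random.random((100, 10), chunks=(20, 10))
    y = da.random.randint(0, 4, size=100, chunks=20)  # graded relevance per row
    g = da.from_array(np.full(5, 20), chunks=1)       # run-length encoded group sizes

    ranker = dlgbm.DaskLGBMRanker(min_child_samples=1)
    ranker = ranker.fit(X, y, group=g, client=client)
    preds = ranker.predict(X).compute()  # one relevance score per row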
161 changes: 153 additions & 8 deletions tests/python_package_test/test_dask.py
@@ -1,20 +1,28 @@
 # coding: utf-8
 """Tests for lightgbm.dask module
 
 An easy way to run these tests is from the (python) docker container.
 > python -m pytest /LightGBM/tests/python_package_test/test_dask.py
 """
 import os
+import itertools
 import sys
+import time
 
 import pytest
-if not sys.platform.startswith("linux"):
-    pytest.skip("lightgbm.dask is currently supported in Linux environments", allow_module_level=True)
+if not sys.platform.startswith('linux'):
+    pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True)
 
 import dask.array as da
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
+from scipy.stats import spearmanr
 import scipy.sparse
 from dask.array.utils import assert_eq
-from dask_ml.metrics import accuracy_score, r2_score
 from distributed.utils_test import client, cluster_fixture, gen_cluster, loop
 from sklearn.datasets import make_blobs, make_regression
+from sklearn.utils import check_random_state
 
 import lightgbm
 import lightgbm.dask as dlgbm
@@ -23,7 +31,7 @@
data_centers = [[[-4, -4], [4, 4]], [[-4, -4], [4, 4], [-4, 4]]]

pytestmark = [
pytest.mark.skipif(os.getenv("TASK", "") == "mpi", reason="Fails to run with MPI interface")
pytest.mark.skipif(os.getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface')
]


@@ -36,6 +44,102 @@ def listen_port():
listen_port.port = 13000


+def r2_score(dy_true, dy_pred):
+    """Helper function taken from dask_ml.metrics: computes coefficient of determination."""
+    numerator = ((dy_true - dy_pred) ** 2).sum(axis=0)
+    denominator = ((dy_true - dy_true.mean(axis=0)) ** 2).sum(axis=0)
+    return (1 - numerator / denominator).compute()
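(For reference, this helper computes the coefficient of determination, R² = 1 − Σᵢ(yᵢ − ŷᵢ)² / Σᵢ(yᵢ − ȳ)², where ȳ is the mean of the true targets; the denominator centers dy_true on its own mean.)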
jameslamb (Collaborator):
why was it necessary to replace the dask-ml one with this function?

If that's just to cut out our testing dependency on dask-ml (which would be much appreciated!), could you make that a separate PR with this change plus removing dask-ml from here:

    conda install -q -y -n $CONDA_ENV dask dask-ml distributed joblib matplotlib numpy pandas psutil pytest scikit-learn scipy

I think that PR would be small and non-controversial, so we could review and merge it quickly.

ffineis (Contributor, PR author), Jan 16, 2021:

Cool, yeah, I'll follow up 3708 with a PR to remove dask-ml as a test dependency. See the reply to the comment re: accuracy_score (above).

ffineis (Contributor, PR author):

Addressed in d6e5209

jameslamb (Collaborator):

hey @ffineis is it cool if I pick up the "remove testing dependency on dask-ml" thing?

If you want to make the contribution I'll totally respect that, but if it's just extra work you'd rather not do, I'm happy to make a PR right now for it. Wanna be respectful of your time.



+def _make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=1, random_gs=False, avg_gs=10, random_state=0):
+    """Generate a learning-to-rank dataset - feature vectors grouped together with
+    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
+    if ranking objective becomes supported in sklearn.datasets module."""
+    rnd_generator = check_random_state(random_state)
+
+    y_vec, group_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
+    gid = 0
+
+    # build target, group ID vectors.
+    relvalues = range(gmax + 1)
+    while len(y_vec) < n_samples:
+        gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)
+        if not gsize:
+            continue
+
+        rel = rnd_generator.choice(relvalues, size=gsize, replace=True)
+        y_vec = np.append(y_vec, rel)
+        group_vec = np.append(group_vec, [gid] * gsize)
+        gid += 1
+
+    y_vec, group_vec = y_vec[0:n_samples], group_vec[0:n_samples]
+
+    # build feature data, X. Transform first few into informative features.
+    n_informative = max(min(n_features, n_informative), 0)
+    x_grid = np.linspace(0, stop=1, num=gmax + 2)
+    X = rnd_generator.uniform(size=(n_samples, n_features))
+
+    # make the first n_informative feature values bucketed according to relevance scores.
+    def bucket_fn(z):
+        return rnd_generator.uniform(x_grid[z], high=x_grid[z + 1])
+
+    for j in range(n_informative):
+        bias, coef = rnd_generator.normal(size=2)
+        X[:, j] = bias + coef * np.apply_along_axis(bucket_fn, axis=0, arr=y_vec)
+
+    return X, y_vec, group_vec


+def _create_ranking_data(n_samples=100, output='array', chunk_size=50):
+    X, y, g = _make_ranking(n_samples=n_samples, random_state=42)
+    rnd = np.random.RandomState(42)
+    w = rnd.rand(X.shape[0]) * 0.01
+    g_rle = np.array([sum([1 for _ in grp]) for _, grp in itertools.groupby(g)])
+
+    if output == 'dataframe':
+
+        # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
+        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
+        X = X_df.copy()
+        X_df = X_df.assign(y=y, g=g, w=w)
+
+        # set_index ensures partitions are based on group id. See https://bit.ly/3pAWyNw.
+        X_df.set_index('g', inplace=True)
+        dX = dd.from_pandas(X_df, chunksize=chunk_size)
+
+        # separate target, weight from features.
+        dy = dX['y']
+        dw = dX['w']
+        dX = dX.drop(columns=['y', 'w'])
+        dg = dX.index.to_series()
+
+        # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
+        # so that within each partition, sum(g) = n_samples.
+        dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
+
+    elif output == 'array':
+
+        # ranking arrays: one chunk per group. Each chunk must include all columns.
+        p = X.shape[1]
+        dX, dy, dw, dg = list(), list(), list(), list()
+        for g_idx, rhs in enumerate(np.cumsum(g_rle)):
+            lhs = rhs - g_rle[g_idx]
+            dX.append(da.from_array(X[lhs:rhs, :], chunks=(rhs - lhs, p)))
+            dy.append(da.from_array(y[lhs:rhs]))
+            dw.append(da.from_array(w[lhs:rhs]))
+            dg.append(da.from_array(np.array([g_rle[g_idx]])))
+
+        dX = da.concatenate(dX, axis=0)
+        dy = da.concatenate(dy, axis=0)
+        dw = da.concatenate(dw, axis=0)
+        dg = da.concatenate(dg, axis=0)
+
+    else:
+        raise ValueError('ranking data creation only supported for Dask arrays and dataframes')
+
+    return X, y, w, g_rle, dX, dy, dw, dg
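To make the run-length encoding above concrete, a small illustrative example (hypothetical values):

    import itertools
    import numpy as np

    g = np.array([0, 0, 0, 1, 1, 2])  # per-row group ids, as produced by _make_ranking
    g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])
    print(g_rle)  # [3 2 1] -- the `group` format LGBMRanker expects; sums to n_samples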


 def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size=50):
     if objective == 'classification':
         X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
@@ -74,15 +178,14 @@ def test_classifier(output, centers, client, listen_port):
     dask_classifier = dlgbm.DaskLGBMClassifier(time_out=5, local_listen_port=listen_port)
     dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw, client=client)
     p1 = dask_classifier.predict(dX)
-    s1 = accuracy_score(dy, p1)
+    s1 = da.average(dy == p1).compute()
     p1 = p1.compute()
 
     local_classifier = lightgbm.LGBMClassifier()
     local_classifier.fit(X, y, sample_weight=w)
     p2 = local_classifier.predict(X)
     s2 = local_classifier.score(X, y)
 
-    assert_eq(s1, s2)
+    assert np.isclose(s1, s2)
 
     assert_eq(p1, p2)
     assert_eq(y, p1)
@@ -180,7 +283,49 @@ def test_regressor_local_predict(client, listen_port):

     # Predictions and scores should be the same
     assert_eq(p1, p2)
-    assert_eq(s1, s2)
+    assert np.isclose(s1, s2, rtol=1e-4)


+@pytest.mark.parametrize('output', ['array', 'dataframe'])
+def test_ranker(output, client, listen_port):
+    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(output=output)
+
+    # Avoid lightgbm.basic.LightGBMError: Binding port 13xxx failed exceptions.
+    time.sleep(10)
+
+    dask_ranker = dlgbm.DaskLGBMRanker(time_out=5, local_listen_port=listen_port, seed=42, min_child_samples=1)
jameslamb (Collaborator):

I just made some changes in #3786 to speed up the tests in this module. If it'll work for the tests you're adding in this PR, can you please make every DaskLGBMRanker and LGBMRanker use n_iterations=10, num_leaves=10? I found that training smaller models was able to make the tests faster without sacrificing any coverage.

+    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg, client=client)
+    rnkvec_dask = dask_ranker.predict(dX)
+    rnkvec_dask = rnkvec_dask.compute()
+
+    local_ranker = lightgbm.LGBMRanker(seed=42, min_child_samples=1)
+    local_ranker.fit(X, y, sample_weight=w, group=g)
+    rnkvec_local = local_ranker.predict(X)
+
+    # distributed ranker should be able to rank decently well.
+    dcor = spearmanr(rnkvec_dask, y).correlation
+    assert dcor > 0.6
+
+    # relative difference between distributed ranker and local ranker spearman corr should be small.
+    lcor = spearmanr(rnkvec_local, y).correlation
+    assert np.abs(dcor - lcor) / lcor < 0.03

+@pytest.mark.parametrize('output', ['array', 'dataframe'])
+def test_ranker_local_predict(output, client, listen_port):
+    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(output=output)
+
+    time.sleep(10)
+
+    dask_ranker = dlgbm.DaskLGBMRanker(time_out=5, local_listen_port=listen_port, seed=42, min_child_samples=1)
+    dask_ranker = dask_ranker.fit(dX, dy, group=dg, client=client)
+    rnkvec_dask = dask_ranker.predict(dX)
+    rnkvec_dask = rnkvec_dask.compute()
+
+    rnkvec_local = dask_ranker.to_local().predict(X)
+
+    # distributed and to-local scores should be the same.
+    assert_eq(rnkvec_dask, rnkvec_local)
Review thread on lines +416 to +419. StrikerRUS (Collaborator):

@jameslamb I like this test very much! Can we add the same for the other estimators? It seems that the Classifier tests do not assert that the real Dask predict() equals .to_local().predict().

jameslamb (Collaborator):
the classifier tests do check that

I'd be open to a PR that makes it more explicit and cuts down the total number of tests by combining test_classifier_local_predict (def test_classifier_local_predict(client, listen_port):) into test_classifier (def test_classifier(output, centers, client, listen_port):), though!

Not an action we need on this PR, I'll write it up as a "good first issue".

StrikerRUS (Collaborator):
@jameslamb

> comparison between Dask predict() and to_local().predict():

Actually, this checks to_local().predict(dX) against the local predict(). It is not the same as comparing to_local().predict(dX) with the Dask predict(), given that the asserts you are pointing to are placed in two different (in theory unrelated) tests, I think.

jameslamb (Collaborator):
it's confusing because it's two tests, so I think we should merge the tests in the future like I said above (not anything you need to do here, @ffineis).

I'm saying that taken together, they show transitively that dask_model.predict() == dask_model.to_local().predict(), even though neither one does that direct comparison:

  • (line 92) dask_model.predict() == sklearn_model.predict()
  • (line 138) dask_model.to_local().predict() == sklearn_model.predict()

But that's only partially true, since test_local_predict only does binary classification, and only on array data. So combining them would give more coverage and would show whether that result holds up for multi-class classification or for other input data types.

StrikerRUS (Collaborator):
> it's confusing because it's two tests, so I think we should merge the tests in the future like I said above

Totally agree!

> I'm saying that taken together, they show transitively that dask_model.predict() == dask_model.to_local().predict()

Also agree. But two different tests can take different inputs (params, data), can be modified separately in the future, etc., so it may not be a fully fair comparison. Merging them into one test will solve all of these issues. 🙂
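A direct version of the comparison being discussed could look like this (a hypothetical sketch in the style of this module's tests, not code from the PR):

    def test_classifier_predict_matches_to_local_predict(client, listen_port):
        # fit one distributed model, then compare Dask predict() with to_local().predict()
        X, y, w, dX, dy, dw = _create_data('classification', output='array')
        dask_classifier = dlgbm.DaskLGBMClassifier(time_out=5, local_listen_port=listen_port)
        dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw, client=client)
        p_dask = dask_classifier.predict(dX).compute()
        p_local = dask_classifier.to_local().predict(X)
        assert_eq(p_dask, p_local)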



def test_build_network_params():