
Fix dask prediction. #4941

Merged: 8 commits, Oct 15, 2019
Changes from 3 commits
demo/dask/cpu_training.py: 9 changes (4 additions, 5 deletions)

@@ -7,11 +7,10 @@
 def main(client):
     # generate some random data for demonstration
-    n = 100
     m = 100000
-    partition_size = 1000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    n = 100
+    X = da.random.random(size=(m, n), chunks=100)
+    y = da.random.random(size=(m, ), chunks=100)

     # DaskDMatrix acts like normal DMatrix, works as a proxy for local
     # DMatrix scatter around workers.
@@ -38,6 +37,6 @@ def main(client):
 if __name__ == '__main__':
     # or use other clusters for scaling
-    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
+    with LocalCluster(n_workers=7, threads_per_worker=1) as cluster:
         with Client(cluster) as client:
             main(client)
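
For orientation, the new side of this demo assembles into roughly the following script. The imports, the DaskDMatrix construction, and the training parameters are reconstructed from the demo's visible comments and from the xgb.dask calls exercised in the tests below, so read it as a sketch rather than the verbatim file:

import xgboost as xgb
from dask import array as da
from dask.distributed import Client, LocalCluster


def main(client):
    # generate some random data for demonstration
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # DaskDMatrix acts like a normal DMatrix, working as a proxy for
    # the local DMatrix pieces scattered around workers (reconstructed;
    # the original call sits outside the hunk shown above).
    dtrain = xgb.dask.DaskDMatrix(client, X, y)

    # Placeholder parameters: the demo's actual dict is not part of
    # this diff.
    output = xgb.dask.train(client, {'verbosity': 2},
                            dtrain, num_boost_round=4,
                            evals=[(dtrain, 'train')])
    return xgb.dask.predict(client, output, dtrain)


if __name__ == '__main__':
    # or use other clusters for scaling
    with LocalCluster(n_workers=7, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            main(client)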
demo/dask/gpu_training.py: 9 changes (5 additions, 4 deletions)

@@ -6,11 +6,11 @@
 def main(client):
-    n = 100
     # generate some random data for demonstration
     m = 100000
-    partition_size = 1000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    n = 100
+    X = da.random.random(size=(m, n), chunks=100)
+    y = da.random.random(size=(m, ), chunks=100)

     # DaskDMatrix acts like normal DMatrix, works as a proxy for local
     # DMatrix scatter around workers.
@@ -23,6 +23,7 @@ def main(client):
     output = xgb.dask.train(client,
                             {'verbosity': 2,
                              'nthread': 1,
+                             # Golden line for GPU training
                              'tree_method': 'gpu_hist'},
                             dtrain,
                             num_boost_round=4, evals=[(dtrain, 'train')])
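
The GPU demo differs from the CPU one only in the 'tree_method': 'gpu_hist' parameter and in the cluster it starts. A sketch of the file as changed here; the dask_cuda LocalCUDACluster entry point is an assumption, since the demo's __main__ block is outside this diff:

import xgboost as xgb
from dask import array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster  # assumed GPU cluster backend


def main(client):
    # generate some random data for demonstration
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # Reconstructed: the DaskDMatrix call is outside the hunk above.
    dtrain = xgb.dask.DaskDMatrix(client, X, y)

    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'nthread': 1,
                             # Golden line for GPU training
                             'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    return output


if __name__ == '__main__':
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            main(client)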
python-package/xgboost/dask.py: 20 changes (16 additions, 4 deletions)

@@ -158,6 +158,15 @@ def __init__(self,
     async def map_local_data(self, client, data, label=None, weights=None):
         '''Obtain references to local data.'''
+
+        def inconsistent(left, left_name, right, right_name):
+            msg = 'Partitions between {a_name} and {b_name} are not ' \
+                  'consistent: {a_len} != {b_len}'.format(
+                      a_name=left_name, b_name=right_name, a_len=len(left),
+                      b_len=len(right)
+                  )
+            return msg
+
         data = data.persist()
         if label is not None:
             label = label.persist()
@@ -169,7 +178,7 @@ async def map_local_data(self, client, data, label=None, weights=None):
         # equivalents.
         X_parts = data.to_delayed()
         if isinstance(X_parts, numpy.ndarray):
-            assert X_parts.shape[1] == 1
+            assert X_parts.shape[1] == 1, X_parts.shape[1]
             X_parts = X_parts.flatten().tolist()

         if label is not None:
@@ -186,11 +195,11 @@ async def map_local_data(self, client, data, label=None, weights=None):
         parts = [X_parts]
         if label is not None:
             assert len(X_parts) == len(
-                y_parts), 'Partitions between X and y are not consistent'
+                y_parts), inconsistent(X_parts, 'X', y_parts, 'labels')
             parts.append(y_parts)
         if weights is not None:
             assert len(X_parts) == len(
-                w_parts), 'Partitions between X and weight are not consistent.'
+                w_parts), inconsistent(X_parts, 'X', w_parts, 'weights')
             parts.append(w_parts)
         parts = list(map(delayed, zip(*parts)))
@@ -275,7 +284,10 @@ def get_worker_data_shape(self, worker):
         cols = 0
         for shape in shapes:
             rows += shape[0]
-            cols += shape[1]
+
+            c = shape[1]
+            assert cols in (0, c)
+            cols = c
         return (rows, cols)
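
Both additions are small behavioral guards: the first turns a bare partition-count assertion into a message naming which inputs disagree, and the second stops summing column counts across partitions (rows accumulate, widths must all match). A standalone sketch with toy stand-ins, not xgboost internals, shows what each enforces:

def inconsistent(left, left_name, right, right_name):
    return ('Partitions between {a_name} and {b_name} are not '
            'consistent: {a_len} != {b_len}'.format(
                a_name=left_name, b_name=right_name,
                a_len=len(left), b_len=len(right)))

X_parts = [object()] * 4      # four data partitions
bad_y_parts = [object()] * 3  # three label partitions: mismatched
print(inconsistent(X_parts, 'X', bad_y_parts, 'labels'))
# Partitions between X and labels are not consistent: 4 != 3

# Reworked shape computation: rows accumulate across partitions, but
# every partition must carry the same column count.
shapes = [(10, 5), (12, 5), (8, 5)]
rows, cols = 0, 0
for shape in shapes:
    rows += shape[0]
    c = shape[1]
    assert cols in (0, c)  # a mismatched width would raise here
    cols = c
assert (rows, cols) == (30, 5)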
tests/python/test_with_dask.py: 11 changes (8 additions, 3 deletions)

@@ -21,12 +21,12 @@
     pass

 kRows = 1000
+kCols = 10


 def generate_array():
-    n = 10
     partition_size = 20
-    X = da.random.random((kRows, n), partition_size)
+    X = da.random.random((kRows, kCols), partition_size)
     y = da.random.random(kRows, partition_size)
     return X, y
@@ -44,7 +44,7 @@ def test_from_dask_dataframe(client):
     prediction = xgb.dask.predict(client, model=booster, data=dtrain)

     assert isinstance(prediction, da.Array)
-    assert prediction.shape[0] == kRows, prediction
+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols

     with pytest.raises(ValueError):
         # evals_result is not supported in dask interface.
@@ -59,6 +59,7 @@ def test_from_dask_array(client):
     result = xgb.dask.train(client, {}, dtrain)

     prediction = xgb.dask.predict(client, result, dtrain)
+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols

     assert isinstance(prediction, da.Array)
@@ -71,6 +72,8 @@ def test_regressor(client):
     regressor.fit(X, y, eval_set=[(X, y)])
     prediction = regressor.predict(X)

+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols
+
     history = regressor.evals_result()

     assert isinstance(prediction, da.Array)
@@ -88,6 +91,8 @@ def test_classifier(client):
     classifier.fit(X, y, eval_set=[(X, y)])
     prediction = classifier.predict(X)

+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols
+
     history = classifier.evals_result()

     assert isinstance(prediction, da.Array)
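
Outside pytest, the predict path these tests cover can be exercised against a local cluster. A minimal sketch: the two-worker cluster is an arbitrary choice, and only the row count is asserted here, since the prediction's second dimension is exactly what this in-progress PR is still settling:

import xgboost as xgb
from dask import array as da
from dask.distributed import Client, LocalCluster

kRows = 1000
kCols = 10


def generate_array():
    partition_size = 20
    X = da.random.random((kRows, kCols), partition_size)
    y = da.random.random(kRows, partition_size)
    return X, y


if __name__ == '__main__':
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()
            dtrain = xgb.dask.DaskDMatrix(client, X, y)
            result = xgb.dask.train(client, {}, dtrain)
            prediction = xgb.dask.predict(client, result, dtrain)
            assert isinstance(prediction, da.Array)
            assert prediction.shape[0] == kRows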