
Fix dask prediction. #4941

Merged: 8 commits, Oct 15, 2019
Changes from 3 commits
demo/dask/cpu_training.py: 9 changes (4 additions, 5 deletions)

@@ -7,11 +7,10 @@
 def main(client):
     # generate some random data for demonstration
-    n = 100
     m = 100000
-    partition_size = 1000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    n = 100
+    X = da.random.random(size=(m, n), chunks=100)
+    y = da.random.random(size=(m, ), chunks=100)

     # DaskDMatrix acts like normal DMatrix, works as a proxy for local
     # DMatrix scatter around workers.
@@ -38,6 +37,6 @@ def main(client):
 if __name__ == '__main__':
     # or use other clusters for scaling
-    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
+    with LocalCluster(n_workers=7, threads_per_worker=1) as cluster:
         with Client(cluster) as client:
             main(client)
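
For orientation, the new side of this demo assembles into roughly the following script. The imports, the DaskDMatrix construction, and the training parameters are reconstructed from the demo's visible comments and from the xgb.dask calls exercised in the tests below, so read it as a sketch rather than the verbatim file:

import xgboost as xgb
from dask import array as da
from dask.distributed import Client, LocalCluster


def main(client):
    # generate some random data for demonstration
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # DaskDMatrix acts like a normal DMatrix, working as a proxy for
    # the local DMatrix pieces scattered around workers (reconstructed;
    # the original call sits outside the hunk shown above).
    dtrain = xgb.dask.DaskDMatrix(client, X, y)

    # Placeholder parameters: the demo's actual dict is not part of
    # this diff.
    output = xgb.dask.train(client, {'verbosity': 2},
                            dtrain, num_boost_round=4,
                            evals=[(dtrain, 'train')])
    return xgb.dask.predict(client, output, dtrain)


if __name__ == '__main__':
    # or use other clusters for scaling
    with LocalCluster(n_workers=7, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            main(client)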
demo/dask/gpu_training.py: 9 changes (5 additions, 4 deletions)

@@ -6,11 +6,11 @@
 def main(client):
-    n = 100
     # generate some random data for demonstration
     m = 100000
-    partition_size = 1000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    n = 100
+    X = da.random.random(size=(m, n), chunks=100)
+    y = da.random.random(size=(m, ), chunks=100)

     # DaskDMatrix acts like normal DMatrix, works as a proxy for local
     # DMatrix scatter around workers.
@@ -23,6 +23,7 @@ def main(client):
     output = xgb.dask.train(client,
                             {'verbosity': 2,
                              'nthread': 1,
+                             # Golden line for GPU training
                              'tree_method': 'gpu_hist'},
                             dtrain,
                             num_boost_round=4, evals=[(dtrain, 'train')])
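
The GPU demo differs from the CPU one only in the 'tree_method': 'gpu_hist' parameter and in the cluster it starts. A sketch of the file as changed here; the dask_cuda LocalCUDACluster entry point is an assumption, since the demo's __main__ block is outside this diff:

import xgboost as xgb
from dask import array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster  # assumed GPU cluster backend


def main(client):
    # generate some random data for demonstration
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # Reconstructed: the DaskDMatrix call is outside the hunk above.
    dtrain = xgb.dask.DaskDMatrix(client, X, y)

    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'nthread': 1,
                             # Golden line for GPU training
                             'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    return output


if __name__ == '__main__':
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            main(client)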
python-package/xgboost/dask.py: 20 changes (16 additions, 4 deletions)

@@ -158,6 +158,15 @@ def __init__(self,
     async def map_local_data(self, client, data, label=None, weights=None):
         '''Obtain references to local data.'''
+
+        def inconsistent(left, left_name, right, right_name):
+            msg = 'Partitions between {a_name} and {b_name} are not ' \
+                  'consistent: {a_len} != {b_len}'.format(
+                      a_name=left_name, b_name=right_name, a_len=len(left),
+                      b_len=len(right)
+                  )
+            return msg
+
         data = data.persist()
         if label is not None:
             label = label.persist()
@@ -169,7 +178,7 @@ async def map_local_data(self, client, data, label=None, weights=None):
         # equivalents.
         X_parts = data.to_delayed()
         if isinstance(X_parts, numpy.ndarray):
-            assert X_parts.shape[1] == 1
+            assert X_parts.shape[1] == 1, X_parts.shape[1]
             X_parts = X_parts.flatten().tolist()

         if label is not None:
@@ -186,11 +195,11 @@ async def map_local_data(self, client, data, label=None, weights=None):
         parts = [X_parts]
         if label is not None:
             assert len(X_parts) == len(
-                y_parts), 'Partitions between X and y are not consistent'
+                y_parts), inconsistent(X_parts, 'X', y_parts, 'labels')
             parts.append(y_parts)
         if weights is not None:
             assert len(X_parts) == len(
-                w_parts), 'Partitions between X and weight are not consistent.'
+                w_parts), inconsistent(X_parts, 'X', w_parts, 'weights')
             parts.append(w_parts)
         parts = list(map(delayed, zip(*parts)))
@@ -275,7 +284,10 @@ def get_worker_data_shape(self, worker):
         cols = 0
         for shape in shapes:
             rows += shape[0]
-            cols += shape[1]
+
+            c = shape[1]
+            assert cols in (0, c)
+            cols = c
         return (rows, cols)
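
Both additions are small behavioral guards: the first turns a bare partition-count assertion into a message naming which inputs disagree, and the second stops summing column counts across partitions (rows accumulate, widths must all match). A standalone sketch with toy stand-ins, not xgboost internals, shows what each enforces:

def inconsistent(left, left_name, right, right_name):
    return ('Partitions between {a_name} and {b_name} are not '
            'consistent: {a_len} != {b_len}'.format(
                a_name=left_name, b_name=right_name,
                a_len=len(left), b_len=len(right)))

X_parts = [object()] * 4      # four data partitions
bad_y_parts = [object()] * 3  # three label partitions: mismatched
print(inconsistent(X_parts, 'X', bad_y_parts, 'labels'))
# Partitions between X and labels are not consistent: 4 != 3

# Reworked shape computation: rows accumulate across partitions, but
# every partition must carry the same column count.
shapes = [(10, 5), (12, 5), (8, 5)]
rows, cols = 0, 0
for shape in shapes:
    rows += shape[0]
    c = shape[1]
    assert cols in (0, c)  # a mismatched width would raise here
    cols = c
assert (rows, cols) == (30, 5)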
tests/python/test_with_dask.py: 11 changes (8 additions, 3 deletions)

@@ -21,12 +21,12 @@
     pass

 kRows = 1000
+kCols = 10


 def generate_array():
-    n = 10
     partition_size = 20
-    X = da.random.random((kRows, n), partition_size)
+    X = da.random.random((kRows, kCols), partition_size)
     y = da.random.random(kRows, partition_size)
     return X, y
@@ -44,7 +44,7 @@ def test_from_dask_dataframe(client):
     prediction = xgb.dask.predict(client, model=booster, data=dtrain)

     assert isinstance(prediction, da.Array)
-    assert prediction.shape[0] == kRows, prediction
+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols

     with pytest.raises(ValueError):
         # evals_result is not supported in dask interface.
@@ -59,6 +59,7 @@ def test_from_dask_array(client):
     result = xgb.dask.train(client, {}, dtrain)

     prediction = xgb.dask.predict(client, result, dtrain)
+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols

     assert isinstance(prediction, da.Array)
@@ -71,6 +72,8 @@ def test_regressor(client):
     regressor.fit(X, y, eval_set=[(X, y)])
     prediction = regressor.predict(X)

+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols
+
     history = regressor.evals_result()

     assert isinstance(prediction, da.Array)
@@ -88,6 +91,8 @@ def test_classifier(client):
     classifier.fit(X, y, eval_set=[(X, y)])
     prediction = classifier.predict(X)

+    assert prediction.shape[0] == kRows and prediction.shape[1] == kCols
+
     history = classifier.evals_result()

     assert isinstance(prediction, da.Array)
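
Outside pytest, the predict path these tests cover can be exercised against a local cluster. A minimal sketch: the two-worker cluster is an arbitrary choice, and only the row count is asserted here, since the prediction's second dimension is exactly what this in-progress PR is still settling:

import xgboost as xgb
from dask import array as da
from dask.distributed import Client, LocalCluster

kRows = 1000
kCols = 10


def generate_array():
    partition_size = 20
    X = da.random.random((kRows, kCols), partition_size)
    y = da.random.random(kRows, partition_size)
    return X, y


if __name__ == '__main__':
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()
            dtrain = xgb.dask.DaskDMatrix(client, X, y)
            result = xgb.dask.train(client, {}, dtrain)
            prediction = xgb.dask.predict(client, result, dtrain)
            assert isinstance(prediction, da.Array)
            assert prediction.shape[0] == kRows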