Skip to content

Commit

Permalink
Simplify the data backends. (#5893)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Jul 16, 2020
1 parent 7aee0e5 commit 029a8b5
Show file tree
Hide file tree
Showing 8 changed files with 793 additions and 809 deletions.
210 changes: 70 additions & 140 deletions python-package/xgboost/core.py

Large diffs are not rendered by default.

1,313 changes: 682 additions & 631 deletions python-package/xgboost/data.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/data/proxy_dmatrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ void DMatrixProxy::FromCudaColumnar(std::string interface_str) {
auto const& value = adapter->Value();
this->batch_ = adapter;
device_ = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
}

void DMatrixProxy::FromCudaArray(std::string interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter(interface_str));
this->batch_ = adapter;
device_ = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
}

} // namespace data
Expand Down
7 changes: 4 additions & 3 deletions tests/python-gpu/test_device_quantile_dmatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
class TestDeviceQuantileDMatrix(unittest.TestCase):
def test_dmatrix_numpy_init(self):
data = np.random.randn(5, 5)
with pytest.raises(AssertionError, match='is not supported for DeviceQuantileDMatrix'):
dm = xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))
with pytest.raises(TypeError,
match='is not supported for DeviceQuantileDMatrix'):
xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))

@pytest.mark.skipif(**tm.no_cupy())
def test_dmatrix_cupy_init(self):
import cupy as cp
data = cp.random.randn(5, 5)
dm = xgb.DeviceQuantileDMatrix(data, cp.ones(5, dtype=np.float64))
xgb.DeviceQuantileDMatrix(data, cp.ones(5, dtype=np.float64))
16 changes: 8 additions & 8 deletions tests/python-gpu/test_from_cudf.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,10 @@ def _test_cudf_metainfo(DMatrixT):
dmat.set_float_info('label', floats)
dmat.set_float_info('base_margin', floats)
dmat.set_uint_info('group', uints)
dmat_cudf.set_interface_info('weight', cudf_floats)
dmat_cudf.set_interface_info('label', cudf_floats)
dmat_cudf.set_interface_info('base_margin', cudf_floats)
dmat_cudf.set_interface_info('group', cudf_uints)
dmat_cudf.set_info(weight=cudf_floats)
dmat_cudf.set_info(label=cudf_floats)
dmat_cudf.set_info(base_margin=cudf_floats)
dmat_cudf.set_info(group=cudf_uints)

# Test setting info with cudf DataFrame
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
Expand All @@ -132,10 +132,10 @@ def _test_cudf_metainfo(DMatrixT):
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))

# Test setting info with cudf Series
dmat_cudf.set_interface_info('weight', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('label', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('base_margin', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('group', cudf_uints[cudf_uints.columns[0]])
dmat_cudf.set_info(weight=cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_info(label=cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_info(base_margin=cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_info(group=cudf_uints[cudf_uints.columns[0]])
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
Expand Down
8 changes: 4 additions & 4 deletions tests/python-gpu/test_from_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,10 @@ def _test_cupy_metainfo(DMatrixT):
dmat.set_float_info('label', floats)
dmat.set_float_info('base_margin', floats)
dmat.set_uint_info('group', uints)
dmat_cupy.set_interface_info('weight', cupy_floats)
dmat_cupy.set_interface_info('label', cupy_floats)
dmat_cupy.set_interface_info('base_margin', cupy_floats)
dmat_cupy.set_interface_info('group', cupy_uints)
dmat_cupy.set_info(weight=cupy_floats)
dmat_cupy.set_info(label=cupy_floats)
dmat_cupy.set_info(base_margin=cupy_floats)
dmat_cupy.set_info(group=cupy_uints)

# Test setting info with cupy
assert np.array_equal(dmat.get_float_info('weight'),
Expand Down
32 changes: 16 additions & 16 deletions tests/python/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
# -*- coding: utf-8 -*-
import sys
from contextlib import contextmanager
try:
# python 2
from StringIO import StringIO
except ImportError:
# python 3
from io import StringIO
from io import StringIO
import numpy as np
import os
import xgboost as xgb
import unittest
import json
from pathlib import Path
import tempfile

dpath = 'demo/data/'
rng = np.random.RandomState(1994)
Expand Down Expand Up @@ -66,16 +63,19 @@ def test_basic(self):
# error must be smaller than 10%
assert err < 0.1

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
# save model
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
with tempfile.TemporaryDirectory() as tmpdir:
dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
# save dmatrix into binary buffer
dtest.save_binary(dtest_path)
# save model
model_path = os.path.join(tmpdir, 'model.booster')
bst.save_model(model_path)
# load model and data in
bst2 = xgb.Booster(model_file=model_path)
dtest2 = xgb.DMatrix(dtest_path)
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0

def test_record_results(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
Expand Down
12 changes: 5 additions & 7 deletions tests/python/test_with_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ def test_pandas(self):
# 0 1 1 0 0
# 1 2 0 1 0
# 2 3 0 0 1
pandas_handler = xgb.data.PandasHandler(np.nan, 0, False)
result, _, _ = pandas_handler._maybe_pandas_data(dummies, None, None)
result, _, _ = xgb.data._transform_pandas_df(dummies)
exp = np.array([[1., 1., 0., 0.],
[2., 0., 1., 0.],
[3., 0., 0., 1.]])
Expand Down Expand Up @@ -129,18 +128,17 @@ def test_pandas_sparse(self):
def test_pandas_label(self):
# label must be a single column
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
pandas_handler = xgb.data.PandasHandler(np.nan, 0, False)
self.assertRaises(ValueError, pandas_handler._maybe_pandas_data, df,
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')

# label must be supported dtype
df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
self.assertRaises(ValueError, pandas_handler._maybe_pandas_data, df,
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')

df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
result, _, _ = pandas_handler._maybe_pandas_data(df, None, None,
'label', 'float')
result, _, _ = xgb.data._transform_pandas_df(df, None, None,
'label', 'float')
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
dtype=float))
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
Expand Down

0 comments on commit 029a8b5

Please sign in to comment.