Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move feature names and types of DMatrix from Python to C++. #5858

Merged
merged 13 commits into from
Jul 7, 2020
Merged
Next Next commit
Move feature names and types of DMatrix into C++.
  • Loading branch information
trivialfis committed Jul 5, 2020
commit 4937f36fd276ba78f184d70b108a9d180d78f7aa
28 changes: 26 additions & 2 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ enum class DataType : uint8_t {
kFloat32 = 1,
kDouble = 2,
kUInt32 = 3,
kUInt64 = 4
kUInt64 = 4,
kChar = 5
};

enum class FeatureType : uint8_t {
kNumerical
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this PR related to categorical data support?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. The first step.

};

/*!
Expand All @@ -40,7 +45,7 @@ enum class DataType : uint8_t {
class MetaInfo {
public:
/*! \brief number of data fields in MetaInfo */
static constexpr uint64_t kNumField = 9;
static constexpr uint64_t kNumField = 11;

/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
Expand Down Expand Up @@ -72,6 +77,19 @@ class MetaInfo {
*/
HostDeviceVector<bst_float> labels_upper_bound_; // NOLINT

/*!
* \brief Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q"
*/
std::vector<std::string> feature_type_names;
/*!
* \brief Name for each feature.
*/
std::vector<std::string> feature_names;
/*!
* \brief Type of each feature. Automatically set when feature_type_names is specified.
*/
HostDeviceVector<FeatureType> feature_types;

/*! \brief default constructor */
MetaInfo() = default;
MetaInfo(MetaInfo&& that) = default;
Expand Down Expand Up @@ -158,6 +176,12 @@ class MetaInfo {
*/
void SetInfo(const char* key, std::string const& interface_str);

/*!
* \brief Look up a meta info field by name and hand its contents back to the caller.
*
* The C API getters XGDMatrixGetFloatInfo and XGDMatrixGetUIntInfo forward to this
* method with DataType::kFloat32 and DataType::kUInt32 respectively.
*
* \param key      Name of the field to fetch.
* \param out_len  Output; presumably receives the number of elements in the field
*                 (TODO confirm against the implementation).
* \param dtype    Element type the caller expects the field to have.
* \param out_dptr Output; presumably receives a pointer to the field's data
*                 (TODO confirm ownership/lifetime against the implementation).
*/
void GetInfo(char const* key, bst_row_t* out_len, DataType dtype,
const void** out_dptr) const;

/*!
* \brief Set string-valued feature information.
*
* Called by the C API function XGDMatrixSetStrUFeatureInfo. The Python package
* uses the keys "feature_name" and "feature_type".
*
* \param key  Name of the feature info field to set.
* \param info Array of C strings. The Python package passes a null pointer
*             together with size 0 to reset a field.
* \param size Number of entries in info.
*/
void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
/*!
* \brief Fetch string-valued feature information.
*
* Called by the C API function XGDMatrixGetStrFeatureInfo.
*
* \param field        Name of the feature info field to read.
* \param out_str_vecs Output vector handed in by the caller; presumably filled with
*                     the strings stored for the field (TODO confirm).
*/
void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;

/*
* \brief Extend with other MetaInfo.
*
Expand Down
111 changes: 69 additions & 42 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes

DMatrix is an internal data structure that is used by XGBoost,
which is optimized for both memory efficiency and training speed.
You can construct DMatrix from numpy.arrays
You can construct DMatrix from multiple different sources of data.
"""

_feature_names = None # for previous version's pickle
_feature_types = None

def __init__(self, data, label=None, weight=None, base_margin=None,
missing=None,
silent=False,
Expand Down Expand Up @@ -362,11 +359,6 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle = None

if feature_names is not None:
self._feature_names = feature_names
if feature_types is not None:
self._feature_types = feature_types
return

handler = self.get_data_handler(data)
Expand Down Expand Up @@ -666,14 +658,16 @@ def slice(self, rindex, allow_groups=False):
res : DMatrix
A new DMatrix containing only selected indices.
"""
res = DMatrix(None, feature_names=self.feature_names,
feature_types=self.feature_types)
res = DMatrix(None)
res.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixSliceDMatrixEx(self.handle,
c_array(ctypes.c_int, rindex),
c_bst_ulong(len(rindex)),
ctypes.byref(res.handle),
ctypes.c_int(1 if allow_groups else 0)))
_check_call(_LIB.XGDMatrixSliceDMatrixEx(
self.handle,
c_array(ctypes.c_int, rindex),
c_bst_ulong(len(rindex)),
ctypes.byref(res.handle),
ctypes.c_int(1 if allow_groups else 0)))
res.feature_names = self.feature_names
res.feature_types = self.feature_types
return res

@property
Expand All @@ -684,20 +678,17 @@ def feature_names(self):
-------
feature_names : list or None
"""
if self._feature_names is None:
self._feature_names = ['f{0}'.format(i)
for i in range(self.num_col())]
return self._feature_names

@property
def feature_types(self):
"""Get feature types (column types).

Returns
-------
feature_types : list or None
"""
return self._feature_types
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
c_str('feature_name'),
ctypes.byref(length),
ctypes.byref(sarr)))
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ['f{0}'.format(i)
for i in range(self.num_col())]
return feature_names

@feature_names.setter
def feature_names(self, feature_names):
Expand Down Expand Up @@ -728,10 +719,41 @@ def feature_names(self, feature_names):
not any(x in f for x in set(('[', ']', '<')))
for f in feature_names):
raise ValueError('feature_names must be string, and may not contain [, ] or <')
c_feature_names = [bytes(f, encoding='utf-8')
for f in feature_names]
c_feature_names = (ctypes.c_char_p *
len(c_feature_names))(*c_feature_names)
_check_call(_LIB.XGDMatrixSetStrUFeatureInfo(
self.handle, c_str('feature_name'),
c_feature_names,
c_bst_ulong(len(feature_names))))
else:
# reset feature_types also
_check_call(_LIB.XGDMatrixSetStrUFeatureInfo(
self.handle,
c_str('feature_name'),
None,
c_bst_ulong(0)))
self.feature_types = None
self._feature_names = feature_names

@property
def feature_types(self):
    """Get feature types (column types).

    The types are read back from the native DMatrix through
    ``XGDMatrixGetStrFeatureInfo`` using the ``feature_type`` field.

    Returns
    -------
    feature_types : list or None
        ``None`` when the native side reports no feature types.
    """
    n_types = c_bst_ulong()
    c_types = ctypes.POINTER(ctypes.c_char_p)()
    _check_call(_LIB.XGDMatrixGetStrFeatureInfo(
        self.handle,
        c_str('feature_type'),
        ctypes.byref(n_types),
        ctypes.byref(c_types)))
    types = from_cstr_to_pystr(c_types, n_types)
    # An empty result means nothing is stored; report that as None.
    return types if types else None

@feature_types.setter
def feature_types(self, feature_types):
Expand All @@ -746,31 +768,36 @@ def feature_types(self, feature_types):
Labels for features. None will reset existing feature types
"""
if feature_types is not None:
if self._feature_names is None:
msg = 'Unable to set feature types before setting names'
raise ValueError(msg)

assert isinstance(feature_types, (list, str)), feature_types
if isinstance(feature_types, STRING_TYPES):
# single string will be applied to all columns
feature_types = [feature_types] * self.num_col()

try:
if not isinstance(feature_types, str):
feature_types = list(feature_types)
else:
feature_types = [feature_types]
except TypeError:
feature_types = [feature_types]
c_feature_types = [bytes(f, encoding='utf-8')
for f in feature_types]
c_feature_types = (ctypes.c_char_p *
len(c_feature_types))(*c_feature_types)
_check_call(_LIB.XGDMatrixSetStrUFeatureInfo(
self.handle, c_str('feature_type'),
c_feature_types,
c_bst_ulong(len(feature_types))))

if len(feature_types) != self.num_col():
msg = 'feature_types must have the same length as data'
raise ValueError(msg)

valid = ('int', 'float', 'i', 'q')
if not all(isinstance(f, STRING_TYPES) and f in valid
for f in feature_types):
raise ValueError('All feature_names must be {int, float, i, q}')
self._feature_types = feature_types
else:
# Reset.
_check_call(_LIB.XGDMatrixSetStrUFeatureInfo(
self.handle,
c_str('feature_type'),
None,
c_bst_ulong(0)))


class DeviceQuantileDMatrix(DMatrix):
Expand Down
65 changes: 41 additions & 24 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,45 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
API_END();
}

/*!
 * \brief Set string-valued feature information on a DMatrix.
 *
 * Forwards the strings to MetaInfo::SetFeatureInfo of the DMatrix behind the handle.
 *
 * \param handle Handle to the DMatrix (a pointer to std::shared_ptr<DMatrix>).
 * \param field  Name of the feature info field to set.
 * \param c_info Array of C strings to store.
 * \param size   Number of entries in c_info.
 * \return Status code produced by the API_BEGIN/API_END macros.
 */
XGB_DLL int XGDMatrixSetStrUFeatureInfo(DMatrixHandle handle, const char *field,
                                        const char **c_info,
                                        const xgboost::bst_ulong size) {
  API_BEGIN();
  CHECK_HANDLE();
  auto *p_fmat = static_cast<std::shared_ptr<DMatrix> *>(handle);
  (*p_fmat)->Info().SetFeatureInfo(field, c_info, size);
  API_END();
}

/*! \brief Store of C API return entries, keyed by the DMatrix they belong to. */
using DMatrixThreadLocal =
    dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;

/*!
 * \brief Fetch the return entry associated with a DMatrix.
 *
 * The lookup uses std::map::operator[], so an entry is default-constructed the
 * first time a given DMatrix is seen.
 *
 * \param m The DMatrix whose entry is requested; its raw pointer is the map key.
 * \return Reference to the entry stored for m.
 */
XGBAPIThreadLocalEntry& GetDMatrixThreadLocal(std::shared_ptr<DMatrix> m) {
  auto &entries = *DMatrixThreadLocal::Get();
  DMatrix const *key = m.get();
  return entries[key];
}

/*!
 * \brief Read string-valued feature information from a DMatrix.
 *
 * The strings are copied into the return entry obtained from
 * GetDMatrixThreadLocal for this DMatrix, and the caller receives pointers into
 * that entry. A later call for the same DMatrix reuses the same buffers, so the
 * returned pointers should be consumed before calling again.
 *
 * \param handle       Handle to the DMatrix (a pointer to std::shared_ptr<DMatrix>).
 * \param field        Name of the feature info field to read.
 * \param len          Output; number of strings returned.
 * \param out_features Output; array of C strings of length *len.
 * \return Status code produced by the API_BEGIN/API_END macros.
 */
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                       xgboost::bst_ulong *len,
                                       const char ***out_features) {
  API_BEGIN();
  CHECK_HANDLE();
  // Resolve the handle once and reuse it for both the meta info and the entry.
  auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
  auto const &info = p_fmat->Info();

  // Single lookup of the return entry; both buffers live in the same entry.
  auto &entry = GetDMatrixThreadLocal(p_fmat);
  std::vector<const char *> &charp_vecs = entry.ret_vec_charp;
  std::vector<std::string> &str_vecs = entry.ret_vec_str;

  info.GetFeatureInfo(field, &str_vecs);

  // Expose the strings as an array of C pointers backed by str_vecs.
  charp_vecs.resize(str_vecs.size());
  for (size_t i = 0; i < str_vecs.size(); ++i) {
    charp_vecs[i] = str_vecs[i].c_str();
  }
  *out_features = dmlc::BeginPtr(charp_vecs);
  *len = static_cast<xgboost::bst_ulong>(charp_vecs.size());
  API_END();
}

XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
const unsigned* group,
xgboost::bst_ulong len) {
Expand All @@ -301,22 +340,7 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
API_BEGIN();
CHECK_HANDLE();
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
const std::vector<bst_float>* vec = nullptr;
if (!std::strcmp(field, "label")) {
vec = &info.labels_.HostVector();
} else if (!std::strcmp(field, "weight")) {
vec = &info.weights_.HostVector();
} else if (!std::strcmp(field, "base_margin")) {
vec = &info.base_margin_.HostVector();
} else if (!std::strcmp(field, "label_lower_bound")) {
vec = &info.labels_lower_bound_.HostVector();
} else if (!std::strcmp(field, "label_upper_bound")) {
vec = &info.labels_upper_bound_.HostVector();
} else {
LOG(FATAL) << "Unknown float field name " << field;
}
*out_len = static_cast<xgboost::bst_ulong>(vec->size()); // NOLINT
*out_dptr = dmlc::BeginPtr(*vec);
info.GetInfo(field, out_len, DataType::kFloat32, reinterpret_cast<void const**>(out_dptr));
API_END();
}

Expand All @@ -327,14 +351,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
API_BEGIN();
CHECK_HANDLE();
const MetaInfo& info = static_cast<std::shared_ptr<DMatrix>*>(handle)->get()->Info();
const std::vector<unsigned>* vec = nullptr;
if (!std::strcmp(field, "group_ptr")) {
vec = &info.group_ptr_;
} else {
LOG(FATAL) << "Unknown uint field name " << field;
}
*out_len = static_cast<xgboost::bst_ulong>(vec->size());
*out_dptr = dmlc::BeginPtr(*vec);
info.GetInfo(field, out_len, DataType::kUInt32, reinterpret_cast<void const**>(out_dptr));
API_END();
}

Expand Down
2 changes: 2 additions & 0 deletions src/common/host_device_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ void HostDeviceVector<T>::SetDevice(int device) const {}
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t
Expand Down
1 change: 1 addition & 0 deletions src/common/host_device_vector.cu
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,7 @@ template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t
Expand Down
Loading