-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Move feature names and types of DMatrix from Python to C++. #5858
Changes from 12 commits
4937f36
2f8640f
3f928fb
dcb7d2b
463587c
9baf895
b133907
01f3ed5
5d2923b
d687456
7e9c42a
80ff55a
590fa55
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, | |
const unsigned *array, | ||
bst_ulong len); | ||
|
||
/*! | ||
* \brief Set string encoded information of all features. | ||
* | ||
* Accepted fields are: | ||
* - feature_name | ||
* - feature_type | ||
* | ||
* \param handle An instance of data matrix | ||
* \param field Field name | ||
* \param features Pointer to array of strings. | ||
* \param size Size of `features` pointer (number of strings passed in). | ||
* | ||
* \return 0 when success, -1 when failure happens | ||
* | ||
* \code | ||
* | ||
* char const* feat_names [] {"feat_0", "feat_1"}; | ||
* XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2); | ||
* | ||
* // i for integer, q for quantitative. Similarly "int" and "float" are also recognized. | ||
* char const* feat_types [] {"i", "q"}; | ||
* XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2); | ||
* | ||
* \endcode | ||
*/ | ||
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field, | ||
const char **features, | ||
const bst_ulong size); | ||
|
||
/*! | ||
* \brief Get string encoded information of all features. | ||
* | ||
* Accepted fields are: | ||
* - feature_name | ||
* - feature_type | ||
* | ||
* Caller is responsible for copying out the data, before next call to any API function of | ||
* XGBoost. | ||
* | ||
* \param handle An instance of data matrix | ||
* \param field Field name | ||
* \param size Size of output pointer `features` (number of strings returned). | ||
* \param out_features Address of a pointer to array of strings. Result is stored in | ||
* thread local memory. | ||
* | ||
* \return 0 when success, -1 when failure happens | ||
* | ||
* \code | ||
* | ||
* char const **c_out_features = NULL; | ||
* bst_ulong out_size = 0; | ||
* | ||
* // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`. | ||
* XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size, | ||
* &c_out_features); | ||
* | ||
* for (bst_ulong i = 0; i < out_size; ++i) { | ||
* // Here we are simply printing the string. Copy it out if the feature name is | ||
* // useful after printing. | ||
* printf("feature %lu: %s\n", i, c_out_features[i]); | ||
* } | ||
* | ||
* \endcode | ||
*/ | ||
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is |
||
bst_ulong *size, | ||
const char ***out_features); | ||
|
||
/*! | ||
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix | ||
* \param handle an instance of data matrix | ||
|
@@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, | |
* | ||
* - Functions with the term "Model" handles saving/loading XGBoost model like trees or | ||
* linear weights. Stripping out parameters configuration like training algorithms or | ||
* CUDA device ID helps user to reuse the trained model for different tasks, examples | ||
* are prediction, training continuation or interpretation. | ||
* CUDA device ID. These functions are designed to let users reuse the trained model | ||
* for different tasks, examples are prediction, training continuation or model | ||
* interpretation. | ||
* | ||
* - Functions with the term "Config" handles save/loading configuration. It helps user | ||
* to study the internal of XGBoost. Also user can use the load method for specifying | ||
|
@@ -592,15 +661,15 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, | |
/*! | ||
* \brief Load model from existing file | ||
* \param handle handle | ||
* \param fname file name | ||
* \param fname File URI or file name. | ||
* \return 0 when success, -1 when failure happens | ||
*/ | ||
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, | ||
const char *fname); | ||
/*! | ||
* \brief Save model into existing file | ||
* \param handle handle | ||
* \param fname file name | ||
* \param fname File URI or file name. | ||
* \return 0 when success, -1 when failure happens | ||
*/ | ||
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,12 @@ enum class DataType : uint8_t { | |
kFloat32 = 1, | ||
kDouble = 2, | ||
kUInt32 = 3, | ||
kUInt64 = 4 | ||
kUInt64 = 4, | ||
kStr = 5 | ||
}; | ||
|
||
enum class FeatureType : uint8_t { | ||
kNumerical | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this PR related to categorical data support? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. The first step. |
||
}; | ||
|
||
/*! | ||
|
@@ -40,7 +45,7 @@ enum class DataType : uint8_t { | |
class MetaInfo { | ||
public: | ||
/*! \brief number of data fields in MetaInfo */ | ||
static constexpr uint64_t kNumField = 9; | ||
static constexpr uint64_t kNumField = 11; | ||
|
||
/*! \brief number of rows in the data */ | ||
uint64_t num_row_{0}; // NOLINT | ||
|
@@ -72,6 +77,19 @@ class MetaInfo { | |
*/ | ||
HostDeviceVector<bst_float> labels_upper_bound_; // NOLINT | ||
|
||
/*! | ||
* \brief Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q" | ||
*/ | ||
std::vector<std::string> feature_type_names; | ||
/*! | ||
* \brief Name for each feature. | ||
*/ | ||
std::vector<std::string> feature_names; | ||
/* | ||
* \brief Type of each feature. Automatically set when feature_type_names is specified. | ||
*/ | ||
HostDeviceVector<FeatureType> feature_types; | ||
|
||
/*! \brief default constructor */ | ||
MetaInfo() = default; | ||
MetaInfo(MetaInfo&& that) = default; | ||
|
@@ -158,6 +176,12 @@ class MetaInfo { | |
*/ | ||
void SetInfo(const char* key, std::string const& interface_str); | ||
|
||
void GetInfo(char const* key, bst_ulong* out_len, DataType dtype, | ||
const void** out_dptr) const; | ||
|
||
void SetFeatureInfo(const char *key, const char **info, const bst_ulong size); | ||
void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const; | ||
|
||
/* | ||
* \brief Extend with other MetaInfo. | ||
* | ||
|
@@ -432,6 +456,8 @@ class BatchSet { | |
BatchIterator<T> begin_iter_; | ||
}; | ||
|
||
struct XGBAPIThreadLocalEntry; | ||
|
||
/*! | ||
* \brief Internal data structure used by XGBoost during training. | ||
*/ | ||
|
@@ -450,6 +476,10 @@ class DMatrix { | |
} | ||
/*! \brief meta information of the dataset */ | ||
virtual const MetaInfo& Info() const = 0; | ||
|
||
/*! \brief Get thread local memory for returning data from DMatrix. */ | ||
XGBAPIThreadLocalEntry& GetThreadLocal() const; | ||
|
||
/** | ||
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches. | ||
*/ | ||
|
@@ -462,7 +492,7 @@ class DMatrix { | |
/*! \return Whether the data columns single column block. */ | ||
virtual bool SingleColBlock() const = 0; | ||
/*! \brief virtual destructor */ | ||
virtual ~DMatrix() = default; | ||
virtual ~DMatrix(); | ||
|
||
/*! \brief Whether the matrix is dense. */ | ||
bool IsDense() const { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is
Str
in the name? Can we call this function XGDMatrixSetFeatureInfo()
instead?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just trying to be consistent with the other
SetInfo
functions. String is different from other types. If in the future we have something like column weights, we would pass a pointer, instead of pointer to array.