Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Target and Count encodings for categorical features - Iteration 1 #4964

Closed
wants to merge 22 commits into from
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ file(
src/objective/*.cpp
src/network/*.cpp
src/treelearner/*.cpp
src/feature_engineering/*.cpp
if(USE_CUDA)
src/treelearner/*.cu
endif()
Expand Down
53 changes: 53 additions & 0 deletions src/feature_engineering/category_feature_count_encoder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/

#include "category_feature_encoder.hpp"

// JSON property names used when a count encoder is dumped to / recovered from JSON.
constexpr char count_information_key[] = "count_information";
constexpr char category_key[] = "cat";
constexpr char value_key[] = "value";

namespace LightGBM {
/*!
 * \brief Encode a categorical value as the count stored for its category.
 * \param feature_value Raw feature value; converted to int with static_cast to obtain the category id.
 * \return The count recorded in count_information_ for that category, or default_value when the category is absent.
 */
double CategoryFeatureCountEncoder::Encode(double feature_value) {
  const int category = static_cast<int>(feature_value);

  // Single lookup: find() both tests for presence and yields the value,
  // so the map is hashed once and is never touched through operator[].
  const auto found = count_information_.find(category);
  if (found == count_information_.end()) {
    return default_value;
  }

  return found->second;
}

/*!
 * \brief Serialize this encoder to a JSON object.
 * \return The object produced by CategoryFeatureEncoder::DumpToJsonObject() with an
 *         additional "count_information" array holding one {"cat", "value"} object per stored category.
 */
json11::Json::object CategoryFeatureCountEncoder::DumpToJsonObject() {
  json11::Json::object dumped = CategoryFeatureEncoder::DumpToJsonObject();

  json11::Json::array entries;
  for (auto it = count_information_.begin(); it != count_information_.end(); ++it) {
    // Each map entry becomes its own small JSON object.
    json11::Json::object entry;
    entry[category_key] = json11::Json(it->first);
    entry[value_key] = json11::Json(it->second);
    entries.emplace_back(entry);
  }

  dumped[count_information_key] = json11::Json(entries);
  return dumped;
}

/*!
 * \brief Rebuild a count encoder from its JSON representation.
 * \param input JSON object whose "count_information" member is an array of {"cat", "value"} objects.
 * \return A new CategoryFeatureCountEncoder holding the recovered category -> count map.
 */
std::unique_ptr<CategoryFeatureEncoder> CategoryFeatureCountEncoder::RecoverFromModelStringInJsonFormat(json11::Json input) {
  std::unordered_map<int, int> count_information;

  // Bind by const reference: the array lives inside `input`, which outlives this loop,
  // so neither the array nor its elements need to be copied.
  const std::vector<Json>& entries = input[count_information_key].array_items();
  count_information.reserve(entries.size());
  for (const Json& entry : entries) {
    const int category = entry[category_key].int_value();
    const int count = entry[value_key].int_value();

    count_information[category] = count;
  }

  // The local map is no longer needed, so hand it over instead of copying it.
  return std::unique_ptr<CategoryFeatureEncoder>(new CategoryFeatureCountEncoder(std::move(count_information)));
}
} // namespace LightGBM
43 changes: 43 additions & 0 deletions src/feature_engineering/category_feature_encoder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/

#include "category_feature_encoder.hpp"
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/json11.h>

#include <memory>
#include <utility>

// JSON property names shared by every encoder when dumped to / recovered from JSON.
constexpr char feature_name_key[] = "feature_name";
constexpr char encoder_type_key[] = "encoder_type";

namespace LightGBM {
/*!
 * \brief Serialize the fields common to all encoders.
 * \return JSON object with "encoder_type" set to type_ and "feature_name" set to feature_name_.
 */
json11::Json::object CategoryFeatureEncoder::DumpToJsonObject() {
  json11::Json::object dumped;
  dumped[encoder_type_key] = json11::Json(type_);
  dumped[feature_name_key] = json11::Json(feature_name_);
  return dumped;
}

/*!
 * \brief Rebuild an encoder from its JSON representation, dispatching on the stored encoder type.
 * \param input JSON object carrying "encoder_type", "feature_name" and the subclass-specific fields.
 * \return The encoder created by the matching subclass, with feature_name_ restored from the JSON.
 */
std::unique_ptr<CategoryFeatureEncoder> CategoryFeatureEncoder::RecoverFromModelStringInJsonFormat(json11::Json input) {
  const int type = input[encoder_type_key].int_value();
  std::unique_ptr<CategoryFeatureEncoder> result;

  // The subclass factories return temporaries, so plain assignment already moves them;
  // wrapping them in std::move adds nothing.
  if (type == CategoryFeatureCountEncoder::count_encoder_type) {
    result = CategoryFeatureCountEncoder::RecoverFromModelStringInJsonFormat(input);
  } else if (type == CategoryFeatureTargetEncoder::target_encoder_type) {
    result = CategoryFeatureTargetEncoder::RecoverFromModelStringInJsonFormat(input);
  } else {
    Log::Fatal("Unknown encoder type %d", type);
  }

  // NOTE(review): assumes Log::Fatal does not return; otherwise result is null here — confirm.
  // The name is assigned straight from the JSON value, avoiding an intermediate string copy.
  result->feature_name_ = input[feature_name_key].string_value();
  return result;
}
} // namespace LightGBM
184 changes: 184 additions & 0 deletions src/feature_engineering/category_feature_encoder.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_ENCODER_HPP_
#define LIGHTGBM_ENCODER_HPP_

#include <LightGBM/utils/json11.h>
#include <LightGBM/meta.h>

#include <string>
#include <memory>
#include <utility>
#include <unordered_map>
#include <vector>

namespace LightGBM {

using json11::Json;

/*!
 * \brief Abstract base class for encoders that turn a categorical feature value into a number.
 *
 * Holds the encoder's integer type id and a feature name, and declares the
 * Encode / DumpToJsonObject interface that concrete encoders override.
 */
class CategoryFeatureEncoder {
 public:
  /*! \brief Create an encoder with the given type id and an empty feature name. */
  explicit CategoryFeatureEncoder(int type) : type_(type) {}

  /*! \brief Create an encoder with the given feature name and type id. */
  CategoryFeatureEncoder(const std::string& feature_name, int type) : feature_name_(feature_name), type_(type) {}

  /*!
   * \brief Virtual destructor.
   *
   * Encoders are owned through std::unique_ptr<CategoryFeatureEncoder>; without a virtual
   * destructor, destroying a derived encoder through that pointer is undefined behaviour.
   */
  virtual ~CategoryFeatureEncoder() = default;

  /*! \brief Return a copy of the feature name. */
  std::string GetFeatureName() const {
    return feature_name_;
  }

  /*! \brief Return the integer type id passed at construction. */
  int GetTypeId() const {
    return type_;
  }

  /*! \brief Encode a single feature value; implemented by subclasses. */
  virtual double Encode(double feature_value) = 0;

  /*! \brief Serialize the encoder to a JSON object; subclasses extend the base-class output. */
  virtual json11::Json::object DumpToJsonObject() = 0;

  /*! \brief Rebuild an encoder of the appropriate subclass from its JSON representation. */
  static std::unique_ptr<CategoryFeatureEncoder> RecoverFromModelStringInJsonFormat(json11::Json input);

 protected:
  std::string feature_name_;
  int type_;
};

/*!
 * \brief Encoder backed by a category -> count map.
 *
 * Encode and DumpToJsonObject are declared here and defined outside this header.
 */
class CategoryFeatureCountEncoder : public CategoryFeatureEncoder {
 public:
  /*!
   * \brief Create a count encoder without a feature name.
   * \param count_information Category -> count map; taken by value and moved into the encoder.
   */
  explicit CategoryFeatureCountEncoder(std::unordered_map<int, int> count_information)
    : CategoryFeatureEncoder(count_encoder_type), count_information_(std::move(count_information)) {}

  /*!
   * \brief Create a count encoder for a named feature.
   * \param feature_name Feature name forwarded to the base class.
   * \param count_information Category -> count map; taken by value and moved into the encoder.
   */
  CategoryFeatureCountEncoder(std::string feature_name, std::unordered_map<int, int> count_information)
    : CategoryFeatureEncoder(std::move(feature_name), count_encoder_type), count_information_(std::move(count_information)) {}

  double Encode(double feature_value) override;

  json11::Json::object DumpToJsonObject() override;

  /*! \brief Rebuild a count encoder from its JSON representation. */
  static std::unique_ptr<CategoryFeatureEncoder> RecoverFromModelStringInJsonFormat(json11::Json input);

  // type id passed to the base class and compared against when recovering from JSON
  static const int count_encoder_type = 1;

 private:
  // <category_id, count>
  std::unordered_map<int, int> count_information_;

  // value available to Encode for categories that are not in count_information_
  const double default_value = 0.0;
};

/*!
 * \brief Encoder holding a prior, a prior weight, and per-category count and label maps.
 *
 * Encode and DumpToJsonObject are declared here and defined outside this header.
 */
class CategoryFeatureTargetEncoder : public CategoryFeatureEncoder {
 public:
  /*!
   * \brief Create a target encoder without a feature name.
   * \param prior Stored in prior_.
   * \param prior_weight Stored in prior_weight_.
   * \param count_information Per-category int map; taken by value and moved into the encoder.
   * \param label_information Per-category double map; taken by value and moved into the encoder.
   */
  CategoryFeatureTargetEncoder(double prior, double prior_weight, std::unordered_map<int, int> count_information, std::unordered_map<int, double> label_information)
    : CategoryFeatureEncoder(target_encoder_type), prior_(prior), prior_weight_(prior_weight),
      count_information_(std::move(count_information)), label_information_(std::move(label_information)) {}

  /*!
   * \brief Create a target encoder for a named feature.
   * \param feature_name Feature name forwarded to the base class.
   * \param prior Stored in prior_.
   * \param prior_weight Stored in prior_weight_.
   * \param count_information Per-category int map; taken by value and moved into the encoder.
   * \param label_information Per-category double map; taken by value and moved into the encoder.
   */
  CategoryFeatureTargetEncoder(std::string feature_name, double prior, double prior_weight, std::unordered_map<int, int> count_information, std::unordered_map<int, double> label_information)
    : CategoryFeatureEncoder(std::move(feature_name), target_encoder_type), prior_(prior), prior_weight_(prior_weight),
      count_information_(std::move(count_information)), label_information_(std::move(label_information)) {}

  double Encode(double feature_value) override;

  json11::Json::object DumpToJsonObject() override;

  /*! \brief Rebuild a target encoder from its JSON representation. */
  static std::unique_ptr<CategoryFeatureEncoder> RecoverFromModelStringInJsonFormat(json11::Json input);

  // type id passed to the base class and compared against when recovering from JSON
  static const int target_encoder_type = 2;

 private:
  double prior_;
  double prior_weight_;
  // <category_id, count>
  std::unordered_map<int, int> count_information_;
  // <category_id, label value>  NOTE(review): presumably a per-category label sum — confirm against Encode.
  std::unordered_map<int, double> label_information_;

  // fallback constant available to Encode
  const double default_value = 0.0;
};

/*!
 * \brief Count and label statistics for one categorical feature.
 *
 * The scalar members carry default initializers so that a default-constructed
 * instance starts from zero instead of holding indeterminate values.
 */
struct CategoryFeatureTargetInformation {
  // <category_id, category_total_count>
  std::unordered_map<int, int> category_count;

  // <category_id, label_sum>
  std::unordered_map<int, double> category_label_sum;

  // zero until records are accumulated
  int total_count = 0;

  // zero until records are accumulated
  double label_sum = 0.0;
};

/*!
 * \brief Holds per-fold and global statistics for a set of categorical features.
 *
 * HandleRecord and AppendFrom are declared here and defined outside this header.
 */
class CategoryFeatureTargetInformationCollector {
 public:
  /*!
   * \brief Create a collector for the given features and number of folds.
   * \param categorical_features Ids of the categorical features; taken by value and moved into the collector.
   * \param fold_count Number of folds; the per-fold vectors are created with this many elements.
   */
  CategoryFeatureTargetInformationCollector(std::vector<int> categorical_features, int fold_count)
    : categorical_features_(std::move(categorical_features)),  // initialized once instead of default-constructed then copy-assigned
      count_(fold_count),
      label_sum_(fold_count),
      category_target_information_(fold_count) {}

  void HandleRecord(int fold_id, const std::vector<double>& record, double label);

  void AppendFrom(const CategoryFeatureTargetInformationCollector& collector);

  /*! \brief Per-fold statistics, indexed by fold id and then keyed by feature id. */
  const std::vector<std::unordered_map<int, CategoryFeatureTargetInformation>>& GetCategoryTargetInformation() const {
    return category_target_information_;
  }

  /*! \brief The categorical feature ids supplied at construction. */
  const std::vector<int>& GetCategoricalFeatures() const {
    return categorical_features_;
  }

  /*! \brief Per-fold row counts. */
  const std::vector<data_size_t>& GetCounts() const {
    return count_;
  }

  /*! \brief Per-fold label sums. */
  const std::vector<double>& GetLabelSum() const {
    return label_sum_;
  }

  /*! \brief Statistics across all folds, keyed by feature id. */
  const std::unordered_map<int, CategoryFeatureTargetInformation>& GetGlobalCategoryTargetInformation() const {
    return global_category_target_information_;
  }

 private:
  std::vector<int> categorical_features_;

  // <fold_id, row_count>
  std::vector<data_size_t> count_;

  // <fold_id, label_sum>
  std::vector<double> label_sum_;

  // <fold_id, <feature_id, CategoryFeatureTargetInformation>>
  std::vector<std::unordered_map<int, CategoryFeatureTargetInformation>> category_target_information_;

  // <feature_id, CategoryFeatureTargetInformation>
  std::unordered_map<int, CategoryFeatureTargetInformation> global_category_target_information_;
};

// One encoded value together with a feature name; element type of the vectors
// returned by CategoryFeatureEncoderManager::Encode.
struct EncodeResult {
// Encoded numeric value.
double value;
// Feature name attached to the value.
// NOTE(review): presumably the feature name of the encoder that produced it — confirm against the Encode implementation.
std::string feature_name;
};

// Owns two collections of encoders: one indexed by fold id and then feature id,
// and one keyed by feature id only. Member functions other than the constructor
// are declared here and defined outside this header.
class CategoryFeatureEncoderManager {
public:
// Takes over both encoder collections by moving out of the arguments, so the
// caller's containers are left in a moved-from state after construction.
// The parameters are non-const lvalue references, hence the lint suppression below.
// NOLINTNEXTLINE
CategoryFeatureEncoderManager(std::vector<std::unordered_map<int, std::vector<std::unique_ptr<CategoryFeatureEncoder>>>>& train_category_feature_encoders, std::unordered_map<int, std::vector<std::unique_ptr<CategoryFeatureEncoder>>>& category_feature_encoders)
: train_category_feature_encoders_(std::move(train_category_feature_encoders)), category_feature_encoders_(std::move(category_feature_encoders)) { }

// Encode a feature value for a given fold and feature.
// NOTE(review): presumably uses the per-fold encoders in train_category_feature_encoders_ — confirm in the definition.
std::vector<EncodeResult> Encode(int fold_id, int feature_id, double feature_value);

// Encode a feature value for a given feature, without a fold id.
// NOTE(review): presumably uses category_feature_encoders_ — confirm in the definition.
std::vector<EncodeResult> Encode(int feature_id, double feature_value);

// Serialize the manager to a string (JSON format, per the function name).
std::string DumpToModelStringInJsonFormat();

// Rebuild a manager from a string produced in JSON format.
static std::unique_ptr<CategoryFeatureEncoderManager> RecoverFromModelStringInJsonFormat(std::string input);

// Build a manager from JSON settings and the statistics held by a collector.
static std::unique_ptr<CategoryFeatureEncoderManager> Create(json11::Json settings, const CategoryFeatureTargetInformationCollector& informationCollector);

private:
// <fold_id, <feature_id, Encoders>>
std::vector<std::unordered_map<int, std::vector<std::unique_ptr<CategoryFeatureEncoder>>>> train_category_feature_encoders_;

// <feature_id, Encoders>
std::unordered_map<int, std::vector<std::unique_ptr<CategoryFeatureEncoder>>> category_feature_encoders_;
};

} // namespace LightGBM

#endif
Loading