Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix address alignment, required by cran #3415

Merged
merged 14 commits into from
Sep 30, 2020
16 changes: 9 additions & 7 deletions include/LightGBM/feature_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ class FeatureGroup {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_multi_val_);
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_);
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper
bin_mappers_.clear();
bin_offsets_.clear();
Expand Down Expand Up @@ -301,9 +301,9 @@ class FeatureGroup {
* \param file File want to write
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&is_multi_val_, sizeof(is_multi_val_));
writer->Write(&is_sparse_, sizeof(is_sparse_));
writer->Write(&num_feature_, sizeof(num_feature_));
writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer);
}
Expand All @@ -320,7 +320,9 @@ class FeatureGroup {
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
VirtualFileWriter::AlignedSize(sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
Expand Down
19 changes: 19 additions & 0 deletions include/LightGBM/utils/file_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstring>
#include <iostream>
#include <memory>
#include <vector>

namespace LightGBM {

Expand All @@ -31,6 +32,16 @@ struct VirtualFileWriter {
* \return Number of bytes written
*/
virtual size_t Write(const void* data, size_t bytes) const = 0;

size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) const {
auto ret = Write(data, bytes);
if (bytes % alignment != 0) {
size_t padding = AlignedSize(bytes, alignment) - bytes;
std::vector<char> tmp(padding, 0);
ret += Write(tmp.data(), padding);
}
return ret;
}
/*!
* \brief Create appropriate writer for filename
* \param filename Filename of the data
Expand All @@ -43,6 +54,14 @@ struct VirtualFileWriter {
* \return True when the file exists
*/
static bool Exists(const std::string& filename);

static size_t AlignedSize(size_t bytes, size_t alignment = 8) {
if (bytes % alignment == 0) {
return bytes;
} else {
return bytes / alignment * alignment + alignment;
}
}
};

/**
Expand Down
57 changes: 32 additions & 25 deletions src/io/bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,36 +522,37 @@ namespace LightGBM {

int BinMapper::SizeForSpecificBin(int bin) {
int size = 0;
size += sizeof(int);
size += sizeof(MissingType);
size += sizeof(bool);
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(int));
guolinke marked this conversation as resolved.
Show resolved Hide resolved
size +=
static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(MissingType)));
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(bool)));
size += sizeof(double);
size += sizeof(BinType);
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(BinType)));
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t) * 2;
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(uint32_t)) * 2;
guolinke marked this conversation as resolved.
Show resolved Hide resolved
return size;
}

void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += sizeof(missing_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(buffer, &is_trivial_, sizeof(is_trivial_));
buffer += sizeof(is_trivial_);
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(buffer, &most_freq_bin_, sizeof(most_freq_bin_));
buffer += sizeof(most_freq_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
Expand All @@ -561,23 +562,23 @@ namespace LightGBM {

void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += sizeof(num_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += sizeof(missing_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(&is_trivial_, buffer, sizeof(is_trivial_));
buffer += sizeof(is_trivial_);
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(&most_freq_bin_, buffer, sizeof(most_freq_bin_));
buffer += sizeof(most_freq_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
Expand All @@ -592,15 +593,15 @@ namespace LightGBM {
}

void BinMapper::SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&num_bin_, sizeof(num_bin_));
writer->Write(&missing_type_, sizeof(missing_type_));
writer->Write(&is_trivial_, sizeof(is_trivial_));
writer->AlignedWrite(&num_bin_, sizeof(num_bin_));
writer->AlignedWrite(&missing_type_, sizeof(missing_type_));
writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_));
writer->Write(&sparse_rate_, sizeof(sparse_rate_));
writer->Write(&bin_type_, sizeof(bin_type_));
writer->AlignedWrite(&bin_type_, sizeof(bin_type_));
writer->Write(&min_val_, sizeof(min_val_));
writer->Write(&max_val_, sizeof(max_val_));
writer->Write(&default_bin_, sizeof(default_bin_));
writer->Write(&most_freq_bin_, sizeof(most_freq_bin_));
writer->AlignedWrite(&default_bin_, sizeof(default_bin_));
writer->AlignedWrite(&most_freq_bin_, sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
writer->Write(bin_upper_bound_.data(), sizeof(double) * num_bin_);
} else {
Expand All @@ -609,8 +610,14 @@ namespace LightGBM {
}

size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trivial_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_) + sizeof(most_freq_bin_);
size_t ret = VirtualFileWriter::AlignedSize(sizeof(num_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(missing_type_)) +
VirtualFileWriter::AlignedSize(sizeof(is_trivial_)) +
sizeof(sparse_rate_) +
VirtualFileWriter::AlignedSize(sizeof(bin_type_)) +
sizeof(min_val_) + sizeof(max_val_) +
VirtualFileWriter::AlignedSize(sizeof(default_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
Expand Down
72 changes: 43 additions & 29 deletions src/io/dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -924,62 +924,76 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
Log::Info("Saving data to binary file %s", bin_filename);
size_t size_of_token = std::strlen(binary_file_token);
writer->Write(binary_file_token, size_of_token);
writer->AlignedWrite(binary_file_token, size_of_token);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_
+ sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
size_t size_of_header =
VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
sizeof(uint64_t) * (num_groups_ + 1) +
2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
VirtualFileWriter::AlignedSize(sizeof(bool)) * 2;

// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
size_of_header +=
VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
// size of forced bins
for (int i = 0; i < num_total_features_; ++i) {
size_of_header +=
forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
writer->Write(&size_of_header, sizeof(size_of_header));
// write header
writer->Write(&num_data_, sizeof(num_data_));
writer->Write(&num_features_, sizeof(num_features_));
writer->Write(&num_total_features_, sizeof(num_total_features_));
writer->Write(&label_idx_, sizeof(label_idx_));
writer->Write(&max_bin_, sizeof(max_bin_));
writer->Write(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_));
writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->Write(&use_missing_, sizeof(use_missing_));
writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
writer->Write(&num_groups_, sizeof(num_groups_));
writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->Write(feature2group_.data(), sizeof(int) * num_features_);
writer->Write(feature2subfeature_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->AlignedWrite(&num_features_, sizeof(num_features_));
writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
writer->AlignedWrite(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_));
writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
writer->AlignedWrite(used_feature_map_.data(),
sizeof(int) * num_total_features_);
writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2subfeature_.data(),
sizeof(int) * num_features_);
writer->Write(group_bin_boundaries_.data(),
sizeof(uint64_t) * (num_groups_ + 1));
writer->Write(group_feature_start_.data(), sizeof(int) * num_groups_);
writer->Write(group_feature_cnt_.data(), sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_start_.data(),
sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
if (max_bin_by_feature_.empty()) {
ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
}
writer->Write(max_bin_by_feature_.data(),
writer->AlignedWrite(max_bin_by_feature_.data(),
sizeof(int32_t) * num_total_features_);
if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
max_bin_by_feature_.clear();
}
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size());
writer->Write(&str_len, sizeof(int));
writer->AlignedWrite(&str_len, sizeof(int));
const char* c_str = feature_names_[i].c_str();
writer->Write(c_str, sizeof(char) * str_len);
writer->AlignedWrite(c_str, sizeof(char) * str_len);
}
// write forced bins
for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->Write(&num_bounds, sizeof(int));
writer->AlignedWrite(&num_bounds, sizeof(int));

for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
Expand Down
Loading