Skip to content

Commit

Permalink
fix address alignment, required by cran (microsoft#3415)
Browse files Browse the repository at this point in the history
* fix dataset binary file alignment

* many fixes

* fix warnings

* fix bug

* Update file_io.cpp

* Update file_io.cpp

* simplify code

* Apply suggestions from code review

* general

* remove unneeded alignment

* Update file_io.h

* int32 to byte8 alignment

* Apply suggestions from code review

* Apply suggestions from code review
  • Loading branch information
guolinke authored and jameslamb committed Sep 30, 2020
1 parent 6c37a57 commit 3aa79b4
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 111 deletions.
16 changes: 9 additions & 7 deletions include/LightGBM/feature_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ class FeatureGroup {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_multi_val_);
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_);
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper
bin_mappers_.clear();
bin_offsets_.clear();
Expand Down Expand Up @@ -290,9 +290,9 @@ class FeatureGroup {
* \param file File want to write
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&is_multi_val_, sizeof(is_multi_val_));
writer->Write(&is_sparse_, sizeof(is_sparse_));
writer->Write(&num_feature_, sizeof(num_feature_));
writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer);
}
Expand All @@ -309,7 +309,9 @@ class FeatureGroup {
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
VirtualFileWriter::AlignedSize(sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
Expand Down
19 changes: 19 additions & 0 deletions include/LightGBM/utils/file_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstring>
#include <iostream>
#include <memory>
#include <vector>

namespace LightGBM {

Expand All @@ -31,6 +32,16 @@ struct VirtualFileWriter {
* \return Number of bytes written
*/
virtual size_t Write(const void* data, size_t bytes) const = 0;

size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) const {
auto ret = Write(data, bytes);
if (bytes % alignment != 0) {
size_t padding = AlignedSize(bytes, alignment) - bytes;
std::vector<char> tmp(padding, 0);
ret += Write(tmp.data(), padding);
}
return ret;
}
/*!
* \brief Create appropriate writer for filename
* \param filename Filename of the data
Expand All @@ -43,6 +54,14 @@ struct VirtualFileWriter {
* \return True when the file exists
*/
static bool Exists(const std::string& filename);

static size_t AlignedSize(size_t bytes, size_t alignment = 8) {
if (bytes % alignment == 0) {
return bytes;
} else {
return bytes / alignment * alignment + alignment;
}
}
};

/**
Expand Down
57 changes: 32 additions & 25 deletions src/io/bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,36 +522,37 @@ namespace LightGBM {

int BinMapper::SizeForSpecificBin(int bin) {
int size = 0;
size += sizeof(int);
size += sizeof(MissingType);
size += sizeof(bool);
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(int)));
size +=
static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(MissingType)));
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(bool)));
size += sizeof(double);
size += sizeof(BinType);
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(BinType)));
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t) * 2;
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(uint32_t))) * 2;
return size;
}

void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += sizeof(missing_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(buffer, &is_trivial_, sizeof(is_trivial_));
buffer += sizeof(is_trivial_);
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(buffer, &most_freq_bin_, sizeof(most_freq_bin_));
buffer += sizeof(most_freq_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
Expand All @@ -561,23 +562,23 @@ namespace LightGBM {

void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += sizeof(num_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += sizeof(missing_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(&is_trivial_, buffer, sizeof(is_trivial_));
buffer += sizeof(is_trivial_);
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(&most_freq_bin_, buffer, sizeof(most_freq_bin_));
buffer += sizeof(most_freq_bin_);
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
Expand All @@ -592,15 +593,15 @@ namespace LightGBM {
}

void BinMapper::SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&num_bin_, sizeof(num_bin_));
writer->Write(&missing_type_, sizeof(missing_type_));
writer->Write(&is_trivial_, sizeof(is_trivial_));
writer->AlignedWrite(&num_bin_, sizeof(num_bin_));
writer->AlignedWrite(&missing_type_, sizeof(missing_type_));
writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_));
writer->Write(&sparse_rate_, sizeof(sparse_rate_));
writer->Write(&bin_type_, sizeof(bin_type_));
writer->AlignedWrite(&bin_type_, sizeof(bin_type_));
writer->Write(&min_val_, sizeof(min_val_));
writer->Write(&max_val_, sizeof(max_val_));
writer->Write(&default_bin_, sizeof(default_bin_));
writer->Write(&most_freq_bin_, sizeof(most_freq_bin_));
writer->AlignedWrite(&default_bin_, sizeof(default_bin_));
writer->AlignedWrite(&most_freq_bin_, sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
writer->Write(bin_upper_bound_.data(), sizeof(double) * num_bin_);
} else {
Expand All @@ -609,8 +610,14 @@ namespace LightGBM {
}

size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trivial_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_) + sizeof(most_freq_bin_);
size_t ret = VirtualFileWriter::AlignedSize(sizeof(num_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(missing_type_)) +
VirtualFileWriter::AlignedSize(sizeof(is_trivial_)) +
sizeof(sparse_rate_) +
VirtualFileWriter::AlignedSize(sizeof(bin_type_)) +
sizeof(min_val_) + sizeof(max_val_) +
VirtualFileWriter::AlignedSize(sizeof(default_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
Expand Down
72 changes: 43 additions & 29 deletions src/io/dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -912,62 +912,76 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
Log::Info("Saving data to binary file %s", bin_filename);
size_t size_of_token = std::strlen(binary_file_token);
writer->Write(binary_file_token, size_of_token);
writer->AlignedWrite(binary_file_token, size_of_token);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_
+ sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
size_t size_of_header =
VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
sizeof(uint64_t) * (num_groups_ + 1) +
2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
VirtualFileWriter::AlignedSize(sizeof(bool)) * 2;

// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
size_of_header +=
VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
// size of forced bins
for (int i = 0; i < num_total_features_; ++i) {
size_of_header +=
forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
writer->Write(&size_of_header, sizeof(size_of_header));
// write header
writer->Write(&num_data_, sizeof(num_data_));
writer->Write(&num_features_, sizeof(num_features_));
writer->Write(&num_total_features_, sizeof(num_total_features_));
writer->Write(&label_idx_, sizeof(label_idx_));
writer->Write(&max_bin_, sizeof(max_bin_));
writer->Write(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_));
writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->Write(&use_missing_, sizeof(use_missing_));
writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
writer->Write(&num_groups_, sizeof(num_groups_));
writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->Write(feature2group_.data(), sizeof(int) * num_features_);
writer->Write(feature2subfeature_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->AlignedWrite(&num_features_, sizeof(num_features_));
writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
writer->AlignedWrite(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_));
writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
writer->AlignedWrite(used_feature_map_.data(),
sizeof(int) * num_total_features_);
writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2subfeature_.data(),
sizeof(int) * num_features_);
writer->Write(group_bin_boundaries_.data(),
sizeof(uint64_t) * (num_groups_ + 1));
writer->Write(group_feature_start_.data(), sizeof(int) * num_groups_);
writer->Write(group_feature_cnt_.data(), sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_start_.data(),
sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
if (max_bin_by_feature_.empty()) {
ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
}
writer->Write(max_bin_by_feature_.data(),
writer->AlignedWrite(max_bin_by_feature_.data(),
sizeof(int32_t) * num_total_features_);
if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
max_bin_by_feature_.clear();
}
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size());
writer->Write(&str_len, sizeof(int));
writer->AlignedWrite(&str_len, sizeof(int));
const char* c_str = feature_names_[i].c_str();
writer->Write(c_str, sizeof(char) * str_len);
writer->AlignedWrite(c_str, sizeof(char) * str_len);
}
// write forced bins
for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->Write(&num_bounds, sizeof(int));
writer->AlignedWrite(&num_bounds, sizeof(int));

for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
Expand Down
Loading

0 comments on commit 3aa79b4

Please sign in to comment.