Skip to content

Commit

Permalink
Ensure that LoadSequentialFile() actually read the whole file (#5831)
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 authored Jul 4, 2020
1 parent 1a08012 commit efe3e48
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 35 deletions.
4 changes: 3 additions & 1 deletion include/xgboost/json_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
#include <vector>
#include <memory>
#include <string>
#include <cinttypes>
#include <utility>
#include <map>
#include <limits>
#include <sstream>
#include <locale>
#include <cinttypes>

namespace xgboost {
/*
Expand Down Expand Up @@ -86,6 +86,8 @@ class JsonReader {
msg += "\", got: \"";
if (got == -1) {
msg += "EOF\"";
} else if (got == 0) {
msg += "\\0\"";
} else {
msg += std::to_string(got) + " \"";
}
Expand Down
45 changes: 12 additions & 33 deletions src/common/io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
#include <unistd.h>
#endif // defined(__unix__)
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <string>
#include <utility>
#include <cstdio>

#include "xgboost/logging.h"
#include "io.h"
Expand Down Expand Up @@ -108,39 +109,17 @@ std::string LoadSequentialFile(std::string fname) {
};

std::string buffer;
#if defined(__unix__)
struct stat fs;
if (stat(fname.c_str(), &fs) != 0) {
OpenErr();
}

size_t f_size_bytes = fs.st_size;
buffer.resize(f_size_bytes + 1);
int32_t fd = open(fname.c_str(), O_RDONLY);
#if defined(__linux__)
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
#endif // defined(__linux__)
ssize_t bytes_read = read(fd, &buffer[0], f_size_bytes);
if (bytes_read < 0) {
close(fd);
ReadErr();
}
close(fd);
#else // defined(__unix__)
FILE *f = fopen(fname.c_str(), "r");
if (f == NULL) {
std::string msg;
OpenErr();
}
fseek(f, 0, SEEK_END);
auto fsize = ftell(f);
fseek(f, 0, SEEK_SET);

buffer.resize(fsize + 1);
fread(&buffer[0], 1, fsize, f);
fclose(f);
#endif // defined(__unix__)
// Open in binary mode so that correct file size can be computed with seekg().
// This accommodates Windows platform:
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
std::ifstream ifs(fname, std::ios_base::binary | std::ios_base::in);
ifs.seekg(0, std::ios_base::end);
const size_t file_size = static_cast<size_t>(ifs.tellg());
ifs.seekg(0, std::ios_base::beg);
buffer.resize(file_size + 1);
ifs.read(&buffer[0], file_size);
buffer.back() = '\0';

return buffer;
}

Expand Down
1 change: 0 additions & 1 deletion src/common/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ class FixedSizeStream : public PeekableInStream {
std::string buffer_;
};

// Optimized for consecutive file loading in unix like systime.
std::string LoadSequentialFile(std::string fname);

inline std::string FileExtension(std::string const& fname) {
Expand Down
2 changes: 2 additions & 0 deletions src/common/json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,8 @@ void JsonReader::Error(std::string msg) const {
for (auto c : raw_portion) {
if (c == '\n') {
portion += "\\n";
} else if (c == '\0') {
portion += "\\0";
} else {
portion += c;
}
Expand Down

0 comments on commit efe3e48

Please sign in to comment.