From 5afbe67fa156b672c364f40e877e9636afdca19b Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 5 Jul 2020 20:51:32 +0800 Subject: [PATCH 1/3] Report error when failed to open file. --- src/common/io.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/common/io.cc b/src/common/io.cc index cbf11ae0d347..b88df47f50a2 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -100,19 +100,17 @@ std::string LoadSequentialFile(std::string fname) { msg += strerror(errno); LOG(FATAL) << msg; }; - auto ReadErr = [&fname]() { - std::string msg {"Error in reading file: "}; - msg += fname; - msg += ": "; - msg += strerror(errno); - LOG(FATAL) << msg; - }; std::string buffer; // Open in binary mode so that correct file size can be computed with seekg(). // This accommodates Windows platform: // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg std::ifstream ifs(fname, std::ios_base::binary | std::ios_base::in); + if (!ifs) { + // https://stackoverflow.com/a/17338934 + OpenErr(); + } + ifs.seekg(0, std::ios_base::end); const size_t file_size = static_cast(ifs.tellg()); ifs.seekg(0, std::ios_base::beg); From bba8ed732c31bfa45bcb9587f7af40d8568c8073 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 6 Jul 2020 04:55:34 +0800 Subject: [PATCH 2/3] Use dmlc stream when needed. --- src/common/io.cc | 66 ++++++++++++++++++++++++------------- src/common/io.h | 11 ++++++- tests/cpp/common/test_io.cc | 41 +++++++++++++++++++++++ 3 files changed, 94 insertions(+), 24 deletions(-) diff --git a/src/common/io.cc b/src/common/io.cc index b88df47f50a2..6752f892f9d9 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -93,33 +93,53 @@ void FixedSizeStream::Take(std::string* out) { *out = std::move(buffer_); } -std::string LoadSequentialFile(std::string fname) { - auto OpenErr = [&fname]() { - std::string msg; - msg = "Opening " + fname + " failed: "; - msg += strerror(errno); - LOG(FATAL) << msg; - }; +std::string LoadSequentialFile(std::string uri, bool stream) { + auto OpenErr = [&uri]() { + std::string msg; + msg = "Opening " + uri + " failed: "; + msg += strerror(errno); + LOG(FATAL) << msg; + }; - std::string buffer; - // Open in binary mode so that correct file size can be computed with seekg(). - // This accommodates Windows platform: - // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg - std::ifstream ifs(fname, std::ios_base::binary | std::ios_base::in); - if (!ifs) { - // https://stackoverflow.com/a/17338934 - OpenErr(); - } + auto parsed = dmlc::io::URI(uri.c_str()); + // Read from file. + if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) { + std::string buffer; + // Open in binary mode so that correct file size can be computed with + // seekg(). This accommodates Windows platform: + // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg + std::ifstream ifs(uri, std::ios_base::binary | std::ios_base::in); + if (!ifs) { + // https://stackoverflow.com/a/17338934 + OpenErr(); + } + + ifs.seekg(0, std::ios_base::end); + const size_t file_size = static_cast(ifs.tellg()); + ifs.seekg(0, std::ios_base::beg); + buffer.resize(file_size + 1); + ifs.read(&buffer[0], file_size); + buffer.back() = '\0'; - ifs.seekg(0, std::ios_base::end); - const size_t file_size = static_cast(ifs.tellg()); - ifs.seekg(0, std::ios_base::beg); - buffer.resize(file_size + 1); - ifs.read(&buffer[0], file_size); - buffer.back() = '\0'; + return buffer; + } + // Read from remote. + std::unique_ptr fs{dmlc::Stream::Create(uri.c_str(), "r")}; + std::string buffer; + size_t constexpr kInitialSize = 4096; + size_t size {kInitialSize}, total {0}; + while (true) { + buffer.resize(total + size); + size_t read = fs->Read(&buffer[total], size); + total += read; + if (read < size) { + break; + } + size *= 2; + } + buffer.resize(total); return buffer; } - } // namespace common } // namespace xgboost diff --git a/src/common/io.h b/src/common/io.h index 4ae3fcb02147..d9544a6d140c 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -75,7 +75,16 @@ class FixedSizeStream : public PeekableInStream { std::string buffer_; }; -std::string LoadSequentialFile(std::string fname); +/*! + * \brief Helper function for loading consecutive file to avoid dmlc Stream when possible. + * + * \param uri URI or file name to file. + * \param stream Use dmlc Stream unconditionally if set to true. Used for running test + * without remote filesystem. + * + * \return File content. + */ +std::string LoadSequentialFile(std::string uri, bool stream = false); inline std::string FileExtension(std::string const& fname) { auto splited = Split(fname, '.'); diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 957c1748be5a..192cdc670740 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -2,6 +2,11 @@ * Copyright (c) by XGBoost Contributors 2019 */ #include +#include + +#include + +#include "../helpers.h" #include "../../../src/common/io.h" namespace xgboost { @@ -39,5 +44,41 @@ TEST(IO, FixedSizeStream) { ASSERT_EQ(huge_buffer, out_buffer); } } + +TEST(IO, LoadSequentialFile) { + EXPECT_THROW(LoadSequentialFile("non-exist"), dmlc::Error); + + dmlc::TemporaryDirectory tempdir; + std::ofstream fout(tempdir.path + "test_file"); + std::string content; + + // Generate a JSON file. + size_t constexpr kRows = 1000, kCols = 100; + std::shared_ptr p_dmat{ + RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)}; + std::unique_ptr learner { Learner::Create({p_dmat}) }; + learner->SetParam("tree_method", "hist"); + learner->Configure(); + + for (int32_t iter = 0; iter < 10; ++iter) { + learner->UpdateOneIter(iter, p_dmat); + } + Json out { Object() }; + learner->SaveModel(&out); + std::string str; + Json::Dump(out, &str); + + std::string tmpfile = tempdir.path + "/model.json"; + { + std::unique_ptr fo( + dmlc::Stream::Create(tmpfile.c_str(), "w")); + fo->Write(str.c_str(), str.size()); + } + + auto loaded = LoadSequentialFile(tmpfile, true); + ASSERT_EQ(loaded, str); + + ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error); +} } // namespace common } // namespace xgboost From 2054191b50e528b1d1cc903122f4f02c7d3bf0af Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 6 Jul 2020 05:29:14 +0800 Subject: [PATCH 3/3] Linter. --- src/common/io.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/io.cc b/src/common/io.cc index 6752f892f9d9..01bc1e36b3c1 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include