From ade6296072d2c8720178c2c223f4a0a204b6fd1e Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 20 Apr 2020 22:41:48 +0800 Subject: [PATCH 1/4] Refactor the CLI. * Enable parameter validation. * Enable JSON. * Catch `dmlc::Error`. * Show help message. --- doc/parameter.rst | 4 +- src/cli_main.cc | 510 +++++++++++++++++++++------------- src/common/config.h | 4 +- src/common/device_helpers.cuh | 1 - src/common/io.cc | 49 +++- src/common/io.h | 1 + src/common/random.h | 1 - src/objective/aft_obj.cc | 2 - src/tree/updater_gpu_hist.cu | 2 + tests/cpp/common/test_io.cc | 8 + tests/python/test_cli.py | 54 +++- 11 files changed, 430 insertions(+), 206 deletions(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 99c70dcacf7b..d6d62e7ec23f 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -30,11 +30,11 @@ General Parameters is displayed as warning message. If there's unexpected behaviour, please try to increase value of verbosity. -* ``validate_parameters`` [default to false, except for Python and R interface] +* ``validate_parameters`` [default to false, except for Python, R and CLI interface] - When set to True, XGBoost will perform validation of input parameters to check whether a parameter is used or not. The feature is still experimental. It's expected to have - some false positives, especially when used with Scikit-Learn interface. + some false positives. * ``nthread`` [default to maximum number of threads available if not set] diff --git a/src/cli_main.cc b/src/cli_main.cc index f9f3b59e40cf..45dbb18d2e91 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -4,17 +4,17 @@ * \brief The command line interface program of xgboost. * This file is not included in dynamic library. */ -// Copyright 2014 by Contributors #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE #define NOMINMAX +#include #include #include +#include #include #include -#include #include #include #include @@ -23,9 +23,9 @@ #include #include "./common/common.h" #include "./common/config.h" +#include "common/io.h" namespace xgboost { - enum CLITask { kTrain = 0, kDumpModel = 1, @@ -74,6 +74,8 @@ struct CLIParam : public XGBoostParameter { /*! \brief all the configurations */ std::vector > cfg; + static constexpr char const* const kNull = "NULL"; + // declare parameters DMLC_DECLARE_PARAMETER(CLIParam) { // NOTE: declare everything except eval_data_paths. @@ -124,15 +126,18 @@ struct CLIParam : public XGBoostParameter { } // customized configure function of CLIParam inline void Configure(const std::vector >& _cfg) { - this->cfg = _cfg; - this->UpdateAllowUnknown(_cfg); - for (const auto& kv : _cfg) { + // Don't copy the configuration to enable parameter validation. + auto unknown_cfg = this->UpdateAllowUnknown(_cfg); + this->cfg.emplace_back("validate_parameters", "True"); + for (const auto& kv : unknown_cfg) { if (!strncmp("eval[", kv.first.c_str(), 5)) { char evname[256]; CHECK_EQ(sscanf(kv.first.c_str(), "eval[%[^]]", evname), 1) << "must specify evaluation name for display"; eval_data_names.emplace_back(evname); eval_data_paths.push_back(kv.second); + } else { + this->cfg.emplace_back(kv); } } // constraint. 
@@ -145,221 +150,354 @@ struct CLIParam : public XGBoostParameter { } }; +constexpr char const* const CLIParam::kNull; + DMLC_REGISTER_PARAMETER(CLIParam); -void CLITrain(const CLIParam& param) { - const double tstart_data_load = dmlc::GetTime(); - if (rabit::IsDistributed()) { - std::string pname = rabit::GetProcessorName(); - LOG(CONSOLE) << "start " << pname << ":" << rabit::GetRank(); +std::string CliHelp() { + return "Use xgboost -h for showing help message.\n"; +} + +void CLIError(dmlc::Error const& e) { + std::cerr << "Error running xgboost:\n\n" + << e.what() << "\n" + << CliHelp() + << std::endl; +} + +class CLI { + CLIParam param_; + std::unique_ptr learner_; + bool print_help_ { false }; + + int ResetLearner(std::vector> const& matrics) { + learner_.reset(Learner::Create(matrics)); + int version = rabit::LoadCheckPoint(learner_.get()); + if (version == 0) { + if (param_.model_in != CLIParam::kNull) { + this->LoadModel(param_.model_in, learner_.get()); + learner_->SetParams(param_.cfg); + } else { + learner_->SetParams(param_.cfg); + } + } + learner_->Configure(); + return version; } - // load in data. - std::shared_ptr dtrain( - DMatrix::Load( - param.train_path, + + void CLITrain() { + const double tstart_data_load = dmlc::GetTime(); + if (rabit::IsDistributed()) { + std::string pname = rabit::GetProcessorName(); + LOG(CONSOLE) << "start " << pname << ":" << rabit::GetRank(); + } + // load in data. + std::shared_ptr dtrain(DMatrix::Load( + param_.train_path, + ConsoleLogger::GlobalVerbosity() > ConsoleLogger::DefaultVerbosity(), + param_.dsplit == 2)); + std::vector> deval; + std::vector> cache_mats; + std::vector> eval_datasets; + cache_mats.push_back(dtrain); + for (size_t i = 0; i < param_.eval_data_names.size(); ++i) { + deval.emplace_back(std::shared_ptr(DMatrix::Load( + param_.eval_data_paths[i], ConsoleLogger::GlobalVerbosity() > ConsoleLogger::DefaultVerbosity(), - param.dsplit == 2)); - std::vector > deval; - std::vector > cache_mats; - std::vector> eval_datasets; - cache_mats.push_back(dtrain); - for (size_t i = 0; i < param.eval_data_names.size(); ++i) { - deval.emplace_back( - std::shared_ptr(DMatrix::Load( - param.eval_data_paths[i], - ConsoleLogger::GlobalVerbosity() > ConsoleLogger::DefaultVerbosity(), - param.dsplit == 2))); - eval_datasets.push_back(deval.back()); - cache_mats.push_back(deval.back()); - } - std::vector eval_data_names = param.eval_data_names; - if (param.eval_train) { - eval_datasets.push_back(dtrain); - eval_data_names.emplace_back("train"); - } - // initialize the learner. - std::unique_ptr learner(Learner::Create(cache_mats)); - int version = rabit::LoadCheckPoint(learner.get()); - if (version == 0) { - // initialize the model if needed. - if (param.model_in != "NULL") { - std::unique_ptr fi( - dmlc::Stream::Create(param.model_in.c_str(), "r")); - learner->LoadModel(fi.get()); - learner->SetParams(param.cfg); - } else { - learner->SetParams(param.cfg); + param_.dsplit == 2))); + eval_datasets.push_back(deval.back()); + cache_mats.push_back(deval.back()); } - } - LOG(INFO) << "Loading data: " << dmlc::GetTime() - tstart_data_load << " sec"; + std::vector eval_data_names = param_.eval_data_names; + if (param_.eval_train) { + eval_datasets.push_back(dtrain); + eval_data_names.emplace_back("train"); + } + // initialize the learner. + int32_t version = this->ResetLearner(cache_mats); + LOG(INFO) << "Loading data: " << dmlc::GetTime() - tstart_data_load + << " sec"; - // start training. 
- const double start = dmlc::GetTime(); - for (int i = version / 2; i < param.num_round; ++i) { - double elapsed = dmlc::GetTime() - start; - if (version % 2 == 0) { - LOG(INFO) << "boosting round " << i << ", " << elapsed << " sec elapsed"; - learner->UpdateOneIter(i, dtrain); - if (learner->AllowLazyCheckPoint()) { - rabit::LazyCheckPoint(learner.get()); + // start training. + const double start = dmlc::GetTime(); + for (int i = version / 2; i < param_.num_round; ++i) { + double elapsed = dmlc::GetTime() - start; + if (version % 2 == 0) { + LOG(INFO) << "boosting round " << i << ", " << elapsed + << " sec elapsed"; + learner_->UpdateOneIter(i, dtrain); + if (learner_->AllowLazyCheckPoint()) { + rabit::LazyCheckPoint(learner_.get()); + } else { + rabit::CheckPoint(learner_.get()); + } + version += 1; + } + CHECK_EQ(version, rabit::VersionNumber()); + std::string res = learner_->EvalOneIter(i, eval_datasets, eval_data_names); + if (rabit::IsDistributed()) { + if (rabit::GetRank() == 0) { + LOG(TRACKER) << res; + } + } else { + LOG(CONSOLE) << res; + } + if (param_.save_period != 0 && (i + 1) % param_.save_period == 0 && + rabit::GetRank() == 0) { + std::ostringstream os; + os << param_.model_dir << '/' << std::setfill('0') << std::setw(4) + << i + 1 << ".model"; + this->SaveModel(os.str(), learner_.get()); + } + + if (learner_->AllowLazyCheckPoint()) { + rabit::LazyCheckPoint(learner_.get()); } else { - rabit::CheckPoint(learner.get()); + rabit::CheckPoint(learner_.get()); } version += 1; + CHECK_EQ(version, rabit::VersionNumber()); } - CHECK_EQ(version, rabit::VersionNumber()); - std::string res = learner->EvalOneIter(i, eval_datasets, eval_data_names); - if (rabit::IsDistributed()) { - if (rabit::GetRank() == 0) { - LOG(TRACKER) << res; + LOG(INFO) << "Complete Training loop time: " << dmlc::GetTime() - start + << " sec"; + // always save final round + if ((param_.save_period == 0 || + param_.num_round % param_.save_period != 0) && + param_.model_out != CLIParam::kNull && rabit::GetRank() == 0) { + std::ostringstream os; + if (param_.model_out == CLIParam::kNull) { + os << param_.model_dir << '/' << std::setfill('0') << std::setw(4) + << param_.num_round << ".model"; + } else { + os << param_.model_out; + } + this->SaveModel(os.str(), learner_.get()); + } + + double elapsed = dmlc::GetTime() - start; + LOG(INFO) << "update end, " << elapsed << " sec in all"; + } + + void CLIDumpModel() { + FeatureMap fmap; + if (param_.name_fmap != CLIParam::kNull) { + std::unique_ptr fs( + dmlc::Stream::Create(param_.name_fmap.c_str(), "r")); + dmlc::istream is(fs.get()); + fmap.LoadText(is); + } + // load model + CHECK_NE(param_.model_in, CLIParam::kNull) << "Must specify model_in for dump"; + this->ResetLearner({}); + + // dump data + std::vector dump = + learner_->DumpModel(fmap, param_.dump_stats, param_.dump_format); + std::unique_ptr fo( + dmlc::Stream::Create(param_.name_dump.c_str(), "w")); + dmlc::ostream os(fo.get()); + if (param_.dump_format == "json") { + os << "[" << std::endl; + for (size_t i = 0; i < dump.size(); ++i) { + if (i != 0) { + os << "," << std::endl; + } + os << dump[i]; // Dump the previously generated JSON here } + os << std::endl << "]" << std::endl; } else { - LOG(CONSOLE) << res; + for (size_t i = 0; i < dump.size(); ++i) { + os << "booster[" << i << "]:\n"; + os << dump[i]; + } } - if (param.save_period != 0 && - (i + 1) % param.save_period == 0 && - rabit::GetRank() == 0) { - std::ostringstream os; - os << param.model_dir << '/' - << std::setfill('0') << std::setw(4) - 
<< i + 1 << ".model"; - std::unique_ptr fo( - dmlc::Stream::Create(os.str().c_str(), "w")); - learner->SaveModel(fo.get()); + // force flush before fo destruct. + os.set_stream(nullptr); + } + + void CLIPredict() { + CHECK_NE(param_.test_path, CLIParam::kNull) + << "Test dataset parameter test:data must be specified."; + // load data + std::shared_ptr dtest(DMatrix::Load( + param_.test_path, + ConsoleLogger::GlobalVerbosity() > ConsoleLogger::DefaultVerbosity(), + param_.dsplit == 2)); + // load model + CHECK_NE(param_.model_in, CLIParam::kNull) << "Must specify model_in for predict"; + this->ResetLearner({}); + + LOG(INFO) << "Start prediction..."; + HostDeviceVector preds; + learner_->Predict(dtest, param_.pred_margin, &preds, param_.ntree_limit); + LOG(CONSOLE) << "Writing prediction to " << param_.name_pred; + + std::unique_ptr fo( + dmlc::Stream::Create(param_.name_pred.c_str(), "w")); + dmlc::ostream os(fo.get()); + for (bst_float p : preds.ConstHostVector()) { + os << std::setprecision(std::numeric_limits::max_digits10) << p + << '\n'; } + // force flush before fo destruct. + os.set_stream(nullptr); + } - if (learner->AllowLazyCheckPoint()) { - rabit::LazyCheckPoint(learner.get()); + void LoadModel(std::string const& path, Learner* learner) const { + if (common::FileExtension(path) == "json") { + auto str = common::LoadSequentialFile(path); + CHECK_GT(str.size(), 2); + CHECK_EQ(str[0], '{'); + Json in{Json::Load({str.c_str(), str.size()})}; + learner->LoadModel(in); } else { - rabit::CheckPoint(learner.get()); + std::unique_ptr fi(dmlc::Stream::Create(path.c_str(), "r")); + learner->LoadModel(fi.get()); } - version += 1; - CHECK_EQ(version, rabit::VersionNumber()); } - LOG(INFO) << "Complete Training loop time: " << dmlc::GetTime() - start << " sec"; - // always save final round - if ((param.save_period == 0 || param.num_round % param.save_period != 0) && - param.model_out != "NONE" && - rabit::GetRank() == 0) { - std::ostringstream os; - if (param.model_out == "NULL") { - os << param.model_dir << '/' - << std::setfill('0') << std::setw(4) - << param.num_round << ".model"; + + void SaveModel(std::string const& path, Learner* learner) const { + learner->Configure(); + std::unique_ptr fo(dmlc::Stream::Create(path.c_str(), "w")); + if (common::FileExtension(path) == "json") { + Json out{Object()}; + learner->SaveModel(&out); + std::string str; + Json::Dump(out, &str); + fo->Write(str.c_str(), str.size()); } else { - os << param.model_out; + learner->SaveModel(fo.get()); } - std::unique_ptr fo( - dmlc::Stream::Create(os.str().c_str(), "w")); - learner->SaveModel(fo.get()); } - double elapsed = dmlc::GetTime() - start; - LOG(INFO) << "update end, " << elapsed << " sec in all"; -} + void PrintHelp() const { + std::cout << "Usage: xgboost [ -h ] [ config file ] [ arguments ]" << std::endl; + std::stringstream ss; + ss << R"( + Options and arguments: + -h, --help + Print this message. + + arguments + Extra parameters that are not specified in config file, see below. -void CLIDumpModel(const CLIParam& param) { - FeatureMap fmap; - if (param.name_fmap != "NULL") { - std::unique_ptr fs( - dmlc::Stream::Create(param.name_fmap.c_str(), "r")); - dmlc::istream is(fs.get()); - fmap.LoadText(is); + Config file specifies the configuration for both training and testing. Each line + containing the [attribute] = [value] configuration. 
+ + General XGBoost parameters: + + https://xgboost.readthedocs.io/en/latest/parameter.html + + Command line interface specfic parameters: + +)"; + + std::string help = param_.__DOC__(); + auto splited = common::Split(help, '\n'); + for (auto str : splited) { + ss << " " << str << '\n'; + } + ss << R"( eval[NAME]: string, optional, default='NULL' + Path to evaluation data, with NAME as data name. +)"; + + ss << R"( + Example: train.conf + + booster = gbtree + objective = reg:squarederror + eta = 1.0 + gamma = 1.0 + seed = 0 + min_child_weight = 0 + max_depth = 3 + + num_round = 2 + save_period = 0 + data = "demo/data/agaricus.txt.train?format=libsvm" + eval[test] = "demo/data/agaricus.txt.test?format=libsvm" + + See demo/ directory in XGBoost for more examples. +)"; + std::cout << ss.str() << std::endl; } - // load model - CHECK_NE(param.model_in, "NULL") - << "Must specify model_in for dump"; - std::unique_ptr learner(Learner::Create({})); - std::unique_ptr fi( - dmlc::Stream::Create(param.model_in.c_str(), "r")); - learner->SetParams(param.cfg); - learner->LoadModel(fi.get()); - // dump data - std::vector dump = learner->DumpModel( - fmap, param.dump_stats, param.dump_format); - std::unique_ptr fo( - dmlc::Stream::Create(param.name_dump.c_str(), "w")); - dmlc::ostream os(fo.get()); - if (param.dump_format == "json") { - os << "[" << std::endl; - for (size_t i = 0; i < dump.size(); ++i) { - if (i != 0) os << "," << std::endl; - os << dump[i]; // Dump the previously generated JSON here + + public: + CLI(int argc, char* argv[]) { + if (argc < 2) { + this->PrintHelp(); + exit(1); } - os << std::endl << "]" << std::endl; - } else { - for (size_t i = 0; i < dump.size(); ++i) { - os << "booster[" << i << "]:\n"; - os << dump[i]; + for (int i = 0; i < argc; ++i) { + std::string str {argv[i]}; + if (str == "-h" || str == "--help") { + print_help_ = true; + break; + } + } + if (print_help_) { + return; } - } - // force flush before fo destruct. - os.set_stream(nullptr); -} -void CLIPredict(const CLIParam& param) { - CHECK_NE(param.test_path, "NULL") - << "Test dataset parameter test:data must be specified."; - // load data - std::shared_ptr dtest( - DMatrix::Load( - param.test_path, - ConsoleLogger::GlobalVerbosity() > ConsoleLogger::DefaultVerbosity(), - param.dsplit == 2)); - // load model - CHECK_NE(param.model_in, "NULL") - << "Must specify model_in for predict"; - std::unique_ptr learner(Learner::Create({})); - std::unique_ptr fi( - dmlc::Stream::Create(param.model_in.c_str(), "r")); - learner->LoadModel(fi.get()); - learner->SetParams(param.cfg); - - LOG(INFO) << "start prediction..."; - HostDeviceVector preds; - learner->Predict(dtest, param.pred_margin, &preds, param.ntree_limit); - LOG(CONSOLE) << "writing prediction to " << param.name_pred; - - std::unique_ptr fo( - dmlc::Stream::Create(param.name_pred.c_str(), "w")); - dmlc::ostream os(fo.get()); - for (bst_float p : preds.ConstHostVector()) { - os << std::setprecision(std::numeric_limits::max_digits10) - << p << '\n'; - } - // force flush before fo destruct. 
- os.set_stream(nullptr); -} + rabit::Init(argc, argv); + std::string config_path = argv[1]; + if (!common::CanReadFile(config_path)) { + std::cerr << "Failed to open config file: \"" << argv[1] << "\"\n" + << CliHelp(); + exit(1); + } -int CLIRunTask(int argc, char *argv[]) { - if (argc < 2) { - printf("Usage: \n"); - return 0; - } - rabit::Init(argc, argv); + common::ConfigParser cp(config_path); + auto cfg = cp.Parse(); + + for (int i = 2; i < argc; ++i) { + char name[256], val[256]; + if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) { + cfg.emplace_back(std::string(name), std::string(val)); + } + } - common::ConfigParser cp(argv[1]); - auto cfg = cp.Parse(); + param_.Configure(cfg); + } - for (int i = 2; i < argc; ++i) { - char name[256], val[256]; - if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) { - cfg.emplace_back(std::string(name), std::string(val)); + int Run() { + if (print_help_) { + this->PrintHelp(); + return 0; } + try { + switch (param_.task) { + case kTrain: + CLITrain(); + break; + case kDumpModel: + CLIDumpModel(); + break; + case kPredict: + CLIPredict(); + break; + } + } catch (dmlc::Error const& e) { + xgboost::CLIError(e); + return 1; + } + return 0; } - CLIParam param; - param.Configure(cfg); - switch (param.task) { - case kTrain: CLITrain(param); break; - case kDumpModel: CLIDumpModel(param); break; - case kPredict: CLIPredict(param); break; + ~CLI() { + rabit::Finalize(); } - rabit::Finalize(); - return 0; -} +}; } // namespace xgboost int main(int argc, char *argv[]) { - return xgboost::CLIRunTask(argc, argv); + try { + xgboost::CLI cli(argc, argv); + return cli.Run(); + } catch (dmlc::Error const& e) { + // This captures only the initialization error. + xgboost::CLIError(e); + return 1; + } + return 0; } diff --git a/src/common/config.h b/src/common/config.h index f10d865107cd..e48a9d9c80b4 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -7,8 +7,6 @@ #ifndef XGBOOST_COMMON_CONFIG_H_ #define XGBOOST_COMMON_CONFIG_H_ -#include -#include #include #include #include @@ -18,6 +16,8 @@ #include #include +#include "xgboost/logging.h" + namespace xgboost { namespace common { /*! diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index daa546b9d95f..942cda159bd4 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -30,7 +30,6 @@ #ifdef XGBOOST_USE_NCCL #include "nccl.h" -#include "../common/io.h" #endif #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) diff --git a/src/common/io.cc b/src/common/io.cc index 7a19acd9b528..3f1a0c82ce09 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -1,19 +1,48 @@ /*! 
- * Copyright (c) by XGBoost Contributors 2019 + * Copyright (c) by XGBoost Contributors 2019-2020 */ -#if defined(__unix__) +#define XGBOOST_IS_UNIX() \ + defined(__unix__) || defined(unix) || defined(__unix) || defined(__APPLE__) + +#if XGBOOST_IS_UNIX() #include #include #include -#endif // defined(__unix__) +#include +#endif // XGBOOST_IS_UNIX() + #include #include +#include #include #include #include "xgboost/logging.h" #include "io.h" +#ifndef S_ISDIR + +#ifdef _MSC_VER +#include +// from: https://www.linuxquestions.org/questions/programming-9/porting-to-win32-429334/ +#define XGBOOST_S_ISDIR(mode) (((mode) & _S_IFMT) == _S_IFDIR) +#endif // _MSC_VER + +#else + +#define XGBOOST_S_ISDIR(mode) S_ISDIR((mode)) +#endif // S_ISDIR + +namespace { +int Access(char const* path, int type) { +#if defined(_MSC_VER) + return _access(path, type); +#else + return access(path, type); +#endif // defined(_MSC_VER) +} +} // anonymous namespace + namespace xgboost { namespace common { @@ -108,7 +137,9 @@ std::string LoadSequentialFile(std::string fname) { }; std::string buffer; -#if defined(__unix__) + CHECK(CanReadFile(fname.c_str())) + << "Failed to read file: " << fname; +#if XGBOOST_IS_UNIX() struct stat fs; if (stat(fname.c_str(), &fs) != 0) { OpenErr(); @@ -137,10 +168,18 @@ std::string LoadSequentialFile(std::string fname) { buffer.resize(fsize + 1); fread(&buffer[0], 1, fsize, f); fclose(f); -#endif // defined(__unix__) +#endif // XGBOOST_IS_UNIX() buffer.back() = '\0'; return buffer; } +bool CanReadFile(std::string const& path) { + struct stat path_stat; + stat(path.c_str(), &path_stat); + return Access(path.c_str(), 0) == 0 && !XGBOOST_S_ISDIR(path_stat.st_mode); +} } // namespace common } // namespace xgboost + +#undef XGBOOST_S_ISDIR +#undef XGBOOST_IS_UNIX diff --git a/src/common/io.h b/src/common/io.h index 528296dc76cc..c9f6ceb0ed4d 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -87,6 +87,7 @@ inline std::string FileExtension(std::string const& fname) { } } +bool CanReadFile(std::string const& path); } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_IO_H_ diff --git a/src/common/random.h b/src/common/random.h index 0d1c08bbeec2..45af80ce030b 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -18,7 +18,6 @@ #include #include "xgboost/host_device_vector.h" -#include "io.h" namespace xgboost { namespace common { diff --git a/src/objective/aft_obj.cc b/src/objective/aft_obj.cc index 0a5df1391ab4..1357901965fb 100644 --- a/src/objective/aft_obj.cc +++ b/src/objective/aft_obj.cc @@ -115,5 +115,3 @@ XGBOOST_REGISTER_OBJECTIVE(AFTObj, "survival:aft") } // namespace obj } // namespace xgboost - - diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 67848b1e27c3..b2a501378f8a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -17,10 +17,12 @@ #include "xgboost/span.h" #include "xgboost/json.h" +#include "../common/io.h" #include "../common/device_helpers.cuh" #include "../common/hist_util.h" #include "../common/timer.h" #include "../data/ellpack_page.cuh" + #include "param.h" #include "updater_gpu_common.cuh" #include "constraints.cuh" diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 957c1748be5a..4d432704cbc4 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -39,5 +39,13 @@ TEST(IO, FixedSizeStream) { ASSERT_EQ(huge_buffer, out_buffer); } } + +TEST(IO, CanReadFile) { + // Non-exist file. 
+ ASSERT_FALSE(CanReadFile("foo")); + // Can't read directory + ASSERT_FALSE(CanReadFile("./")); + ASSERT_TRUE(CanReadFile(__FILE__)); +} } // namespace common } // namespace xgboost diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py index b3391b7b1c6a..9d5d03fe790c 100644 --- a/tests/python/test_cli.py +++ b/tests/python/test_cli.py @@ -5,6 +5,7 @@ import xgboost import subprocess import numpy +import json class TestCLI(unittest.TestCase): @@ -27,20 +28,23 @@ class TestCLI(unittest.TestCase): eval[test] = {data_path} ''' - def test_cli_model(self): - curdir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) - project_root = os.path.normpath( - os.path.join(curdir, os.path.pardir, os.path.pardir)) - data_path = "{root}/demo/data/agaricus.txt.train?format=libsvm".format( - root=project_root) + curdir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) + project_root = os.path.normpath( + os.path.join(curdir, os.path.pardir, os.path.pardir)) + def get_exe(self): if platform.system() == 'Windows': exe = 'xgboost.exe' else: exe = 'xgboost' - exe = os.path.join(project_root, exe) + exe = os.path.join(self.project_root, exe) assert os.path.exists(exe) + return exe + def test_cli_model(self): + data_path = "{root}/demo/data/agaricus.txt.train?format=libsvm".format( + root=self.project_root) + exe = self.get_exe() seed = 1994 with tempfile.TemporaryDirectory() as tmpdir: @@ -102,3 +106,39 @@ def test_cli_model(self): py_model_bin = fd.read() assert hash(cli_model_bin) == hash(py_model_bin) + + def test_cli_help(self): + exe = self.get_exe() + completed = subprocess.run([exe], stdout=subprocess.PIPE) + error_msg = completed.stdout.decode('utf-8') + ret = completed.returncode + assert ret == 1 + assert error_msg.find('Usage') != -1 + assert error_msg.find('eval[NAME]') != -1 + + def test_cli_model_json(self): + exe = self.get_exe() + data_path = "{root}/demo/data/agaricus.txt.train?format=libsvm".format( + root=self.project_root) + seed = 1994 + + with tempfile.TemporaryDirectory() as tmpdir: + model_out_cli = os.path.join( + tmpdir, 'test_load_cli_model-cli.json') + config_path = os.path.join(tmpdir, 'test_load_cli_model.conf') + + train_conf = self.template.format(data_path=data_path, + seed=seed, + task='train', + model_in='NULL', + model_out=model_out_cli, + test_path='NULL', + name_pred='NULL') + with open(config_path, 'w') as fd: + fd.write(train_conf) + + subprocess.run([exe, config_path]) + with open(model_out_cli, 'r') as fd: + model = json.load(fd) + + assert model['learner']['gradient_booster']['name'] == 'gbtree' From 9b32ae0b664f2ce1648f0b1cf48b4d42e2a424ee Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 22 Apr 2020 19:21:03 +0800 Subject: [PATCH 2/4] Just use std exception. 
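
The idea is to drop the hand-rolled CanReadFile()/access() check and let
file-system problems surface through the standard exception hierarchy
(std::ios_base::failure derives from std::exception), which the existing
CHECK/LOG(FATAL) machinery can then report. Below is a minimal, self-contained
sketch of that pattern, not the exact LoadConfigFile() code: the exceptions()
call and the ReadAll name are assumptions of this sketch only.

    #include <fstream>
    #include <iostream>
    #include <iterator>
    #include <string>

    // Read a whole config file, reporting failures as standard exceptions.
    std::string ReadAll(std::string const& path) {
      std::ifstream fin(path, std::ios_base::in | std::ios_base::binary);
      if (!fin) {
        throw std::ios_base::failure("Failed to open config file: \"" + path + "\"");
      }
      fin.exceptions(std::ifstream::badbit);  // surface hard read errors as exceptions
      return std::string{std::istreambuf_iterator<char>(fin),
                         std::istreambuf_iterator<char>()};
    }

    int main(int argc, char* argv[]) {
      try {
        std::cout << ReadAll(argc > 1 ? argv[1] : "train.conf");
      } catch (std::exception const& e) {  // also covers std::ios_base::failure
        std::cerr << "xgboost: " << e.what() << "\n";
        return 1;
      }
      return 0;
    }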
--- src/cli_main.cc | 5 ---- src/common/config.h | 12 ++++++--- src/common/io.cc | 49 ++++--------------------------------- src/common/io.h | 1 - tests/cpp/common/test_io.cc | 8 ------ 5 files changed, 14 insertions(+), 61 deletions(-) diff --git a/src/cli_main.cc b/src/cli_main.cc index 45dbb18d2e91..b01797aaecbf 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -441,11 +441,6 @@ class CLI { rabit::Init(argc, argv); std::string config_path = argv[1]; - if (!common::CanReadFile(config_path)) { - std::cerr << "Failed to open config file: \"" << argv[1] << "\"\n" - << CliHelp(); - exit(1); - } common::ConfigParser cp(config_path); auto cfg = cp.Parse(); diff --git a/src/common/config.h b/src/common/config.h index e48a9d9c80b4..0a19d851bca0 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -41,9 +41,15 @@ class ConfigParser { std::string LoadConfigFile(const std::string& path) { std::ifstream fin(path, std::ios_base::in | std::ios_base::binary); CHECK(fin) << "Failed to open: " << path; - std::string content{std::istreambuf_iterator(fin), - std::istreambuf_iterator()}; - return content; + try { + std::string content{std::istreambuf_iterator(fin), + std::istreambuf_iterator()}; + return content; + } catch (std::ios_base::failure const &e) { + LOG(FATAL) << "Failed to read: \"" << path << "\"\n" + << e.what(); + } + return ""; } /*! diff --git a/src/common/io.cc b/src/common/io.cc index 3f1a0c82ce09..7a19acd9b528 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -1,48 +1,19 @@ /*! - * Copyright (c) by XGBoost Contributors 2019-2020 + * Copyright (c) by XGBoost Contributors 2019 */ -#define XGBOOST_IS_UNIX() \ - defined(__unix__) || defined(unix) || defined(__unix) || defined(__APPLE__) - -#if XGBOOST_IS_UNIX() +#if defined(__unix__) #include #include #include -#include -#endif // XGBOOST_IS_UNIX() - +#endif // defined(__unix__) #include #include -#include #include #include #include "xgboost/logging.h" #include "io.h" -#ifndef S_ISDIR - -#ifdef _MSC_VER -#include -// from: https://www.linuxquestions.org/questions/programming-9/porting-to-win32-429334/ -#define XGBOOST_S_ISDIR(mode) (((mode) & _S_IFMT) == _S_IFDIR) -#endif // _MSC_VER - -#else - -#define XGBOOST_S_ISDIR(mode) S_ISDIR((mode)) -#endif // S_ISDIR - -namespace { -int Access(char const* path, int type) { -#if defined(_MSC_VER) - return _access(path, type); -#else - return access(path, type); -#endif // defined(_MSC_VER) -} -} // anonymous namespace - namespace xgboost { namespace common { @@ -137,9 +108,7 @@ std::string LoadSequentialFile(std::string fname) { }; std::string buffer; - CHECK(CanReadFile(fname.c_str())) - << "Failed to read file: " << fname; -#if XGBOOST_IS_UNIX() +#if defined(__unix__) struct stat fs; if (stat(fname.c_str(), &fs) != 0) { OpenErr(); @@ -168,18 +137,10 @@ std::string LoadSequentialFile(std::string fname) { buffer.resize(fsize + 1); fread(&buffer[0], 1, fsize, f); fclose(f); -#endif // XGBOOST_IS_UNIX() +#endif // defined(__unix__) buffer.back() = '\0'; return buffer; } -bool CanReadFile(std::string const& path) { - struct stat path_stat; - stat(path.c_str(), &path_stat); - return Access(path.c_str(), 0) == 0 && !XGBOOST_S_ISDIR(path_stat.st_mode); -} } // namespace common } // namespace xgboost - -#undef XGBOOST_S_ISDIR -#undef XGBOOST_IS_UNIX diff --git a/src/common/io.h b/src/common/io.h index c9f6ceb0ed4d..528296dc76cc 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -87,7 +87,6 @@ inline std::string FileExtension(std::string const& fname) { } } -bool CanReadFile(std::string 
const& path); } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_IO_H_ diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 4d432704cbc4..957c1748be5a 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -39,13 +39,5 @@ TEST(IO, FixedSizeStream) { ASSERT_EQ(huge_buffer, out_buffer); } } - -TEST(IO, CanReadFile) { - // Non-exist file. - ASSERT_FALSE(CanReadFile("foo")); - // Can't read directory - ASSERT_FALSE(CanReadFile("./")); - ASSERT_TRUE(CanReadFile(__FILE__)); -} } // namespace common } // namespace xgboost From 69d4c048c36da3d96ec6b6412ba52653144c8535 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 22 Apr 2020 19:47:40 +0800 Subject: [PATCH 3/4] Print version. --- src/cli_main.cc | 44 ++++++++++++++++++++++++++++++++-------- src/common/config.h | 4 ++-- tests/python/test_cli.py | 9 ++++++++ 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/cli_main.cc b/src/cli_main.cc index b01797aaecbf..649171e45d7e 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -21,9 +21,10 @@ #include #include #include -#include "./common/common.h" -#include "./common/config.h" +#include "common/common.h" +#include "common/config.h" #include "common/io.h" +#include "common/version.h" namespace xgboost { enum CLITask { @@ -155,7 +156,7 @@ constexpr char const* const CLIParam::kNull; DMLC_REGISTER_PARAMETER(CLIParam); std::string CliHelp() { - return "Use xgboost -h for showing help message.\n"; + return "Use xgboost -h for showing help information.\n"; } void CLIError(dmlc::Error const& e) { @@ -168,7 +169,11 @@ void CLIError(dmlc::Error const& e) { class CLI { CLIParam param_; std::unique_ptr learner_; - bool print_help_ { false }; + enum Print { + kNone, + kVersion, + kHelp + } print_info_ {kNone}; int ResetLearner(std::vector> const& matrics) { learner_.reset(Learner::Create(matrics)); @@ -371,13 +376,17 @@ class CLI { } void PrintHelp() const { - std::cout << "Usage: xgboost [ -h ] [ config file ] [ arguments ]" << std::endl; + std::cout << "Usage: xgboost [ -h ] [ -V ] [ config file ] [ arguments ]" << std::endl; std::stringstream ss; ss << R"( Options and arguments: + -h, --help Print this message. + -V, --version + Print XGBoost version. + arguments Extra parameters that are not specified in config file, see below. @@ -404,6 +413,7 @@ class CLI { ss << R"( Example: train.conf + # General parameters booster = gbtree objective = reg:squarederror eta = 1.0 @@ -412,6 +422,7 @@ class CLI { min_child_weight = 0 max_depth = 3 + # Training arguments for CLI. 
num_round = 2 save_period = 0 data = "demo/data/agaricus.txt.train?format=libsvm" @@ -422,6 +433,11 @@ class CLI { std::cout << ss.str() << std::endl; } + void PrintVersion() const { + auto ver = Version::String(Version::Self()); + std::cout << "XGBoost: " << ver << std::endl; + } + public: CLI(int argc, char* argv[]) { if (argc < 2) { @@ -431,11 +447,14 @@ class CLI { for (int i = 0; i < argc; ++i) { std::string str {argv[i]}; if (str == "-h" || str == "--help") { - print_help_ = true; + print_info_ = kHelp; + break; + } else if (str == "-V" || str == "--version") { + print_info_ = kVersion; break; } } - if (print_help_) { + if (print_info_ != kNone) { return; } @@ -456,10 +475,19 @@ class CLI { } int Run() { - if (print_help_) { + switch (this->print_info_) { + case kNone: + break; + case kVersion: { + this->PrintVersion(); + return 0; + } + case kHelp: { this->PrintHelp(); return 0; } + } + try { switch (param_.task) { case kTrain: diff --git a/src/common/config.h b/src/common/config.h index 0a19d851bca0..8070c67286d8 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -40,13 +40,13 @@ class ConfigParser { std::string LoadConfigFile(const std::string& path) { std::ifstream fin(path, std::ios_base::in | std::ios_base::binary); - CHECK(fin) << "Failed to open: " << path; + CHECK(fin) << "Failed to open config file: \"" << path << "\""; try { std::string content{std::istreambuf_iterator(fin), std::istreambuf_iterator()}; return content; } catch (std::ios_base::failure const &e) { - LOG(FATAL) << "Failed to read: \"" << path << "\"\n" + LOG(FATAL) << "Failed to read config file: \"" << path << "\"\n" << e.what(); } return ""; diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py index 9d5d03fe790c..f953840b1950 100644 --- a/tests/python/test_cli.py +++ b/tests/python/test_cli.py @@ -116,6 +116,15 @@ def test_cli_help(self): assert error_msg.find('Usage') != -1 assert error_msg.find('eval[NAME]') != -1 + completed = subprocess.run([exe, '-V'], stdout=subprocess.PIPE) + msg = completed.stdout.decode('utf-8') + assert msg.find('XGBoost') != -1 + v = xgboost.__version__ + if v.find('SNAPSHOT') != -1: + assert msg.split(':')[1].strip() == v.split('-')[0] + else: + assert msg.split(':')[1].strip() == v + def test_cli_model_json(self): exe = self.get_exe() data_path = "{root}/demo/data/agaricus.txt.train?format=libsvm".format( From b3ad1da8d7b4f383ae8f25ee671d6c3a4ab29782 Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 26 Apr 2020 09:59:46 +0800 Subject: [PATCH 4/4] Typo. --- src/cli_main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli_main.cc b/src/cli_main.cc index 649171e45d7e..5c602f374b2a 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -175,8 +175,8 @@ class CLI { kHelp } print_info_ {kNone}; - int ResetLearner(std::vector> const& matrics) { - learner_.reset(Learner::Create(matrics)); + int ResetLearner(std::vector> const &matrices) { + learner_.reset(Learner::Create(matrices)); int version = rabit::LoadCheckPoint(learner_.get()); if (version == 0) { if (param_.model_in != CLIParam::kNull) {
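
A closing note on the "Enable JSON" part of the series: CLI::SaveModel() and
CLI::LoadModel() pick the serialization format purely from the file extension,
so a model_out ending in ".json" round-trips through the JSON serializer while
any other name keeps the binary dmlc::Stream path. The standalone sketch below
illustrates only that dispatch; FileExtension() here is a simplified stand-in
for common::FileExtension(), and the file names are illustrative only.

    #include <iostream>
    #include <string>

    // Simplified stand-in for xgboost::common::FileExtension().
    std::string FileExtension(std::string const& fname) {
      auto pos = fname.rfind('.');
      return pos == std::string::npos ? std::string{} : fname.substr(pos + 1);
    }

    int main() {
      for (std::string path : {"0002.model", "xgb.json", "xgb.bin"}) {
        if (FileExtension(path) == "json") {
          std::cout << path << " -> JSON serializer (Json::Dump / Json::Load)\n";
        } else {
          std::cout << path << " -> binary Learner model I/O over dmlc::Stream\n";
        }
      }
      return 0;
    }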