Skip to content

Commit

Permalink
Thread-safe prediction by making the prediction cache thread-local.
Browse files Browse the repository at this point in the history
Also remove the prediction cache mutex, which did not help: a race
condition still occurred whenever the cache was cleaned while entries
pointing into the cache were still alive.
  • Loading branch information
boxdot committed Jul 3, 2020
1 parent 1a08012 commit 310ac93
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 12 deletions.
1 change: 0 additions & 1 deletion include/xgboost/predictor.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ struct PredictionCacheEntry {
class PredictionContainer {
std::unordered_map<DMatrix *, PredictionCacheEntry> container_;
void ClearExpiredEntries();
std::mutex cache_lock_;

public:
PredictionContainer() = default;
Expand Down
26 changes: 16 additions & 10 deletions src/learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,12 @@ void GenericParameter::ConfigureGpuId(bool require_gpu) {
using XGBAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;

using ThreadLocalPredictionCache = dmlc::ThreadLocalStore<PredictionContainer>;

class LearnerConfiguration : public Learner {
protected:
static std::string const kEvalMetric; // NOLINT

protected:
PredictionContainer cache_;

protected:
std::atomic<bool> need_configuration_;
std::map<std::string, std::string> cfg_;
Expand All @@ -229,7 +228,8 @@ class LearnerConfiguration : public Learner {
: need_configuration_{true} {
monitor_.Init("Learner");
for (std::shared_ptr<DMatrix> const& d : cache) {
cache_.Cache(d, GenericParameter::kCpuId);
auto local_cache = ThreadLocalPredictionCache::Get();
local_cache->Cache(d, GenericParameter::kCpuId);
}
}
// Configuration before data is known.
Expand Down Expand Up @@ -491,7 +491,8 @@ class LearnerConfiguration : public Learner {
if (mparam_.num_feature == 0) {
// TODO(hcho3): Change num_feature to 64-bit integer
unsigned num_feature = 0;
for (auto& matrix : cache_.Container()) {
auto local_cache = ThreadLocalPredictionCache::Get();
for (auto& matrix : local_cache->Container()) {
CHECK(matrix.first);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first->Info().num_col_;
Expand Down Expand Up @@ -928,7 +929,8 @@ class LearnerImpl : public LearnerIO {
this->CheckDataSplitMode();
this->ValidateDMatrix(train.get());

auto& predt = this->cache_.Cache(train, generic_parameters_.gpu_id);
auto local_cache = ThreadLocalPredictionCache::Get();
auto& predt = local_cache->Cache(train, generic_parameters_.gpu_id);

monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true);
Expand All @@ -953,9 +955,10 @@ class LearnerImpl : public LearnerIO {
}
this->CheckDataSplitMode();
this->ValidateDMatrix(train.get());
this->cache_.Cache(train, generic_parameters_.gpu_id);
auto local_cache = ThreadLocalPredictionCache::Get();
local_cache->Cache(train, generic_parameters_.gpu_id);

gbm_->DoBoost(train.get(), in_gpair, &cache_.Entry(train.get()));
gbm_->DoBoost(train.get(), in_gpair, &local_cache->Entry(train.get()));
monitor_.Stop("BoostOneIter");
}

Expand All @@ -971,9 +974,11 @@ class LearnerImpl : public LearnerIO {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &generic_parameters_));
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}

auto local_cache = ThreadLocalPredictionCache::Get();
for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = this->cache_.Cache(m, generic_parameters_.gpu_id);
auto &predt = local_cache->Cache(m, generic_parameters_.gpu_id);
this->ValidateDMatrix(m.get());
this->PredictRaw(m.get(), &predt, false);

Expand Down Expand Up @@ -1010,7 +1015,8 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), &out_preds->HostVector(), ntree_limit);
} else {
auto& prediction = cache_.Cache(data, generic_parameters_.gpu_id);
auto local_cache = ThreadLocalPredictionCache::Get();
auto& prediction = local_cache->Cache(data, generic_parameters_.gpu_id);
this->PredictRaw(data.get(), &prediction, training, ntree_limit);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(generic_parameters_.gpu_id);
Expand Down
1 change: 0 additions & 1 deletion src/predictor/predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ void PredictionContainer::ClearExpiredEntries() {
}

PredictionCacheEntry &PredictionContainer::Cache(std::shared_ptr<DMatrix> m, int32_t device) {
std::lock_guard<std::mutex> guard { cache_lock_ };
this->ClearExpiredEntries();
container_[m.get()].ref = m;
if (device != GenericParameter::kCpuId) {
Expand Down

0 comments on commit 310ac93

Please sign in to comment.