Skip to content

Commit

Permalink
[cherry pick] Fix issue #33021 setCacheCapacity could not limit memor…
Browse files Browse the repository at this point in the history
…y consumption (#33571)

* [oneDNN] First fix to #33021  (#33174)

* - First fix to #33021

* [oneDNN] Second fix to #33021 (#33471)

* use older download_data function

Co-authored-by: Jacek Czaja <jacek.czaja@intel.com>
  • Loading branch information
lidanqing-intel and jczaja committed Jun 16, 2021
1 parent e5bd7eb commit 5c68e79
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 21 deletions.
12 changes: 6 additions & 6 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,6 @@ void AnalysisPredictor::MkldnnPreSet(
platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id(
platform::MKLDNNDeviceContextThreadLocals::
kMKLDNNSessionID_CacheClearing);
platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(
config_.mkldnn_cache_capacity_);
// Set current_input_shape for caching dynamic shape.
std::stringstream ss;
for (size_t i = 0; i < inputs_shape.size(); ++i) {
Expand All @@ -355,6 +353,9 @@ void AnalysisPredictor::MkldnnPreSet(
VLOG(2) << "Set input shape=" << ss.str();
platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str(ss.str());
}
platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(
config_.mkldnn_cache_capacity_);

#endif
}

Expand All @@ -370,10 +371,9 @@ void AnalysisPredictor::MkldnnPostReset() {
CHECK_LE(shape_blob_size,
static_cast<size_t>(config_.mkldnn_cache_capacity_));
}
paddle::platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id(
platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default);
platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(0);
platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str("");
// We cannot reset to the default cache settings here,
// as the CopyToCPU method may be called afterwards and oneDNN
// primitives are used there, so the cache would grow
}
#endif
}
Expand Down
9 changes: 4 additions & 5 deletions paddle/fluid/inference/tests/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -285,11 +285,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
# densebox
set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
download_data(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc
# EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
# ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
# --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2)
inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
--infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)

# mobilenet with transpose op
set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

DEFINE_string(infer_shape, "", "data shape file");
DEFINE_int32(sample, 20, "number of sample");

namespace paddle {
namespace inference {
namespace analysis {

// One input sample: the values parsed from a line of the data file together
// with the dimensions parsed from the matching line of the shape file.
struct Record {
// Tensor values, in the order they appear on the data line.
std::vector<float> data;
// Tensor dimensions, in the order they appear on the shape line.
std::vector<int32_t> shape;
};

// Builds a Record from one line of the data file and one line of the shape
// file. Both lines are split on single spaces; data tokens are converted with
// std::stof and shape tokens with std::stoi, so a malformed token makes the
// corresponding conversion throw.
Record ProcessALine(const std::string &line, const std::string &shape_line) {
  VLOG(3) << "process a line";

  Record parsed;

  // Tensor values.
  std::vector<std::string> value_tokens;
  split(line, ' ', &value_tokens);
  for (size_t idx = 0; idx < value_tokens.size(); ++idx) {
    const float value = std::stof(value_tokens[idx]);
    parsed.data.push_back(value);
  }

  // Tensor dimensions.
  std::vector<std::string> dim_tokens;
  split(shape_line, ' ', &dim_tokens);
  for (size_t idx = 0; idx < dim_tokens.size(); ++idx) {
    const int dim = std::stoi(dim_tokens[idx]);
    parsed.shape.push_back(dim);
  }

  return parsed;
}

// Fills `cfg` for this test: the model and parameter files are taken from the
// directory given by --infer_model, GPU is disabled, input names are not
// specified explicitly and the CPU math library thread count follows
// --cpu_num_threads.
void SetConfig(AnalysisConfig *cfg) {
  AnalysisConfig &config = *cfg;

  const std::string model_file = FLAGS_infer_model + "/model";
  const std::string params_file = FLAGS_infer_model + "/params";
  config.SetModel(model_file, params_file);

  config.DisableGpu();
  // config.SwitchIrDebug();  // Enable to have graphs dumped
  config.SwitchSpecifyInputNames(false);
  config.SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &line, const std::string &shape_line) {
auto record = ProcessALine(line, shape_line);

PaddleTensor input;
input.shape = record.shape;
input.dtype = PaddleDType::FLOAT32;
size_t input_size = record.data.size() * sizeof(float);
input.data.Resize(input_size);
memcpy(input.data.data(), record.data.data(), input_size);
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}

#ifdef PADDLE_WITH_MKLDNN
// Returns the number of oneDNN objects currently cached by the device context
// that the global DeviceContextPool holds for CPUPlace.
//
// Throws std::runtime_error when that context is not an MKLDNNDeviceContext,
// instead of dereferencing the null result of the failed dynamic_cast.
int GetNumCachedObjects(void) {
  auto &pool = platform::DeviceContextPool::Instance();
  platform::CPUPlace place;
  auto *onednn_dev_ctx =
      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(place));
  if (onednn_dev_ctx == nullptr) {
    throw std::runtime_error(
        "Device context for CPUPlace is not an MKLDNNDeviceContext");
  }
  // GetCachedObjectsNumber() returns unsigned int; make the narrowing to the
  // int used by the callers explicit.
  return static_cast<int>(onednn_dev_ctx->GetCachedObjectsNumber());
}

// Runs a few samples through an MKLDNN-enabled predictor configured with the
// given cache capacity and verifies two properties of the oneDNN cache:
//   1. after the predictor is released, the number of cached objects is the
//      same as before the first run;
//   2. once `cache_capacity` samples have been executed, further runs do not
//      change the number of cached objects.
//
// cache_capacity: value passed to AnalysisConfig::SetMkldnnCacheCapacity.
//                 The second check is skipped when it is not positive.
//
// Failures are reported through PADDLE_ENFORCE_EQ.
void validate_cache_onednn(int cache_capacity = 1) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  cfg.EnableMKLDNN();
  cfg.SetMkldnnCacheCapacity(cache_capacity);

  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  std::vector<std::vector<PaddleTensor>> ref_outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;

  // Fail early with a clear message instead of feeding empty lines to the
  // parser when one of the input files is missing.
  std::ifstream file(FLAGS_infer_data);
  PADDLE_ENFORCE_EQ(file.is_open(), true,
                    platform::errors::NotFound(
                        "Cannot open input data file: %s", FLAGS_infer_data));
  std::ifstream infer_file(FLAGS_infer_shape);
  PADDLE_ENFORCE_EQ(file.is_open() && infer_file.is_open(), true,
                    platform::errors::NotFound(
                        "Cannot open input shape file: %s", FLAGS_infer_shape));
  std::vector<std::string> lines;
  std::vector<std::string> shape_lines;

  // Let's work with 4 samples
  const int num_samples = 4;
  ref_outputs.resize(num_samples);
  lines.resize(num_samples);
  shape_lines.resize(num_samples);

  // Let's remember number of cached objects before
  // execution and after every single execution
  std::vector<int> cache_filling;
  cache_filling.push_back(GetNumCachedObjects());

  // compute sequentially prediction
  for (int i = 0; i < num_samples; ++i) {
    const bool has_sample = std::getline(file, lines[i]) &&
                            std::getline(infer_file, shape_lines[i]);
    PADDLE_ENFORCE_EQ(has_sample, true,
                      platform::errors::InvalidArgument(
                          "Input data or shape file has fewer than %d lines",
                          num_samples));
    SetInput(&input_slots_all, lines[i], shape_lines[i]);
    const bool run_ok =
        predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size);
    PADDLE_ENFORCE_EQ(run_ok, true,
                      platform::errors::Fatal("Predictor failed to run"));
    // record number of cached objects
    cache_filling.push_back(GetNumCachedObjects());
  }

  file.close();
  infer_file.close();

  // Pick first output tensor from model
  // as internally reorders may be called
  // so it will impact cache size
  auto output_names = predictor->GetOutputNames();
  PADDLE_ENFORCE_EQ(output_names.empty(), false,
                    platform::errors::Fatal("Predictor has no outputs"));
  auto output_t = predictor->GetOutputTensor(output_names[0]);
  std::vector<int> output_shape = output_t->shape();
  size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                                   std::multiplies<int>());
  std::vector<float> out_data;
  out_data.resize(out_num);
  output_t->CopyToCpu(out_data.data());

  // Release predictor (relevant cache should be emptied)
  predictor.reset(nullptr);
  cache_filling.push_back(GetNumCachedObjects());

  // Compare results
  // First and last value should be equal e.g. before using cache (empty) and
  // after releasing executor
  PADDLE_ENFORCE_EQ(
      cache_filling.front(), cache_filling.back(),
      platform::errors::Fatal("Cache size before execution and after "
                              "releasing Executor do not match"));

  // Check that the cache stops growing once its capacity is reached.
  // A non-positive capacity is skipped: it would otherwise be used as a
  // negative index into cache_filling.
  if (cache_capacity > 0) {
    for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
      PADDLE_ENFORCE_EQ(
          cache_filling[cache_capacity], cache_filling[i],
          platform::errors::Fatal("Cache capacity should not increase "
                                  "after full capacity is used"));
    }
  }
}

// Validates the oneDNN cache behaviour with a cache capacity of 2.
TEST(Analyzer_detect, validate_cache_onednn) {
  const int cache_capacity = 2;
  validate_cache_onednn(cache_capacity);
}
#endif

} // namespace analysis
} // namespace inference
} // namespace paddle
31 changes: 25 additions & 6 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), p_blobmap_() {
p_blobmap_.reset(new BlobMap());
p_exec_items_.reset(new ExecMap());
p_exec_items_.reset(new ExecShape());
p_mutex_.reset(new std::mutex());
}

Expand Down Expand Up @@ -618,22 +618,40 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
if (ptr == nullptr) {
p_blobmap_->clear();
} else {
for (auto& v : (*p_exec_items_)[ptr]) {
(v.first)->erase(v.second);
// Iterate through all shapes and release
// for each shape and active executor all entries
// of this executor
for (auto& s : *p_exec_items_) {
for (auto& v : (*s.second)[ptr]) {
(v.first)->erase(v.second);
}
s.second->erase(ptr);
}
p_exec_items_->erase(ptr);
}
} else {
VLOG(3) << "Prevented Clearing DNNL cache.";
block_next_cache_clearing_ = false;
}
}

// Drops the first shape entry (in the map's iteration order) of the
// shape -> executor bookkeeping held in p_exec_items_.
//
// Does nothing when no shape has been registered, e.g. when blobs were cached
// without any entry having been linked to an executor; erasing begin() of an
// empty map would be undefined behaviour.
//
// NOTE(review): the entry removed here is whatever begin() yields for this
// map; confirm against the caller that it matches the shape being evicted
// from the blob map.
void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
  if (p_exec_items_->empty()) {
    return;
  }
  p_exec_items_->erase(p_exec_items_->begin());
}

void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
KeyBlob::iterator it) const {
// Take current input shape from TLS
// Take current executor address from TLS
// and for this executor's items add the one defined with arguments
(*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
auto key_it = p_exec_items_
->insert(std::make_pair(tls().cur_input_shape_str,
std::make_shared<ExecMap>()))
.first;
(*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));

VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
<< " curr exec size: "
<< (*key_it->second)[tls().get_curr_exec()].size() << "\n";
}

void MKLDNNDeviceContext::BlockNextCacheClearing() {
Expand Down Expand Up @@ -690,6 +708,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
VLOG(2) << "sid=" << sid
<< ", remove all blobs of shape: " << sBlob->begin()->first;
sBlob->erase(sBlob->begin()->first);
RemoveShapeEntriesWithExecutor();
}
pBlob = std::make_shared<KeyBlob>();
(*sBlob)[tls().cur_input_shape_str] = pBlob;
Expand All @@ -713,7 +732,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
return;
}

unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) {
unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
unsigned int num_entries = 0;
for (auto const& l3 : *p_blobmap_) {
for (auto const& l2 : *(l3.second)) {
Expand Down
15 changes: 11 additions & 4 deletions paddle/fluid/platform/device_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -728,8 +728,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
using ShapeBlob = umap_key_string_t<KeyBlob>;
using BlobMap = umap_value_smart_t<int, ShapeBlob>;

using ExecMap = std::unordered_map<
void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>;
// Auxiliary two-level structure (shape, executor) that makes it easier to
// clear the cache objects related to a specific executor

using ExecKey = void*;
using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
using ExecMap =
std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

explicit MKLDNNDeviceContext(CPUPlace place);

Expand All @@ -738,6 +744,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {

// Register object to currently used executor's map
void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
void RemoveShapeEntriesWithExecutor(void) const;

// Remove all entries from the blob map
void ResetBlobMap(void* ptr);
Expand All @@ -752,7 +759,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
void SetBlob(const std::string& name, std::shared_ptr<void> data) const;

// Calculate number of oneDNN objects cached
unsigned int GetCachedObjectsNumber(void);
unsigned int GetCachedObjectsNumber(void) const;

// Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const;
Expand All @@ -765,7 +772,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
std::shared_ptr<BlobMap> p_blobmap_;
// Map key is the input shape string; each value maps an executor pointer to
// the data (blob pointer and iterator in its map) needed to erase its entries
std::shared_ptr<ExecMap> p_exec_items_;
std::shared_ptr<ExecShape> p_exec_items_;
std::shared_ptr<std::mutex> p_mutex_;
bool block_next_cache_clearing_ = false;
};
Expand Down

0 comments on commit 5c68e79

Please sign in to comment.