refine auto_growth allocator #35732

Merged 3 commits on Oct 11, 2021
1 change: 1 addition & 0 deletions paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -20,6 +20,7 @@ namespace paddle {
namespace memory {
namespace allocation {

// For memory address alignment
class AlignedAllocation : public Allocation {
public:
AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
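For context, the `offset` stored by `AlignedAllocation` is what shifts the exposed pointer onto an alignment boundary inside a slightly over-sized underlying allocation. Below is a minimal sketch of that pointer arithmetic; the helper name is illustrative, not Paddle API.

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative helper (not Paddle API): how many bytes must be skipped so
// that the exposed address becomes a multiple of `alignment` (assumed to be
// a power of two). The wrapper then requests `size + alignment` bytes from
// the underlying allocator so that `size` usable bytes remain after the
// offset is applied.
static std::size_t AlignOffset(const void* ptr, std::size_t alignment) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  auto aligned = (addr + alignment - 1) & ~(alignment - 1);
  return static_cast<std::size_t>(aligned - addr);
}
```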
36 changes: 35 additions & 1 deletion paddle/fluid/memory/allocation/allocator_facade.cc
@@ -23,6 +23,7 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
@@ -201,6 +202,8 @@ class AllocatorFacadePrivate {

inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
VLOG(4) << "GetAllocator"
<< " " << place << " " << size;
const auto& allocators =
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
: GetAllocatorMap())
@@ -256,8 +259,39 @@ class AllocatorFacadePrivate {
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
// NOTE: the CUDA runtime cannot survive a fork, so calling any CUDA API in
// a forked child process may return cuda error(3), i.e.,
// cudaErrorInitializationError, even though the CUDAAllocator is only being
// initialized here and not actually used yet.
// The try-catch block handles the case where GetDeviceProperties() fails in
// such multi-process settings (for example, a DataLoader with
// num_workers > 0).
try {
const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
need_addr_align = prop.textureAlignment < alignment;
VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
<< prop.textureAlignment
<< ", set need_addr_align=" << need_addr_align;
} catch (...) {
need_addr_align = true;
VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
}
// The addresses returned by cudaMalloc are already aligned, ref:
// https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
std::shared_ptr<Allocator> underlying_allocator{nullptr};
if (need_addr_align) {
VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
underlying_allocator =
std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
} else {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-    cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+    underlying_allocator, alignment, 0, allow_free_idle_chunk);
}
#endif

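To summarize the new wiring in InitAutoGrowthCUDAAllocator: cudaMalloc already guarantees some address alignment (at least `textureAlignment`), so the extra `AlignedAllocator` wrapper is only inserted when that guarantee is weaker than `GpuMinChunkSize()` or when the device properties cannot be queried at all (e.g. after a fork in a DataLoader worker). The sketch below isolates just that decision, with hypothetical stand-in types replacing the Paddle classes.

```cpp
#include <cstddef>
#include <memory>
#include <utility>

// Stand-ins for the Paddle classes; only the wrapping decision is shown.
struct Allocator {
  virtual ~Allocator() = default;
};
struct CudaAllocator : Allocator {};
struct AlignedAllocator : Allocator {
  AlignedAllocator(std::shared_ptr<Allocator> underlying, std::size_t alignment)
      : underlying_(std::move(underlying)), alignment_(alignment) {}
  std::shared_ptr<Allocator> underlying_;
  std::size_t alignment_;
};

// Decide which allocator the auto-growth allocator should sit on top of.
std::shared_ptr<Allocator> BuildUnderlying(std::size_t alignment,
                                           bool props_available,
                                           std::size_t texture_alignment) {
  auto cuda = std::make_shared<CudaAllocator>();
  // If the device properties are unknown (GetDeviceProperties failed in a
  // forked worker) or the guaranteed alignment is too small, wrap the raw
  // allocator so returned addresses are rounded up explicitly.
  bool need_addr_align = !props_available || texture_alignment < alignment;
  if (need_addr_align) {
    return std::make_shared<AlignedAllocator>(cuda, alignment);
  }
  return cuda;
}
```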
15 changes: 9 additions & 6 deletions paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -40,14 +40,14 @@ namespace allocation {
AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
size_t chunk_size, bool allow_free_idle_chunk)
-    : underlying_allocator_(
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
+    : underlying_allocator_(underlying_allocator),
alignment_(alignment),
chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
allow_free_idle_chunk_(allow_free_idle_chunk) {}

-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
-  size = AlignedSize(size, alignment_);
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
+  size_t size = AlignedSize(unaligned_size, alignment_);
VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;

std::lock_guard<SpinLock> guard(spinlock_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
free_blocks_.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
} else {
@@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
block_it = --(blocks.end());
-  VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
-          << remaining_size;
+  VLOG(2) << "Not found and reallocate " << realloc_size << "("
+          << static_cast<void *>(p) << "), and remaining " << remaining_size;
}
return new BlockAllocation(block_it);
}

void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
VLOG(10) << "Free " << allocation->size() << " bytes";
std::lock_guard<SpinLock> guard(spinlock_);
auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
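The allocation path above is a best-fit search: `free_blocks_` is ordered by (size, address), so `lower_bound({size, nullptr})` lands on the smallest free block that can hold the aligned request, and any surplus goes back to the free list while the allocation takes the tail of the block (the `p + remaining_size` placement). Below is a simplified, self-contained sketch of that lookup and split, assuming a plain `std::map` rather than Paddle's block/chunk bookkeeping.

```cpp
#include <cstddef>
#include <map>
#include <utility>

// Free list keyed by (block size, block start address); lower_bound on
// (size, nullptr) yields the smallest block that is large enough.
using FreeBlocks = std::map<std::pair<std::size_t, char*>, bool>;

// Returns the start of the allocated region, or nullptr if no free block
// fits (the real allocator would then grow a new chunk).
char* BestFitAllocate(FreeBlocks* free_blocks, std::size_t unaligned_size,
                      std::size_t alignment) {
  // Mirror AlignedSize: round the request up to a multiple of `alignment`.
  std::size_t size = (unaligned_size + alignment - 1) / alignment * alignment;
  auto it = free_blocks->lower_bound(
      std::make_pair(size, static_cast<char*>(nullptr)));
  if (it == free_blocks->end()) return nullptr;

  char* base = it->first.second;
  std::size_t remaining = it->first.first - size;
  free_blocks->erase(it);
  if (remaining > 0) {
    // The head of the block stays free; the allocation takes the tail,
    // matching the `p + remaining_size` placement in the diff above.
    free_blocks->emplace(std::make_pair(remaining, base), true);
  }
  return base + remaining;
}
```

In the real allocator the map value is an iterator into the chunk's block list, so the split also updates the chunk's blocks; the sketch keeps only the size/address bookkeeping.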
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"

#include <cstdlib>

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"

#include "gtest/gtest.h"

DECLARE_bool(free_idle_chunk);
@@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
FLAGS_free_idle_chunk = free_idle_chunk;
FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
auto recorded_allocator = std::make_shared<RecordedAllocator>();

size_t alignment = 4096;
size_t memory_size = 8192;
auto underlying_allocator =
std::make_shared<AlignedAllocator>(recorded_allocator, alignment);
auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-    recorded_allocator, alignment);
+    underlying_allocator, alignment);

for (size_t i = 0; i < 10; ++i) {
auto allocation = ag_allocator->Allocate(memory_size);
@@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {

auto underlying_allocator =
std::make_shared<LimitedResourceAllocator>(memory_capacity);
auto aligned_allocator =
std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-    underlying_allocator, alignment);
+    aligned_allocator, alignment);

ag_allocator->Allocate(allocate_size[0]);
ASSERT_EQ(underlying_allocator->AllocatedSize(),
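Taken together, the updated tests assemble the same three-layer stack the facade now builds: a raw allocator, an explicit `AlignedAllocator`, and the `AutoGrowthBestFitAllocator` on top. Below is a hedged usage sketch of that composition; the cuda_allocator.h include path and the two-argument AutoGrowthBestFitAllocator constructor (relying on defaults for chunk_size and allow_free_idle_chunk, as the tests do) should be treated as assumptions based on this diff.

```cpp
#include <memory>

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

std::shared_ptr<Allocator> BuildAutoGrowthStack(platform::CUDAPlace place,
                                                size_t alignment) {
  // Raw device allocator.
  auto cuda = std::make_shared<CUDAAllocator>(place);
  // Address alignment is now an explicit wrapper instead of being applied
  // inside AutoGrowthBestFitAllocator's constructor.
  auto aligned = std::make_shared<AlignedAllocator>(cuda, alignment);
  // Size alignment and best-fit reuse happen in the auto-growth allocator.
  return std::make_shared<AutoGrowthBestFitAllocator>(aligned, alignment);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
```

Whether a test wraps a real device allocator or a recorded/limited one depends on what it asserts; the tests in this diff use RecordedAllocator and LimitedResourceAllocator so they can count and cap the raw allocations beneath the wrapper.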