Adding out of memory handler, making arena bins more detailed #73

Merged · 1 commit · Nov 10, 2015
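This PR teaches CachingDeviceAllocator to recover from a cudaMalloc out-of-memory failure by freeing its entire block cache, synchronizing, and retrying the allocation once, and it switches the default arena bins from a coarse geometry to a much finer one (see the diffs below). A minimal sketch of the retry pattern, with a hypothetical free_all_cached callback standing in for the allocator's FreeAllCached():

#include <cuda_runtime.h>
#include <cstddef>

// Sketch of the new fallback in DeviceAllocate: if cudaMalloc reports
// out-of-memory, flush the cache, synchronize, and retry exactly once.
// free_all_cached is a hypothetical stand-in for FreeAllCached().
cudaError_t AllocWithRetry(void** d_ptr, std::size_t bytes,
                           cudaError_t (*free_all_cached)()) {
    cudaError_t error = cudaMalloc(d_ptr, bytes);
    if (error != cudaSuccess) {
        cudaGetLastError();                // clear the sticky error state
        cudaDeviceSynchronize();           // let in-flight work release memory
        free_all_cached();                 // hand cached blocks back to the driver
        cudaDeviceSynchronize();
        error = cudaMalloc(d_ptr, bytes);  // single retry
    }
    return error;
}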
326 changes: 168 additions & 158 deletions 3rdparty/cub/cub/util_allocator.cuh
@@ -269,9 +269,9 @@ struct CachingDeviceAllocator
unsigned int max_bin, ///< Maximum bin
size_t max_cached_bytes, ///< Maximum aggregate cached bytes per device
bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called.
bool debug = false ///< Whether or not to print (de)allocation events to stdout
)
:
bool debug = false ///< Whether or not to print (de)allocation events to stdout
)
:
spin_lock(0),
bin_growth(bin_growth),
min_bin(min_bin),
@@ -302,8 +302,8 @@ struct CachingDeviceAllocator
* sets a maximum of 6,291,455 cached bytes per device
*/
CachingDeviceAllocator(
bool skip_cleanup = false,
bool debug = false)
bool skip_cleanup = false,
bool debug = false)
:
spin_lock(0),
bin_growth(8),
@@ -368,106 +368,123 @@ struct CachingDeviceAllocator

*d_ptr = NULL;

int entrypoint_device = INVALID_DEVICE_ORDINAL;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;

do {

if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (device == INVALID_DEVICE_ORDINAL)
device = entrypoint_device;

// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}

// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}

BlockDescriptor search_key(bin_bytes, bin, device, active_stream);

// Lock
Lock(&spin_lock);

// Find the range of freed blocks big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);

// Look for freed blocks from the active stream or from other idle streams
bool found = false;
while ((block_itr != cached_blocks.end()) &&
(block_itr->device == device) &&
(block_itr->bin == search_key.bin))
{
cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
{
// Reuse existing cache block. Insert into live blocks.
found = true;
search_key = *block_itr;
search_key.associated_stream = active_stream;
live_blocks.insert(search_key);

// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device].free -= search_key.bytes;
cached_bytes[device].busy += search_key.bytes;

if (debug) CubLog("\tdevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream);

break;
}

block_itr++;
}

if (device == INVALID_DEVICE_ORDINAL) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
device = entrypoint_device;
}

// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}

if (!found)
{
Unlock(&spin_lock);
// Set to specified device
if (device != entrypoint_device) {
if (CubDebug(error = cudaSetDevice(device))) break;
}

// Allocate
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) break;


Lock(&spin_lock);
// Insert into live blocks
live_blocks.insert(search_key);
cached_bytes[device].busy += search_key.bytes;

if (debug) CubLog("\tdevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
}
// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}

// Copy device pointer to output parameter
*d_ptr = search_key.d_ptr;
if (debug) CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
(long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);

} while(0);
BlockDescriptor search_key(bin_bytes, bin, device, active_stream);

Unlock(&spin_lock);
// Lock while we search
Lock(&spin_lock);

// Attempt to revert back to previous device if necessary
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
// Find the range of freed blocks big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);

// Look for freed blocks from the active stream or from other idle streams
bool found = false;
while ( (block_itr != cached_blocks.end())
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin)) {
cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream)
|| (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
// Reuse existing cache block. Insert into live blocks.
found = true;
search_key = *block_itr;
search_key.associated_stream = active_stream;
live_blocks.insert(search_key);

// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device].free -= search_key.bytes;
cached_bytes[device].busy += search_key.bytes;

if (debug) CubLog("\tdevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream);

break;
}

block_itr++;
}
// done searching. Unlock.

Unlock(&spin_lock);

if (!found)
{

// Set to specified device. Entrypoint may not be set.
if (device != entrypoint_device) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Allocate
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);

if (error != cudaSuccess) {
if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld",
device, (long long) search_key.bytes, (long long) search_key.associated_stream);

// if (search_key.bytes < cached_bytes[device]) {
// free all cached memory (for all devices), synchronize and retry once
cudaDeviceSynchronize();
cudaThreadSynchronize();
FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
// }
}
if (CubDebug(error))
return error;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;

// Insert into live blocks
Lock(&spin_lock);
live_blocks.insert(search_key);
cached_bytes[device].busy += search_key.bytes;
Unlock(&spin_lock);

if (debug) CubLog("\tdevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

// Attempt to revert back to previous device if necessary
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
}

// Copy device pointer to output parameter
*d_ptr = search_key.d_ptr;
if (debug) CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
(long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);

return error;

@@ -515,73 +532,66 @@ struct CachingDeviceAllocator
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;

if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
if (device == INVALID_DEVICE_ORDINAL) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
device = entrypoint_device;
}

if (device == INVALID_DEVICE_ORDINAL)
device = entrypoint_device;

BlockDescriptor search_key(d_ptr, device);

do {

// Set to specified device
if (device != entrypoint_device) {
if (CubDebug(error = cudaSetDevice(device))) break;
}

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr == live_blocks.end())
{
// Cannot find pointer
if (CubDebug(error = cudaErrorUnknown)) break;
}
else
{
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);

cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
// Signal the event in the associated stream
if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) break;

// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;

if (debug) CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
else {
// This means : actually delete the block after we release the lock
d_ptr = 0;
}
}
} while (0);
bool recached = false;

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end()) {
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
recached = true;
if (debug) {
CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
}
}

Unlock(&spin_lock);

if (!d_ptr)
{
// Free device memory
if (recached) {
// Signal the event in the associated stream
if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
return error;
} else {
// Set to specified device. Entrypoint may not be set.
if (device != entrypoint_device) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Actually free device memory
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

if (debug) CubLog("\tdevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;

if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
}

return error;
@@ -654,7 +664,7 @@ struct CachingDeviceAllocator
if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free);
}

Unlock(&spin_lock);

// Attempt to revert back to entry-point device if necessary
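Beyond the retry, the rewrite narrows the spin-lock scope on both the allocate and free paths: map bookkeeping stays under the lock, while blocking driver calls (cudaMalloc, cudaFree, event creation) now run unlocked. A compilable outline of that discipline, with hypothetical stubs standing in for the real lock and map operations:

#include <cuda_runtime.h>
#include <cstddef>

volatile int spin_lock = 0;
void Lock(volatile int*) {}    // no-op stand-ins for cub's spin-lock
void Unlock(volatile int*) {}  // primitives in util_allocator.cuh
bool FindCachedBlock(void**, std::size_t) { return false; }  // stub map lookup
void RecordLiveBlock(void*, std::size_t) {}                  // stub bookkeeping

cudaError_t DeviceAllocateOutline(void** d_ptr, std::size_t bytes) {
    Lock(&spin_lock);
    bool found = FindCachedBlock(d_ptr, bytes);  // search cached blocks only
    Unlock(&spin_lock);                          // release before any driver call

    cudaError_t error = cudaSuccess;
    if (!found) {
        error = cudaMalloc(d_ptr, bytes);        // slow, runs unlocked
        if (error == cudaSuccess) {
            Lock(&spin_lock);                    // re-lock just for bookkeeping
            RecordLiveBlock(*d_ptr, bytes);
            Unlock(&spin_lock);
        }
    }
    return error;
}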
6 changes: 3 additions & 3 deletions src/caffe/util/gpu_memory.cpp
@@ -162,9 +162,9 @@ namespace caffe {
// if you are paranoid, that doesn't mean they are not after you :)
delete cubAlloc;

cubAlloc = new cub::CachingDeviceAllocator( 8, // defaults
3,
7,
cubAlloc = new cub::CachingDeviceAllocator( 2, // defaults
6,
16,
poolsize_,
false,
debug_);
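
For context on the new defaults: the constructor arguments are (bin_growth, min_bin, max_bin, max_cached_bytes, skip_cleanup, debug), and bin b caches blocks of bin_growth^b bytes. The old defaults (8, 3, 7) gave five coarse bins from 512 B to 2 MB; the new ones (2, 6, 16) give eleven finer bins from 64 B to 64 KB, which is the "more detailed" part of the title. Requests above the top bin fall into the out-of-range bin and are allocated at their exact size. A standalone check of that arithmetic (not part of the PR):

#include <cstdio>
#include <cstddef>

int main() {
    // New defaults from this PR: growth 2, bins 6 through 16.
    const unsigned growth = 2, min_bin = 6, max_bin = 16;
    for (unsigned bin = min_bin; bin <= max_bin; ++bin) {
        std::size_t bytes = 1;
        for (unsigned i = 0; i < bin; ++i) bytes *= growth;  // growth^bin
        std::printf("bin %2u -> %zu bytes\n", bin, bytes);
    }
    // Prints 64 through 65536 bytes; anything larger is allocated exactly.
    return 0;
}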