Conv cudnn 3d #5783 (Merged)
Commits (12):
- d5e327a: conv cudnn 3d (typhoonzero)
- 86a0c99: update test case (typhoonzero)
- 754be4d: update (typhoonzero)
- f03915e: update (typhoonzero)
- 6cc4cb3: follow comments and remove groups from helper (typhoonzero)
- 1fac167: update (typhoonzero)
- 7f0ba5a: refine (typhoonzero)
- 946061f: Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (typhoonzero)
- 7e89c41: update (typhoonzero)
- 91a2a81: follow comments2 (typhoonzero)
- 3ae6646: update (typhoonzero)
- 419d6c8: fix compile (typhoonzero)
Changes from all commits:
@@ -56,26 +56,56 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    // cudnn 7 can support groups, no need to do it manually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif

     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);

     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }

-    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
-        output_channels / groups * output_height * output_width;
+        output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;

     // ------------------- cudnn conv workspace ---------------------
     void* cudnn_workspace = nullptr;
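For reference, the group offsets computed above are element counts, not byte counts: each per-group cuDNN call advances the data pointer by one group's slice of an NCHW or NCDHW tensor, and the depth factor degenerates to 1 in the 2-D case. A minimal standalone sketch of the same arithmetic (the `dims` values, `groups` value, and variable names here are hypothetical, chosen only for illustration):

```cpp
#include <cassert>
#include <vector>

int main() {
  // Shape in NCDHW order: batch, channels, depth, height, width.
  std::vector<int> dims = {8, 6, 4, 16, 16};
  int groups = 2;

  int channels = dims[1];
  // A 4-D (NCHW) shape is handled as depth == 1, mirroring the kernel above.
  int depth = dims.size() == 5 ? dims[2] : 1;
  int height = dims.size() == 5 ? dims[3] : dims[2];
  int width = dims.size() == 5 ? dims[4] : dims[3];

  // Each group owns channels / groups feature maps, so advancing a data
  // pointer by this many elements moves to the next group's slice.
  int group_offset = channels / groups * depth * height * width;
  assert(group_offset == 3 * 4 * 16 * 16);  // 3072 elements per group
  return 0;
}
```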
@@ -138,12 +168,26 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_grad_desc;
-    ScopedTensorDescriptor input_grad_desc;

     ScopedFilterDescriptor filter_desc;
-    ScopedFilterDescriptor filter_grad_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    // cudnn 7 can support groups, no need to do it manually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif

     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);

@@ -152,22 +196,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output_grad->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
-    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;

-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);

     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }

     int output_grad_channels = filter->dims()[0];
-    int output_grad_height = output_grad->dims()[2];
-    int output_grad_width = output_grad->dims()[3];
+    int output_grad_height, output_grad_width, output_grad_depth;
+    if (input->dims().size() == 5) {
+      output_grad_depth = output_grad->dims()[2];
+      output_grad_height = output_grad->dims()[3];
+      output_grad_width = output_grad->dims()[4];
+    } else {
+      output_grad_depth = 1;
+      output_grad_height = output_grad->dims()[2];
+      output_grad_width = output_grad->dims()[3];
+    }

-    int group_offset_in = input_channels / groups * input_height * input_width;
-    int group_offset_out =
-        output_grad_channels / groups * output_grad_height * output_grad_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out = output_grad_channels / groups * output_grad_height *
+                           output_grad_width * output_grad_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo;
@@ -180,8 +237,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {

     auto handle = ctx.cuda_device_context().cudnn_handle();
     if (input_grad) {
-      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(input_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
               handle, cudnn_filter_desc,

@@ -190,19 +245,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
               cudnn_output_grad_desc, cudnn_conv_desc,
               // dxDesc: Handle to the previously initialized output tensor
               // descriptor.
-              cudnn_input_grad_desc,
+              cudnn_input_desc,
               CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
               workspace_size_limit, &data_algo));
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
+              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }

     if (filter_grad) {
-      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(filter_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,

@@ -222,7 +275,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());

@@ -233,21 +285,20 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
             handle, &alpha, cudnn_filter_desc,
             filter_data + i * group_offset_filter, cudnn_output_grad_desc,
             output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + i * group_offset_in));
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.

       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
             cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
             cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
+            workspace_size_in_bytes, &beta, cudnn_filter_desc,
             filter_grad_data + i * group_offset_filter));
       }
     }
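On the `CUDNN_VERSION_MIN(7, 0, 0)` blocks above: cuDNN 7 added native grouped convolution, so both kernels push the group count into the convolution descriptor and then force `groups = 1`, which collapses the manual per-group loops to a single iteration (this is what the FIXME comment refers to). A hedged sketch of that pattern as a standalone helper follows; `ConfigureGroups` is a hypothetical name, while `cudnnSetConvolutionGroupCount` is the real cuDNN 7 API:

```cpp
#include <cudnn.h>

// Sketch of the version-gated group handling used in both kernels above.
// CUDNN_VERSION encodes major * 1000 + minor * 100 + patch, so cuDNN 7.0.0
// is 7000; this mirrors Paddle's CUDNN_VERSION_MIN(7, 0, 0) check.
cudnnStatus_t ConfigureGroups(cudnnConvolutionDescriptor_t conv_desc,
                              int* groups) {
#if CUDNN_VERSION >= 7000
  // cuDNN >= 7 performs grouped convolution internally.
  cudnnStatus_t status = cudnnSetConvolutionGroupCount(conv_desc, *groups);
  if (status != CUDNN_STATUS_SUCCESS) return status;
  *groups = 1;  // the caller's per-group loop now runs exactly once
#endif
  return CUDNN_STATUS_SUCCESS;
}
```

Pre-7 builds keep the explicit loop, advancing the input, output, and filter pointers by the per-group offsets computed earlier; on 7 and later, the loop body runs once over the full tensors.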
@@ -259,8 +310,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>,
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
                        paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
                        paddle::operators::CudnnConvGradOpKernel<float>,
                        paddle::operators::CudnnConvGradOpKernel<double>);
Review comment:

Do you think it's simpler to write this?
Review comment:

According to http://www.cplusplus.com/reference/vector/vector/erase/, erasing the first two elements forces all of the elements after them to be relocated, which is not efficient.
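To make the reviewer's point concrete: `std::vector::erase` on a leading range does not shrink capacity, but it does move every element after the erased range, so trimming the two leading entries of a shape vector costs a full shift of the tail. Building the trimmed vector directly from iterators copies the tail once and leaves the source untouched. A small self-contained illustration (the shape-vector framing and variable names are assumptions, since the suggestion this comment replies to is not captured above):

```cpp
#include <iostream>
#include <vector>

int main() {
  // A shape vector in NCHW order; the first two entries are batch and
  // channel, the remaining ones are the spatial dims.
  std::vector<int> dims = {8, 6, 16, 16};

  // Copy-then-erase: copies all four elements, then relocates the two
  // trailing elements to the front of the vector.
  std::vector<int> spatial_by_erase(dims);
  spatial_by_erase.erase(spatial_by_erase.begin(),
                         spatial_by_erase.begin() + 2);

  // Direct construction from iterators: copies only the two trailing
  // elements, with no shift and no mutation of dims.
  std::vector<int> spatial(dims.begin() + 2, dims.end());

  for (int d : spatial) std::cout << d << ' ';  // prints: 16 16
  std::cout << '\n';
  return 0;
}
```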