Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conv cudnn 3d #5783

Merged
merged 12 commits into from
Nov 27, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions paddle/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()

# conv_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "conv_cudnn_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
endif()

# pool_op contains several operators
if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1)
Expand Down
41 changes: 34 additions & 7 deletions paddle/operators/conv_cudnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
namespace paddle {
namespace operators {

class CudnnConvOpMaker : public Conv2DOpMaker {
class CudnnConv2DOpMaker : public Conv2DOpMaker {
public:
CudnnConvOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
CudnnConv2DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv2DOpMaker(proto, op_checker) {
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
Expand All @@ -32,16 +32,43 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
}
};

class CudnnConv3DOpMaker : public Conv3DOpMaker {
public:
CudnnConv3DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv3DOpMaker(proto, op_checker) {
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.")
.SetDefault(4096);
}
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
ops::ConvOpGrad);
REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
conv2d_cudnn_grad, ops::ConvOpGrad);

REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
conv3d_cudnn_grad, ops::ConvOpGrad);

REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_cudnn_grad,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);

REGISTER_OP_CPU_KERNEL(conv_cudnn,
REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv_cudnn_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
conv3d_cudnn_grad,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
129 changes: 94 additions & 35 deletions paddle/operators/conv_cudnn_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,26 +56,56 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
if (input->dims().size() == 5) {
layout = DataLayout::kNCDHW;
}

cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);

#if CUDNN_VERSION_MIN(7, 0, 0)
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
cudnn_conv_desc, groups));
groups = 1;
#endif

cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()), groups);
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()), groups);
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()), groups);
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);

int input_channels = input->dims()[1];
int input_height = input->dims()[2];
int input_width = input->dims()[3];
int output_channels = output->dims()[1];
int output_height = output->dims()[2];
int output_width = output->dims()[3];
int input_height, input_width, input_depth;
if (input->dims().size() == 5) {
input_depth = input->dims()[2];
input_height = input->dims()[3];
input_width = input->dims()[4];
} else { // dim size is enforced in InferShape
input_depth = 1;
input_height = input->dims()[2];
input_width = input->dims()[3];
}
int output_channels = filter->dims()[0];
int output_height, output_width, output_depth;
if (output->dims().size() == 5) {
output_depth = output->dims()[2];
output_height = output->dims()[3];
output_width = output->dims()[4];
} else {
output_depth = 1;
output_height = output->dims()[2];
output_width = output->dims()[3];
}

int group_offset_in = input_channels / groups * input_height * input_width;
int group_offset_in =
input_channels / groups * input_height * input_width * input_depth;
int group_offset_out =
output_channels / groups * output_height * output_width;
output_channels / groups * output_height * output_width * output_depth;
int group_offset_filter = filter->numel() / groups;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it's simpler to write this ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to http://www.cplusplus.com/reference/vector/vector/erase/

Because vectors use an array as their underlying storage, erasing elements in positions other than the vector end causes the container to relocate all the elements after the segment erased to their new positions.

Erasing first two elements will cause memory re-allocation, which is not efficient.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

group is supported in cudnn7.0 .

// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
Expand Down Expand Up @@ -138,12 +168,26 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_grad_desc;
ScopedTensorDescriptor input_grad_desc;

ScopedFilterDescriptor filter_desc;
ScopedFilterDescriptor filter_grad_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
if (input->dims().size() == 5) {
layout = DataLayout::kNCDHW;
}

cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);

#if CUDNN_VERSION_MIN(7, 0, 0)
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
cudnn_conv_desc, groups));
groups = 1;
#endif

cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()), groups);
Expand All @@ -152,22 +196,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
layout, framework::vectorize2int(output_grad->dims()), groups);
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()), groups);
cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;

cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);

int input_channels = input->dims()[1];
int input_height = input->dims()[2];
int input_width = input->dims()[3];
int input_height, input_width, input_depth;
if (input->dims().size() == 5) {
input_depth = input->dims()[2];
input_height = input->dims()[3];
input_width = input->dims()[4];
} else { // dim size is enforced in InferShape
input_depth = 1;
input_height = input->dims()[2];
input_width = input->dims()[3];
}

int output_grad_channels = filter->dims()[0];
int output_grad_height = output_grad->dims()[2];
int output_grad_width = output_grad->dims()[3];
int output_grad_height, output_grad_width, output_grad_depth;
if (input->dims().size() == 5) {
output_grad_depth = output_grad->dims()[2];
output_grad_height = output_grad->dims()[3];
output_grad_width = output_grad->dims()[4];
} else {
output_grad_depth = 1;
output_grad_height = output_grad->dims()[2];
output_grad_width = output_grad->dims()[3];
}

int group_offset_in = input_channels / groups * input_height * input_width;
int group_offset_out =
output_grad_channels / groups * output_grad_height * output_grad_width;
int group_offset_in =
input_channels / groups * input_height * input_width * input_depth;
int group_offset_out = output_grad_channels / groups * output_grad_height *
output_grad_width * output_grad_depth;
int group_offset_filter = filter->numel() / groups;
// ------------------- cudnn backward algorithm ---------------------
cudnnConvolutionBwdDataAlgo_t data_algo;
Expand All @@ -180,8 +237,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {

auto handle = ctx.cuda_device_context().cudnn_handle();
if (input_grad) {
cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
layout, framework::vectorize2int(input_grad->dims()), groups);
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
Expand All @@ -190,19 +245,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
cudnn_output_grad_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_input_grad_desc,
cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}

if (filter_grad) {
cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
layout, framework::vectorize2int(filter_grad->dims()), groups);
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
Expand All @@ -222,7 +275,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
Expand All @@ -233,21 +285,20 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + i * group_offset_in));
}
}
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad.

for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_grad_data + i * group_offset_filter));
}
}
Expand All @@ -259,8 +310,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle

REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>,
REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
paddle::operators::CudnnConvOpKernel<float>,
paddle::operators::CudnnConvOpKernel<double>);
REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
paddle::operators::CudnnConvGradOpKernel<float>,
paddle::operators::CudnnConvGradOpKernel<double>);

REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
paddle::operators::CudnnConvOpKernel<float>,
paddle::operators::CudnnConvOpKernel<double>);
REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
paddle::operators::CudnnConvGradOpKernel<float>,
paddle::operators::CudnnConvGradOpKernel<double>);
5 changes: 2 additions & 3 deletions paddle/platform/cudnn_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
case DataLayout::kNCHW:
return CUDNN_TENSOR_NCHW;
case DataLayout::kNCDHW:
return CUDNN_TENSOR_NCHW; // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW
return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same
default:
PADDLE_THROW("Unknown cudnn equivalent for order");
}
Expand All @@ -143,7 +143,7 @@ class ScopedTensorDescriptor {
strides[i] = dims[i + 1] * strides[i + 1];
}
// Update tensor descriptor dims setting if groups > 1
// FIXME(typhoonzero): Assume using NCHW or NCDHW order
// NOTE: Assume using NCHW or NCDHW order
std::vector<int> dims_with_group(dims.begin(), dims.end()); // copy
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
Expand Down Expand Up @@ -186,7 +186,6 @@ class ScopedFilterDescriptor {
// width of the filter.
std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
if (groups > 1) {
// M /= groups
kernel_with_group[0] /= groups;
// NOTE: input filter(C) of the filter is already asserted to be C/groups.
}
Expand Down
14 changes: 7 additions & 7 deletions python/paddle/v2/fluid/tests/test_conv2d_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def conv2d_forward_naive(input, filter, group, conv_param):
out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
out = np.zeros((in_n, out_c, out_h, out_w))

d_bolck_w = (dilation[0] * (f_h - 1) + 1)
d_bolck_h = (dilation[1] * (f_w - 1) + 1)
d_bolck_h = (dilation[0] * (f_h - 1) + 1)
d_bolck_w = (dilation[1] * (f_w - 1) + 1)

input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
mode='constant',
Expand Down Expand Up @@ -167,27 +167,27 @@ def init_group(self):
#----------------Conv2dCudnn----------------
class TestCudnn(TestConv2dOp):
def init_op_type(self):
self.op_type = "conv_cudnn"
self.op_type = "conv2d_cudnn"


class TestCudnnWithPad(TestWithPad):
def init_op_type(self):
self.op_type = "conv_cudnn"
self.op_type = "conv2d_cudnn"


class TestCudnnWithStride(TestWithStride):
def init_op_type(self):
self.op_type = "conv_cudnn"
self.op_type = "conv2d_cudnn"


class TestCudnnWithGroup(TestWithGroup):
def init_op_type(self):
self.op_type = "conv_cudnn"
self.op_type = "conv2d_cudnn"


class TestCudnnWith1x1(TestWith1x1):
def init_op_type(self):
self.op_type = "conv_cudnn"
self.op_type = "conv2d_cudnn"


# cudnn v5 does not support dilation conv.
Expand Down
Loading