[hybrid] seed and dropout op support force-cpu #35820

Merged (11 commits) on Sep 28, 2021.
3 changes: 3 additions & 0 deletions paddle/fluid/operators/dropout_impl.cu.h
@@ -205,6 +205,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor);
seed_data = static_cast<uint64_t>(seed_cpu_tensor.data<int>()[0]);
increment = offset;
} else if (seed && platform::is_cpu_place(seed->place())) {
seed_data = *(seed->data<int>());
increment = offset;
} else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) {
auto seed_offset = gen_cuda->IncrementOffset(offset);
seed_data = seed_offset.first;
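For reference, the seed-selection precedence in DropoutFwGPUKernelDriver after this change can be restated in plain Python (an illustrative sketch, not Paddle source; all names are hypothetical):

```python
# Illustrative sketch of the seed-selection order after this change.
def resolve_seed(seed_tensor, seed_on_cpu, gen_init_py, is_fix_seed,
                 default_seed, offset, gen_increment):
    if seed_tensor is not None and not seed_on_cpu:
        # GPU-resident seed tensor: the kernel must do a blocking
        # TensorCopySync to host before it can read the value.
        return int(seed_tensor[0]), offset
    if seed_tensor is not None and seed_on_cpu:
        # New branch: the seed already lives in host memory, so it is
        # dereferenced directly and the synchronous copy is skipped.
        return int(seed_tensor[0]), offset
    if gen_init_py and not is_fix_seed:
        # The global CUDA generator hands back a (seed, offset) pair.
        return gen_increment(offset)
    return default_seed, offset
```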
13 changes: 13 additions & 0 deletions paddle/fluid/operators/dropout_op.cc
@@ -42,6 +42,19 @@ class DropoutOp : public framework::OperatorWithKernel {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}

framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "Seed") {
VLOG(10) << "var_name:" << var_name
<< " does not need to transform in dropout op";
return expected_kernel_type;
}

return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};

class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
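For context: Paddle only transforms an input when the kernel type reported for that variable differs from the kernel type the op expects. Returning expected_kernel_type for "Seed" reports "no mismatch", so the framework never copies the seed tensor to the device. A toy restatement of that decision (hypothetical names, not framework code):

```python
# Toy restatement of Paddle's per-variable transform decision.
def needs_transform(var_name, tensor_kernel_type, expected_kernel_type):
    if var_name == "Seed":
        # The override above: report the expected type, i.e. "no mismatch",
        # so the Seed tensor stays wherever it was produced (e.g. CPU).
        reported = expected_kernel_type
    else:
        # Default: describe the tensor as it actually is (its own place and
        # layout), which triggers a transform whenever that differs.
        reported = tensor_kernel_type
    return reported != expected_kernel_type
```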
18 changes: 18 additions & 0 deletions paddle/fluid/operators/seed_op.cc
@@ -39,6 +39,12 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddOutput("Out", "The output of seed op.");
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddAttr<bool>("force_cpu",
"(bool, default false) Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running "
"device")
.SetDefault(false)
.AsExtra();
AddComment(R"DOC(
Seed Operator.
)DOC");

Contributor: Check whether this op is used by inference; AddCheckpoint may need to be added to guarantee inference compatibility.

Contributor Author: AddCheckpoint has been added.

Contributor: If inference does not need this attribute, shouldn't AsExtra() also be added? That's the newly issued convention.

Contributor Author: Confirmed, and AsExtra() has been added.
@@ -55,3 +61,15 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
seed, ops::CPUSeedKernel<paddle::platform::CPUDeviceContext, int>);

/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(seed)
.AddCheckpoint(
R"ROC(
Upgrade seed, add a new attribute [force_cpu])ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"force_cpu",
"If true, Force fill output variable to cpu."
"memory. Otherwise, fill output variable to the running "
"device",
false));
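The checkpoint records the schema change so that programs serialized before this PR still load cleanly: a seed op saved without force_cpu falls back to the registered default. A toy illustration of that guarantee (not Paddle's implementation):

```python
# Toy illustration: upgrading the attrs of a seed op serialized before
# this change by backfilling the default recorded in NewAttr above.
def upgrade_seed_op_attrs(attrs):
    attrs.setdefault("force_cpu", False)  # default registered above
    return attrs

assert upgrade_seed_op_attrs({"seed": 123}) == {"seed": 123, "force_cpu": False}
```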
30 changes: 22 additions & 8 deletions paddle/fluid/operators/seed_op.cu
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/seed_op.h"

namespace paddle {
@@ -20,22 +21,35 @@ namespace operators {
template <typename Place, typename T>
class GPUSeedKernel : public framework::OpKernel<T> {
public:
- void Compute(const framework::ExecutionContext& context) const override {
- auto* out = context.Output<Tensor>("Out");
- auto* out_data = out->mutable_data<T>(context.GetPlace());
+ void Compute(const framework::ExecutionContext &context) const override {
+ auto *out = context.Output<Tensor>("Out");
int user_seed = context.Attr<int>("seed");
+ auto force_cpu = context.Attr<bool>("force_cpu");
std::random_device rnd;
int seed;
if (user_seed != 0) {
seed = user_seed;
} else {
seed = rnd();
}
- auto target_gpu_place =
-     BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
- auto stream = context.cuda_device_context().stream();
- memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed,
-              sizeof(int), stream);
+
+ bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace();
+ if (cpu_place) {
+   platform::DeviceContextPool &pool =
+       platform::DeviceContextPool::Instance();
+   auto &dev_ctx = *pool.Get(context.GetPlace());
+   out->mutable_data<T>(platform::CPUPlace());
+   math::SetConstant<platform::CPUDeviceContext, T> functor;
+   functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
+           out, static_cast<T>(seed));
+ } else {
+   auto *out_data = out->mutable_data<T>(context.GetPlace());
+   auto target_gpu_place =
+       BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
+   auto stream = context.cuda_device_context().stream();
+   memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed,
+                sizeof(int), stream);
+ }
}
};

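The control flow of the rewritten kernel, restated in Python for readability (illustrative only; the real kernel writes through math::SetConstant on the host path and memory::Copy on the device path):

```python
import random

# Illustrative restatement of GPUSeedKernel::Compute (not Paddle source).
def gpu_seed_kernel(user_seed, force_cpu, running_place="gpu"):
    seed = user_seed if user_seed != 0 else random.SystemRandom().randint(0, 2**31 - 1)
    if force_cpu or running_place == "cpu":
        # force_cpu=True: the output is allocated in host memory and filled
        # with SetConstant, so CPU consumers (e.g. dropout reading its Seed
        # input) need no device synchronization.
        return ("cpu", seed)
    # Default path, unchanged: the seed is copied into device memory.
    return ("gpu", seed)
```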
1 change: 1 addition & 0 deletions paddle/fluid/operators/seed_op.h
@@ -14,6 +14,7 @@
#pragma once

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"

namespace paddle {
namespace operators {
9 changes: 7 additions & 2 deletions python/paddle/fluid/backward.py
@@ -197,13 +197,18 @@ def modify_forward_desc_for_recompute(self):
if op.desc.has_attr(op_device_attr_name):
op_device = op.desc.attr(op_device_attr_name)

+ # Setting force_cpu of the seed op to True keeps its output in CPU memory,
+ # which avoids the synchronous GPU-to-CPU copy in dropout and reduces the
+ # communication hang.
added_op = self.block._insert_op(
index=op.idx,
type='seed',
inputs={},
outputs={'Out': [added_var]},
- attrs={'seed': seed,
-        'op_device': op_device})
+ attrs={
+     'seed': seed,
+     'op_device': op_device,
+     'force_cpu': True
+ })
self.ops.insert(op_idx, added_op)
# modify dropout op desc so that it accepts a seed var as input
op.desc.set_input("Seed", [var_unique_name])

Contributor: Add a short comment explaining why this is set to True.

Contributor Author: Comment added.
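A hand-built static-graph sketch of the op that recompute now inserts (illustrative; variable names are made up, and it mirrors the unit test below):

```python
import paddle
from paddle.fluid import Program, program_guard

paddle.enable_static()
prog = Program()
with program_guard(prog):
    block = prog.global_block()
    seed_out = block.create_var(name="recompute@seed", shape=[1], dtype="int32")
    # force_cpu=True keeps the generated seed in host memory, so the dropout
    # kernel reads it directly instead of issuing a blocking GPU-to-CPU copy.
    block.append_op(
        type="seed",
        inputs={},
        outputs={"Out": seed_out},
        attrs={"seed": 0, "op_device": "", "force_cpu": True})
```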
69 changes: 69 additions & 0 deletions python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -232,6 +232,75 @@ def init_test_case(self):
self.fix_seed = False


class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
def test_seed_cpu_place(self):
paddle.enable_static()
main_program = Program()
with program_guard(main_program):
seed_input_name = "tensor@SeedInput"
x_var_name = "tensor@X"
x_out_var = "tensor@XOut"

mask_var_name = "tensor@Mask"
seed_input_var = main_program.global_block().create_var(
name=seed_input_name,
shape=[1],
dtype='int32',
persistable=False,
stop_gradient=True)
x_out_var = main_program.global_block().create_var(
name=x_out_var,
shape=[40, 40],
dtype='float32',
persistable=False,
stop_gradient=True)
x_var = main_program.global_block().create_var(
name=x_var_name,
shape=[40, 40],
dtype='float32',
persistable=False,
stop_gradient=True)
mask_var = main_program.global_block().create_var(
name=mask_var_name,
shape=[1],
dtype='int',
persistable=False,
stop_gradient=True)

main_program.global_block().append_op(
type="fill_constant",
outputs={"Out": x_var_name},
attrs={
"shape": [40, 40],
"dtype": x_var.dtype,
"value": 1.0,
"place_type": 0
})
main_program.global_block().append_op(
type='seed',
inputs={},
outputs={'Out': seed_input_var},
attrs={'seed': 1,
'force_cpu': True})
main_program.global_block().append_op(
type='dropout',
inputs={'X': x_var,
'Seed': seed_input_var},
attrs={'dropout_prob': 0.},
outputs={'Out': x_out_var,
'Mask': mask_var})
place = fluid.CPUPlace()
if core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
x_out, mask_out = exe.run(
main_program,
feed={},
fetch_list=[x_out_var.name, mask_var.name])
x_in_np = np.ones([40, 40]).astype("float32")
self.assertTrue(np.allclose(x_out, x_in_np))


class TestDropoutOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
4 changes: 2 additions & 2 deletions python/paddle/fluid/tests/unittests/test_seed_op.py
@@ -25,7 +25,7 @@ def setUp(self):
self.op_type = "seed"
self.inputs = {}
self.attrs = {"seed": 123}
- self.outputs = {"Out": np.asarray((123)).astype('int32')}
+ self.outputs = {"Out": np.asarray((123)).astype('int')}

def test_check_output(self):
self.check_output()
@@ -36,7 +36,7 @@ def setUp(self):
self.op_type = "seed"
self.inputs = {}
self.attrs = {"seed": 0}
- self.outputs = {"Out": np.asarray((123)).astype('int32')}
+ self.outputs = {"Out": np.asarray((123)).astype('int')}

def test_check_output(self):
self.check_output(no_check_set=["Out"])