From 2a5aa2eea5aa307960db5336fe154bb95b4b0fb5 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:34:04 +0800 Subject: [PATCH 01/39] [Semi-Auto] Adapt reduction rule to phi (#57241) * adapt reduction spmd rule to phi * remove useless comments --- .../spmd_rules/reduction_spmd_rule.cc | 191 ------------------ .../spmd_rules/reduction_spmd_rule.h | 46 ----- .../auto_parallel/spmd_rules/rules.h | 13 -- paddle/phi/core/attribute.h | 5 +- .../auto_parallel/inferspmd_utils.cc | 20 +- .../auto_parallel/inferspmd_utils.h | 16 ++ paddle/phi/infermeta/spmd_rules/reduction.cc | 178 ++++++++++++++++ paddle/phi/infermeta/spmd_rules/reduction.h | 35 ++++ paddle/phi/infermeta/spmd_rules/rules.h | 49 +++++ .../spmd_rules/test_reduction_rule.py | 116 ++++++++--- 10 files changed, 389 insertions(+), 280 deletions(-) delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h create mode 100644 paddle/phi/infermeta/spmd_rules/reduction.cc create mode 100644 paddle/phi/infermeta/spmd_rules/reduction.h diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc deleted file mode 100644 index 62940545e8845..0000000000000 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" -#include -#include "paddle/phi/core/distributed/auto_parallel/utils.h" - -namespace paddle { -namespace distributed { -namespace auto_parallel { - -using phi::distributed::auto_parallel::str_join; - -std::string ReductionSPMDRule::GetOutputNotation( - int64_t input_ndim, - const std::string& input_axes, - const paddle::framework::AttributeMap& attrs) { - bool keep_dim = ExtractAttr("keep_dim", attrs); - std::vector reduce_dims = - ExtractAttr>("axis", attrs); - - // convert the negative dim value to normal dim value - for (auto& reduce_dim : reduce_dims) { - if (reduce_dim < 0) { - reduce_dim = input_ndim + reduce_dim; - } - } - - std::string output_axes = ""; - for (int64_t i = 0; i < input_ndim; i++) { - std::vector::iterator iter = - std::find(reduce_dims.begin(), reduce_dims.end(), i); - if (iter != reduce_dims.end()) { - // if i is reduce dim, the corresponding input axis - // will not be appended at the end of output_axes - if (keep_dim) { - output_axes.append(1, '1'); - } - } else { - // otherwise, the corresponding input axis - // will be appended at the end of output_axes - output_axes.append(1, input_axes[i]); - } - } - - return output_axes; -} - -std::pair, std::vector> -ReductionSPMDRule::InferForward(const std::vector& input_specs, - const paddle::framework::AttributeMap& attrs) { - // step0: Verify Input Args Based on Elementwise Logic - int64_t ninputs = input_specs.size(); - PADDLE_ENFORCE_EQ( - ninputs, - 1, - phi::errors::InvalidArgument("The size of InputSpec in reduction must " - "be equal to 1, but got [%d].", - ninputs)); - VerifySpecs(input_specs, "reduction"); - - // step1: Build Einsum Notation - // get einsum notation for input - std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; - int64_t ndim = input_specs[0].shape().size(); - std::vector input_axes_vec; - std::string input_axes = alphabet.substr(0, ndim); - input_axes_vec.emplace_back(input_axes); - - // get einsum notation for output - std::string output_axes = GetOutputNotation(ndim, alphabet, attrs); - - // step2: Sharding Propogation - // step2.1: merge input shardings - std::vector>> axes_sharding_info; - axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs); - std::unordered_map axis_to_dim_map = - ShardingMergeForTensors(axes_sharding_info); - - // step2.2: infer output dimsmapping from merged input dimsmapping - std::vector output_dims_mapping = - GetDimsMappingForAxes(output_axes, axis_to_dim_map); - - // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with - // input dist_attr. - TensorDistAttr output_dist_attr = - CopyTensorDistAttrForOutput(input_specs[0].dist_attr()); - output_dist_attr.set_dims_mapping(output_dims_mapping); - - // step3: handle partial - // Step3.1 Output Partial - std::vector partial_on_dims = - ResoluteOutputPartialDimension(axis_to_dim_map, output_axes); - output_dist_attr.set_partial_status( - partial_on_dims /*, handle reduce_type in future */); - - std::vector output_dist_attrs; - output_dist_attrs.emplace_back(output_dist_attr); - - // Step3.2 handle input tensor partial (TODO) - // If the op is a linear op, i.e. `linearity` is true, it supports - // the input to be partial. Otherwise, the input cannot be partial - // on reduced axes, we should reshard the input when the reduced - // axes are parital. 
- VLOG(4) << "ReductionSPMDRule InferForward: "; - for (int64_t i = 0; i < ninputs; i++) { - VLOG(4) << "Input" << std::to_string(i) << " shape: [" - << str_join(input_specs[i].shape()) << "] " - << "src_dims_mapping: [" << str_join(input_specs[i].dims_mapping()) - << "] " - << "dst_dims_mapping: [" << str_join(input_specs[i].dims_mapping()) - << "]"; - } - VLOG(4) << "Output dims_mapping: [" + str_join(output_dims_mapping) + "] " - << "partial_on_dims: [" + str_join(partial_on_dims) + "]\n\n"; - - return {{input_specs[0].dist_attr()}, output_dist_attrs}; -} - -std::pair, std::vector> -ReductionSPMDRule::InferBackward( - const std::vector& input_specs, - const std::vector& output_specs, - const paddle::framework::AttributeMap& attrs) { - // step0: Verify Input Args Based on Elementwise Logic - int64_t ninputs = input_specs.size(); - int64_t noutputs = output_specs.size(); - PADDLE_ENFORCE_EQ( - ninputs, - 1, - phi::errors::InvalidArgument("The size of InputSpec in reduction must " - "be equal to 1, but got [%d].", - ninputs)); - PADDLE_ENFORCE_EQ( - noutputs, - 1, - phi::errors::InvalidArgument("The size of OutputSpec in reduction must " - "be equal to 1, but got [%d].", - ninputs)); - VerifySpecs(output_specs, "reduction_backward"); - - // step1: Build Einsum Notation - // get einsum notation for input - std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; - int64_t ndim = input_specs[0].shape().size(); - std::string input_axes = alphabet.substr(0, ndim); - - // get einsum notation for output - std::string output_axes = GetOutputNotation(ndim, alphabet, attrs); - - // step2: Sharding Propogation - std::unordered_map axis_to_dim_map = - ShardingMergeForTensors({{output_axes, output_specs[0].dims_mapping()}}); - - // step2.2: infer input dims mapping from output dims mapping - std::vector input_dims_mapping = - GetDimsMappingForAxes(input_axes, axis_to_dim_map, true); - - // initialize input dist_attr's process_mesh, batch_dim and dynamic dims with - // input dist_attr. - TensorDistAttr input_dist_attr(input_specs[0].dist_attr()); - input_dist_attr.set_dims_mapping(input_dims_mapping); - - // step3: handle partial (TODO) - - VLOG(4) << "ReductionSPMDRule InferBackward: "; - VLOG(4) << "Output shape:[" << str_join(output_specs[0].shape()) - << "] dims_mapping: [" << str_join(output_specs[0].dims_mapping()) - << "]"; - VLOG(4) << "Input0: " - << " shape: [" << str_join(input_specs[0].shape()) << "] " - << "dims_mapping: [" << str_join(input_dist_attr.dims_mapping()) - << "]"; - - return {{input_dist_attr}, {output_specs[0].dist_attr()}}; -} - -} // namespace auto_parallel -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h deleted file mode 100644 index 36e412b704927..0000000000000 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h" - -namespace paddle { -namespace distributed { -namespace auto_parallel { - -class ReductionSPMDRule : public SPMDRuleBase { - public: - std::pair, std::vector> - InferForward(const std::vector& input_specs, - const paddle::framework::AttributeMap& attrs) override; - - std::pair, std::vector> - InferBackward(const std::vector& input_specs, - const std::vector& output_specs, - const paddle::framework::AttributeMap& attrs) override; - - private: - std::string GetOutputNotation(int64_t input_ndim, - const std::string& input_axes, - const paddle::framework::AttributeMap& attrs); -}; -} // namespace auto_parallel -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index c876fa59a7034..54ae4325b8a15 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -18,7 +18,6 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h" -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" @@ -30,18 +29,6 @@ namespace paddle { namespace distributed { namespace auto_parallel { -// reduction rules -REGISTER_SPMD_RULE(all, ReductionSPMDRule); -REGISTER_SPMD_RULE(amax, ReductionSPMDRule); -REGISTER_SPMD_RULE(amin, ReductionSPMDRule); -REGISTER_SPMD_RULE(any, ReductionSPMDRule); -REGISTER_SPMD_RULE(frobenius_norm, ReductionSPMDRule); -REGISTER_SPMD_RULE(max, ReductionSPMDRule); -REGISTER_SPMD_RULE(mean, ReductionSPMDRule); -REGISTER_SPMD_RULE(min, ReductionSPMDRule); -REGISTER_SPMD_RULE(prod, ReductionSPMDRule); -REGISTER_SPMD_RULE(sum, ReductionSPMDRule); - // layer_norm rule REGISTER_SPMD_RULE(layer_norm, LayerNormSPMDRule); diff --git a/paddle/phi/core/attribute.h b/paddle/phi/core/attribute.h index 40c66a669c9e8..6f032f4a5bd99 100644 --- a/paddle/phi/core/attribute.h +++ b/paddle/phi/core/attribute.h @@ -30,14 +30,17 @@ namespace phi { class Place; // NOTE: Add needed type in the future +// Move vector before vector, because when +// vector is before vector, a python integer +// list will be converted to vector in error. 
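// (Editorial sketch, not part of the original patch; the attribute name
// "axis" below is only an illustrative assumption.) With std::vector<int>
// listed ahead of std::vector<int64_t> in the variant, a Python integer
// list such as axis=[1, 2] would be stored as std::vector<int>, and a
// consumer asking for std::vector<int64_t> would then hit a
// bad_variant_access; listing std::vector<int64_t> first makes the 64-bit
// form the default binding for Python integer lists.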
using Attribute = paddle::variant, std::vector, + std::vector, std::vector, std::vector, std::vector, diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc index a1895b6dfbd79..6e0c0f696fef4 100644 --- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc @@ -54,7 +54,7 @@ AttrType InferSpmdContext::AttrAt(size_t idx) const { } template <> -bool InferSpmdContext::AttrAt(size_t idx) const { +bool InferSpmdContext::AttrAt(size_t idx) const { try { auto attr = attrs_.at(idx); if (attr.type() == typeid(int)) { @@ -70,6 +70,24 @@ bool InferSpmdContext::AttrAt(size_t idx) const { } } +template <> +std::vector InferSpmdContext::AttrAt(size_t idx) const { + try { + auto attr = attrs_.at(idx); + if (attr.type() == typeid(std::vector)) { + std::vector val = PADDLE_GET_CONST(std::vector, attr); + return std::vector(val.begin(), val.end()); + } else { + return paddle::get>(attr); + } + } catch (paddle::bad_variant_access const& e) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attribute cast error in InferSpmd Context, the input attr type is " + "`%s`, but the expected attribute type is `bool`.", + attrs_.at(idx).type().name())); + } +} + const Attribute& InferSpmdContext::AttrAt(size_t idx) const { return attrs_.at(idx); } diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h index 3896bfcd6a2fe..23b147a4bb3d7 100644 --- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h @@ -138,8 +138,24 @@ struct InferSpmdFnImpl { } \ } +#define PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(attr_type) \ + template \ + struct InferSpmdFnCallHelper { \ + template \ + static SpmdInfo Call(const InferSpmdContext& ctx, \ + PreviousArgs&... pargs) { \ + attr_type arg = ctx.AttrAt(attr_idx); \ + return InferSpmdFnCallHelper::template Call( \ + ctx, pargs..., arg); \ + } \ + } + // TODO(chenweihang): support other attr type later as needed PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(std::vector); + PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); /* End case */ template diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc new file mode 100644 index 0000000000000..24c90a1792341 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/reduction.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/spmd_rules/reduction.h" + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" + +namespace phi { +namespace distributed { + +using phi::distributed::auto_parallel::str_join; + +////////////////// Utils Functions ////////////////// +std::string GetOutputNotation(int input_ndim, + const std::string& input_axes, + std::vector reduce_dims, + bool keep_dim) { + // convert the negative dim value to normal dim value + for (auto& reduce_dim : reduce_dims) { + if (reduce_dim < 0) { + reduce_dim = input_ndim + reduce_dim; + } + } + + std::string output_axes = ""; + for (int i = 0; i < input_ndim; i++) { + std::vector::iterator iter = + std::find(reduce_dims.begin(), reduce_dims.end(), i); + if (iter != reduce_dims.end()) { + // if i is reduce dim, the corresponding input axis + // will not be appended at the end of output_axes + if (keep_dim) { + output_axes.append(1, '1'); + } + } else { + // otherwise, the corresponding input axis + // will be appended at the end of output_axes + output_axes.append(1, input_axes[i]); + } + } + + return output_axes; +} + +SpmdInfo ReductionInferSpmd(const DistMetaTensor& x, + const std::vector& axis, + bool keep_dim) { + // Step0: Verify input args based on reduction logic + auto x_shape = phi::vectorize(x.dims()); + int x_ndim = x_shape.size(); + auto x_dist_attr_src = x.dist_attr(); + std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + x_ndim, + x_dims_mapping.size(), + phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's " + "dims_mapping size [%d] are not matched.", + x_ndim, + x_dims_mapping.size())); + + // Step1: Build Einsum Notation + // get einsum notation for input + std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; + std::string x_axes = alphabet.substr(0, x_ndim); + + // get einsum notation for output + std::string out_axes = GetOutputNotation(x_ndim, alphabet, axis, keep_dim); + + // Step2: Sharding Propogation + // Step2.1: Merge input shardings + std::pair> x_sharding_info(x_axes, + x_dims_mapping); + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors({x_sharding_info}); + + // Step2.2: Infer output dimsmapping from merged input dimsmapping + std::vector out_dims_mapping = + GetDimsMappingForAxes(out_axes, axis_to_dim_map); + + // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with + // input dist_attr. + TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src); + out_dist_attr.set_dims_mapping(out_dims_mapping); + + // Step3: handle partial + // Step3.1 Output Partial + std::vector partial_on_dims = + ResoluteOutputPartialDimension(axis_to_dim_map, out_axes); + out_dist_attr.set_partial_status( + partial_on_dims /*, handle reduce_type in future */); + + // Step3.2 handle input tensor partial (TODO) + // If the op is a linear op, i.e. `linearity` is true, it supports + // the input to be partial. Otherwise, the input cannot be partial + // on reduced axes, we should reshard the input when the reduced + // axes are parital. 
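// (Editorial sketch with hypothetical shapes, mirroring the unit test in
// this patch.) For x of shape [64, 32] with dims_mapping [0, -1] on a 1-D
// mesh, x_axes = "ab". Reducing axis = {0} with keep_dim = false gives
// out_axes = "b" and an output dims_mapping of [-1], while mesh dim 0
// (which shards the reduced axis 'a') is recorded as a partial dim of the
// output; with keep_dim = true the notation becomes "1b" and the output
// dims_mapping is [-1, -1].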
+ VLOG(4) << "ReductionInferSpmd:"; + VLOG(4) << "axis: " << str_join(axis) << ", keep_dim: " << keep_dim; + VLOG(4) << "Einsum Notation: " << x_axes << " --> " << out_axes; + VLOG(4) << "Input0 shape: [" << str_join(x_shape) << "] " + << "dims_mapping: [" << str_join(x_dims_mapping) << "]"; + VLOG(4) << "Output dims_mapping: [" + str_join(out_dims_mapping) + "] " + << "partial_on_dims: [" + str_join(partial_on_dims) + "]\n\n"; + + return {{x_dist_attr_src}, {out_dist_attr}}; +} + +SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + const std::vector& axis, + bool keep_dim) { + // Step0: Verify input args based on reduction logic + auto x_shape = phi::vectorize(x.dims()); + auto out_shape = phi::vectorize(out.dims()); + int x_ndim = x_shape.size(); + int out_ndim = out_shape.size(); + auto out_dist_attr_src = out.dist_attr(); + std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + out_ndim, + out_dims_mapping.size(), + phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's " + "dims_mapping size [%d] are not matched.", + out_ndim, + out_dims_mapping.size())); + + // Step1: Build einsum notation + // get einsum notation for input + std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; + std::string x_axes = alphabet.substr(0, x_ndim); + + // get einsum notation for output + std::string out_axes = GetOutputNotation(x_ndim, alphabet, axis, keep_dim); + + // Step2: Sharding propogation + // Step2.1: Merge input shardings + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors({{out_axes, out_dims_mapping}}); + + // Step2.2: Infer input dims mapping from output dims mapping + std::vector x_dims_mapping = + GetDimsMappingForAxes(x_axes, axis_to_dim_map, true); + + // initialize input dist_attr's process_mesh, batch_dim and dynamic dims with + // input dist_attr. + TensorDistAttr x_dist_attr_dst(x.dist_attr()); + x_dist_attr_dst.set_dims_mapping(x_dims_mapping); + + // Step3: handle partial (TODO) + + VLOG(4) << "ReductionInferSpmdReverse: "; + VLOG(4) << "Output shape:[" << str_join(out_shape) << "] dims_mapping: [" + << str_join(out_dims_mapping) << "]"; + VLOG(4) << "Input0: " + << "shape: [" << str_join(x_shape) << "] " + << "dims_mapping: [" << str_join(x_dims_mapping) << "]\n\n"; + + return {{x_dist_attr_dst}, {out_dist_attr_src}}; +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/reduction.h b/paddle/phi/infermeta/spmd_rules/reduction.h new file mode 100644 index 0000000000000..ed9341ddc6904 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/reduction.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { + +SpmdInfo ReductionInferSpmd(const DistMetaTensor& x, + const std::vector& axis, + bool keep_dim); + +SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + const std::vector& axis, + bool keep_dim); + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index 4406e17495d14..71a726e3d8edc 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h" #include "paddle/phi/infermeta/spmd_rules/elementwise.h" #include "paddle/phi/infermeta/spmd_rules/matmul.h" +#include "paddle/phi/infermeta/spmd_rules/reduction.h" #include "paddle/phi/infermeta/spmd_rules/replicated.h" /** @@ -46,6 +47,16 @@ PD_REGISTER_SPMD_RULE(matmul, PD_INFER_SPMD(phi::distributed::MatmulInferSpmd), PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + elementwise_unary, + PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmd), + PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmdReverse)); + +PD_REGISTER_SPMD_RULE( + elementwise_binary, + PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd), + PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse)); + // default data parallel rule PD_REGISTER_SPMD_RULE( unsqueeze, @@ -408,5 +419,43 @@ PD_REGISTER_SPMD_RULE( // TODO(pkuzyc): add multiary elementwise rule +// reduction rule +PD_REGISTER_SPMD_RULE( + all, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + amax, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + amin, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + any, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + frobenius_norm, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + max, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + min, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + prod, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + sum, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); + } // namespace distributed } // namespace phi diff --git a/test/auto_parallel/spmd_rules/test_reduction_rule.py b/test/auto_parallel/spmd_rules/test_reduction_rule.py index f8069ee226583..ea8398d246fcc 100644 --- a/test/auto_parallel/spmd_rules/test_reduction_rule.py +++ b/test/auto_parallel/spmd_rules/test_reduction_rule.py @@ -13,13 +13,14 @@ # limitations under the License. 
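# Editorial note (summarizing the change made by this patch, not original
# test code): the rule is now obtained via core.get_phi_spmd_rule("max")
# instead of get_spmd_rule("max"), and the attributes are passed
# positionally, e.g.
#   rule.infer_forward(x_spec, axis, keep_dim)
#   rule.infer_backward(x_spec, out_spec, axis, keep_dim)
# replacing the old dict-based call infer_forward([x_spec], attrs).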
import unittest +from collections import OrderedDict -from paddle.distributed.auto_parallel.static.completion import get_spmd_rule from paddle.distributed.auto_parallel.static.dist_attribute import ( DistTensorSpec, TensorDistAttr, ) from paddle.distributed.fleet import auto +from paddle.framework import core class TestReductionSPMDRule(unittest.TestCase): @@ -28,7 +29,7 @@ class TestReductionSPMDRule(unittest.TestCase): """ def setUp(self): - self.rule = get_spmd_rule("max") + self.rule = core.get_phi_spmd_rule("max") x_shape = [64, 32] process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) @@ -40,11 +41,7 @@ def setUp(self): self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) - self.attrs = { - 'keep_dim': False, - 'axis': [0], - 'linearity': False, - } + self.attrs = OrderedDict([('axis', [0]), ('keep_dim', False)]) def test_single_mesh_dim(self): # reduce on dim 0, keep_dim = false @@ -53,7 +50,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -73,7 +70,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -89,7 +86,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -104,7 +101,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -119,7 +116,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0, 1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -135,7 +132,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0, 1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -156,7 +153,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] 
infered_output_dist_attrs = result_dist_attrs[1] @@ -167,6 +164,7 @@ def test_multi_mesh_dim(self): self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), False) # reduce on dim 1, 2, keep_dim = false # [-1, 0, 1] --> [-1, 0, 1], [-1], partial_on_dim:[0, 1] @@ -174,7 +172,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -192,7 +190,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -207,7 +205,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -225,7 +223,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -243,7 +241,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [32] self.out_dist_tensor_spec.set_dims_mapping([-1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -262,7 +263,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [1, 32] self.out_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -277,7 +281,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [64] self.out_dist_tensor_spec.set_dims_mapping([0]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -292,7 +299,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [64, 1] self.out_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = 
self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -307,7 +317,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [] self.out_dist_tensor_spec.set_dims_mapping([]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -322,7 +335,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [1, 1] self.out_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -343,7 +359,10 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96] self.out_dist_tensor_spec.set_dims_mapping([0]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -362,7 +381,10 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96] self.out_dist_tensor_spec.set_dims_mapping([-1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -377,7 +399,10 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96] self.out_dist_tensor_spec.set_dims_mapping([1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -392,13 +417,48 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96, 1, 1] self.out_dist_tensor_spec.set_dims_mapping([0, -1, -1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + + def test_backward_multi_mesh_dim_parital(self): + # reduction on dim 1, 2, keep_dim = true, partial_dim=[1] + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1] (output --> input, output) + # output parital_dim: [1], input parital_dim: [] + out_shape = [96, 1, 1] + process_mesh = 
auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + + self.x_dist_tensor_spec.set_process_mesh(process_mesh) + self.x_dist_tensor_spec.shape = [96, 24, 48] + out_tensor_dist_attr = TensorDistAttr() + out_tensor_dist_attr.dims_mapping = [0, -1, -1] + out_tensor_dist_attr.process_mesh = process_mesh + out_tensor_dist_attr._set_partial_dims([1]) + self.out_dist_tensor_spec = DistTensorSpec( + out_shape, out_tensor_dist_attr + ) + + self.attrs['keep_dim'] = True + self.attrs['axis'] = [1, 2] + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_input_dist_attrs[0]._is_partial(), False) if __name__ == "__main__": From 4920462600b26c1050c29ac1caafc3fac72362ba Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 20 Sep 2023 19:48:41 +0800 Subject: [PATCH 02/39] [Dy2St]Modify jit.load into Lazy Initialization Mode for backward program (#57240) * [Dy2St]Modify jit.load into Lazy Initialization Mode for backward program * fix is_test * fix typo * fix logic * fix build scope logic --- .../eager/to_static/run_program_op_func.h | 7 ++- .../eager/to_static/run_program_op_node.h | 45 +++++++++++-------- paddle/fluid/framework/executor_cache.cc | 23 +++++----- paddle/fluid/framework/executor_cache.h | 2 +- python/paddle/jit/translated_layer.py | 22 ++++----- 5 files changed, 55 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index f0ca7c1518b24..a3bb3a2879300 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -140,8 +140,11 @@ inline void run_program_ad_func( RunProgramAPI( x_tmp, params_tmp, out, step_scope, dout, require_any_grad, attrs); VLOG(2) << "start run run_program grad"; - - if (require_any_grad) { + auto is_test = false; + if (attrs.count("is_test")) { + is_test = PADDLE_GET_CONST(bool, attrs.at("is_test")); + } + if (!is_test && require_any_grad) { auto x_names = PADDLE_GET_CONST(std::vector, attrs.at("x_names")); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ebab84ccd1521..fd0d6563945a5 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -315,14 +315,16 @@ static void ShareTensorsFromScopeByValue( static void ShareTensorsFromScopeWithPartialBlock( const std::vector &tensors, const paddle::framework::BlockDesc &forward_global_block, - const paddle::framework::BlockDesc &backward_global_block, + const paddle::framework::BlockDesc *backward_global_block, paddle::framework::Scope *scope) { for (size_t i = 0; i < tensors.size(); ++i) { auto &name = tensors[i]->name(); + bool in_forward_block = forward_global_block.HasVar(name); + bool in_backward_block = + backward_global_block && backward_global_block->HasVar(name); if (name == paddle::framework::kEmptyVarName || name == paddle::framework::kFakeVarName || - (!forward_global_block.HasVar(name) && - !backward_global_block.HasVar(name))) { + (!in_forward_block && !in_backward_block)) { VLOG(2) << "find tensor name is " << name << ", skip it!"; continue; } @@ -660,10 +662,16 
@@ inline void RunProgramAPI( auto *forward_global_block = PADDLE_GET_CONST( paddle::framework::BlockDesc *, attrs.at("forward_global_block")); - auto *backward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc *, attrs.at("backward_global_block")); auto *forward_program = forward_global_block->Program(); - auto *backward_program = backward_global_block->Program(); + + paddle::framework::BlockDesc *backward_global_block = nullptr; + paddle::framework::ProgramDesc *backward_program = nullptr; + + if (!is_test) { + backward_global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *, + attrs.at("backward_global_block")); + backward_program = backward_global_block->Program(); + } auto &interpretercore_info_cache = paddle::framework::InterpreterCoreInfoCache::Instance(); @@ -710,9 +718,12 @@ inline void RunProgramAPI( global_inner_scope); } // Step 3. get all eager gc vars - std::set skip_eager_delete_vars = - paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( - *backward_program); + std::set skip_eager_delete_vars; + if (!is_test) { + skip_eager_delete_vars = + paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( + *backward_program); + } // all out_vars are skip_eager_var skip_eager_delete_vars.insert(output_names.begin(), output_names.end()); @@ -765,19 +776,15 @@ inline void RunProgramAPI( 1); interpreter_core->Run({}); } - + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); { paddle::platform::RecordEvent record_event( "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); // Get Output details::ShareTensorsFromScopeWithPartialBlock( - out, *forward_global_block, *backward_global_block, global_inner_scope); - details::ShareTensorsFromScopeWithPartialBlock(dout, - *forward_global_block, - *backward_global_block, - global_inner_scope); - - VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + out, *forward_global_block, backward_global_block, global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock( + dout, *forward_global_block, backward_global_block, global_inner_scope); if (is_test || !require_any_grad) { VLOG(4) << "don't require any grad, set this scope can reused"; @@ -939,11 +946,11 @@ inline void RunProgramGradAPI( // Step 4. 
get outputs details::ShareTensorsFromScopeWithPartialBlock(x_grad, *forward_global_block, - *backward_global_block, + backward_global_block, global_inner_scope); details::ShareTensorsFromScopeWithPartialBlock(params_grad, *forward_global_block, - *backward_global_block, + backward_global_block, global_inner_scope); VLOG(4) << "after backward gc all vars"; global_inner_scope->SetCanReused(true); diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 1044f785451e0..64d5ce24d20fe 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -356,7 +356,7 @@ std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc *forward_global_block, const paddle::framework::BlockDesc *backward_global_block, - const std::vector output_names, + const std::vector &output_names, const std::vector &x, const std::vector &x_names, const std::vector ¶ms, @@ -415,19 +415,21 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram( } std::set set_parameter_names; - for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) { - for (const auto &n : op_desc->Inputs()) { - const auto &input_var_names = n.second; - for (const auto &var_name : input_var_names) { - set_parameter_names.insert(var_name); - } - } - } - for (auto &t : output_names) { set_parameter_names.insert(t); } + if (backward_global_block != nullptr) { + for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) { + for (const auto &n : op_desc->Inputs()) { + const auto &input_var_names = n.second; + for (const auto &var_name : input_var_names) { + set_parameter_names.insert(var_name); + } + } + } + } + for (auto &name : set_parameter_names) { if (!set_output_names.count(name)) { continue; @@ -443,7 +445,6 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram( op_desc->SetInput("x", {name}); op_desc->SetOutput("out", {"@EMPTY@"}); } - paddle::translator::ProgramTranslator program_translator(&local_program, program.get()); diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index f55808175f09f..d30ed6396e65e 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -253,7 +253,7 @@ std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc* forward_global_block, const paddle::framework::BlockDesc* backward_global_block, - const std::vector output_names, + const std::vector& output_names, const std::vector& x, const std::vector& x_names, const std::vector& params, diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index a7f51c1a8c164..fce3211f23878 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -347,15 +347,11 @@ def __init__(self, program_desc): self._suffix_varname_dict = None # forward program self._infer_program_desc = self._preprocess(program_desc) - # forward + backward program - self._train_program_desc = self._append_backward_desc( - self._infer_program_desc - ) # forward: @switch_to_static_graph def _create_forward_train_program(self): - whole_program = _build_program_by_desc(self._train_program_desc) + whole_program = _build_program_by_desc(self.train_program) end_op_index = self._infer_program_desc.block(0).op_size() if end_op_index > 0: return 
add_build_strategy_for(whole_program, 0, end_op_index) @@ -369,7 +365,7 @@ def _forward_program_desc(self): # backward @switch_to_static_graph def _create_backward_train_program(self): - whole_program = _build_program_by_desc(self._train_program_desc) + whole_program = _build_program_by_desc(self.train_program) start_op_index = self._infer_program_desc.block(0).op_size() + len( self._output_descs ) @@ -389,9 +385,9 @@ def _backward_program_desc(self): def infer_program(self): return self._infer_program_desc - @property + @LazyInitialized def train_program(self): - return self._train_program_desc + return self._append_backward_desc(self._infer_program_desc) @property def forward_program(self): @@ -1010,10 +1006,15 @@ def _run_dygraph(instance, input, program_holder): ( 'forward_global_block', forward_program.block(0), - 'backward_global_block', - program_holder.backward_program.block(0), ) ) + if not instance._is_test: + attrs.extend( + ( + 'backward_global_block', + program_holder.backward_program.block(0), + ) + ) _legacy_C_ops.run_program( _valid_vars(input_vars), @@ -1055,7 +1056,6 @@ def _run_static_graph(input, program_holder, trace_program): trace_program, exclude=param_var_names ) trace_program.flush() - output_names = [var.name() for var in program_holder.output_descs] # append blocks from 'trace_program' _append_block( main_program, From 70fe4b4961ce72adcf0a90532cd159e112feac58 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:50:45 +0800 Subject: [PATCH 03/39] [gpups ci] (#52962) * gpups information * Update gpups_test.sh * modify gpups,test=document_fix --- tools/gpups_test.sh | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index 86be766397652..31ad58a86456e 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -13,6 +13,20 @@ # limitations under the License. +function collect_failed_tests() { + for file in `ls $tmp_dir`; do + exit_code=0 + grep -q 'The following tests FAILED:' $tmp_dir/$file||exit_code=$? + if [ $exit_code -ne 0 ]; then + failuretest='' + else + failuretest=`grep -A 10000 'The following tests FAILED:' $tmp_dir/$file | sed 's/The following tests FAILED://g'|sed '/^$/d'` + failed_test_lists="${failed_test_lists} + ${failuretest}" + fi + done +} + serial_list="^test_conv2d_op$|\ ^test_conv2d_transpose_op$|\ ^test_conv3d_op$" @@ -48,7 +62,6 @@ parallel_list="^init_phi_test$|\ ^test_dygraph_sharding_stage2_bf16$|\ ^test_executor_feed_non_tensor$|\ ^test_flash_attention$|\ -^test_flash_attention_deterministic$|\ ^test_fused_adam_op$|\ ^test_fused_attention_no_dropout$|\ ^test_fused_attention_op$|\ @@ -93,16 +106,24 @@ parallel_list="^init_phi_test$|\ ^test_top_k_v2_op$" cd ${work_dir}/build - +tmp_dir=`mktemp -d` +tmpfile_rand=`date +%s%N` +tmpfile=$tmp_dir/$tmpfile_rand"_"$i set +e -ctest --output-on-failure -R "($parallel_list)" --timeout 120 -j4 +ctest --output-on-failure -R "($parallel_list)" --timeout 120 -j4 | tee -a $tmpfile; test ${PIPESTATUS[0]} -eq 0; EXIT_CODE_1=$? -ctest --output-on-failure -R "($serial_list)" --timeout 120 -j1 +ctest --output-on-failure -R "($serial_list)" --timeout 120 -j1 | tee -a $tmpfile; test ${PIPESTATUS[0]} -eq 0; EXIT_CODE_2=$? set -e if [ "${EXIT_CODE_1}" != "0" ] || [ "${EXIT_CODE_2}" != "0" ];then echo "Sorry, some tests failed." + collect_failed_tests + rm -f $tmp_dir/* + echo "Summary Failed Tests... 
" + echo "========================================" + echo "The following tests FAILED: " + echo "${failuretest}" | sort -u exit 8 fi From 0cb7a2812829263dc5bab3597b7bd07127e81bd6 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 20 Sep 2023 20:13:02 +0800 Subject: [PATCH 04/39] correct default_dtype for ones, zeros, linspace, logspace, eye, full (#57487) --- python/paddle/tensor/creation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index d6cad4b8eca34..c3e814cc906d4 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -303,7 +303,7 @@ def linspace(start, stop, num, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() tensor_num = num tensor_start = start tensor_stop = stop @@ -434,7 +434,7 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): [1.] """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() tensor_num = num tensor_start = start tensor_stop = stop @@ -1010,7 +1010,7 @@ def ones(shape, dtype=None, name=None): [1. 1.]] """ if dtype is None: - dtype = core.VarDesc.VarType.FP32 + dtype = paddle.get_default_dtype() return fill_constant(value=1.0, shape=shape, dtype=dtype, name=name) @@ -1094,7 +1094,7 @@ def zeros(shape, dtype=None, name=None): [0. 0.]] """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() return fill_constant(value=0.0, shape=shape, dtype=dtype, name=name) @@ -1176,8 +1176,8 @@ def _check_attr(attr, message): _check_attr(num_rows, "num_rows") if dtype is None: - dtype = core.VarDesc.VarType.FP32 - elif not isinstance(dtype, core.VarDesc.VarType): + dtype = paddle.get_default_dtype() + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if num_columns is not None: _check_attr(num_columns, "num_columns") @@ -1270,7 +1270,7 @@ def full(shape, fill_value, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() return fill_constant(shape=shape, dtype=dtype, value=fill_value, name=name) From be7ae2c74d19fc0ea0c1e205478389b98c537595 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 20 Sep 2023 21:13:00 +0800 Subject: [PATCH 05/39] Try to fix performance drop. 
(#57525) --- paddle/phi/kernels/gpu/flip_kernel.cu | 7 ++++++- paddle/phi/kernels/gpu/index_put_grad_kernel.cu | 12 ++++++++++-- paddle/phi/kernels/gpu/index_put_kernel.cu | 6 +++++- paddle/phi/kernels/gpu/roll_kernel_impl.h | 6 +++++- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index f271eba26e0ab..71fdbcaaa68bb 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -40,7 +40,12 @@ __global__ void FlipCudaKernel(const T* in_data, int64_t cur_indices = idx; int64_t rem = 0; int64_t dst_offset = 0; - for (int i = 0; i < rank; ++i) { + +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } int64_t temp = cur_indices; cur_indices = cur_indices / stride[i]; rem = temp - cur_indices * stride[i]; diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 7e584e5c10318..915c7f40fa2cb 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -40,7 +40,11 @@ __global__ void SetZeroCudaKernel(int64_t** indices, int64_t cur_ix = 0; int64_t offset = 0; - for (int i = 0; i < rank; ++i) { +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; @@ -69,7 +73,11 @@ __global__ void IndexPutGradCudaKernel( int64_t cur_ix = 0; int64_t offset = 0; - for (int i = 0; i < rank; ++i) { +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index ccbd19aaba681..3af220ce16b31 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -41,7 +41,11 @@ __global__ void IndexPutCudaKernel(const T* x, return; } int64_t offset = 0; - for (int i = 0; i < rank; ++i) { +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index 38e2a6ff669ad..c7ffcb2d5ca52 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -40,7 +40,11 @@ __global__ void RollCudaKernel(const T* input, int64_t output_idx = idx; int64_t new_dim_idx = 0; - for (size_t i = 0; i < rank; i++) { +#pragma unroll + for (size_t i = 0; i < DDim::kMaxRank; i++) { + if (i >= rank) { + break; + } new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i]; if (new_dim_idx >= sizes[i]) { output_idx += (shifts[i] - sizes[i]) * strides[i]; From c5d0e0c6b6930f8e25d24bf9c1ff189657552726 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 21 Sep 2023 07:24:41 +0800 Subject: [PATCH 06/39] sharding stage 2 main grad bug fix (#57537) --- python/paddle/distributed/sharding/group_sharded.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 2bbc93259eaa8..350f6eff4d001 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -28,6 
+28,9 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( GroupShardedScaler, ) +from paddle.distributed.fleet.utils.mix_precision_utils import ( + MixPrecisionOptimizer, +) from paddle.distributed.utils.log_utils import get_logger from paddle.optimizer import Optimizer @@ -111,9 +114,10 @@ def group_sharded_parallel( assert isinstance( model, paddle.nn.Layer ), "The model must be the instance of paddle.nn.Layer." - assert isinstance( - optimizer, Optimizer - ), "The optimizer must be the instance of paddle.optimizer.Optimizer." + assert isinstance(optimizer, (MixPrecisionOptimizer, Optimizer)), ( + "The optimizer must be the instance of paddle.optimizer.Optimizer " + "or MixPrecisionOptimizer for main grad." + ) assert level in [ 'os', 'os_g', From 058b008e721e87c2f7b25079d49c66b47849d175 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:11:47 +0800 Subject: [PATCH 07/39] =?UTF-8?q?=E3=80=90pir=E3=80=91add=20all=20Slice=20?= =?UTF-8?q?newir=20test=20=20(#57529)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add reference of lbfgs * add reference of lbfgs * tmp * split gen modify * fix conflict * add split * fix bug * fix bug * test split * add meta tensor * refine code * fix bug * fix bug * fix comflict * Call _C_ops.sum in new ir * modify concat kernel choose * modify ci * modify sum zero_dim optest * modify split_with_num api * modify split -1 * modify split test * fix bug * xxx * delete extra modify * add add_n * tmp * add split_with_num_grad * expand first * expand first * modify split grad num bug * modify ci * modify ci * clear code * modify * recover * add add_n stop_gradient infer * modify opreslut to value * fix conflict * recover to aviod conflict * recover to aviod conflict * modify opreslut to value * recover complex tanh * modify add_n optest * skip bfp16 * modify split bf16 * fix conflict * modify expand special case * delete print * code style * slice optest pass --------- Co-authored-by: zhangbo9674 Co-authored-by: 0x45f --- .../pir/dialect/op_generator/op_build_gen.py | 1 + test/legacy_test/test_slice_op.py | 56 +++++++++++++------ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index bfb20bb8e283d..33bb81e43bf64 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -19,6 +19,7 @@ 'SplitWithNumInferMeta', 'ConcatInferMeta', 'ReduceIntArrayAxisInferMeta', + 'SliceRawInferMeta', } _PREPARE_DATA_WITH_VECTOR_INT64_MTTABLE_ATTRIBUTE = {'FrobeniusNormOp'} diff --git a/test/legacy_test/test_slice_op.py b/test/legacy_test/test_slice_op.py index 194e933e1d0ec..065251b246928 100644 --- a/test/legacy_test/test_slice_op.py +++ b/test/legacy_test/test_slice_op.py @@ -71,7 +71,11 @@ def test_check_output(self): def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_prim=True + ['Input'], + 'Out', + max_relative_error=0.006, + check_prim=True, + check_new_ir=True, ) @@ -157,7 +161,11 @@ def test_check_output(self): def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_prim=True + ['Input'], + 'Out', + max_relative_error=0.006, + check_prim=True, + check_new_ir=True, ) @@ -195,10 +203,12 @@ def config(self): self.starts_infer = [-1, 0, -1] 
def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 2: starts(list, have tensor), ends(list, no tensor) @@ -238,10 +248,12 @@ def config(self): self.starts_infer = [1, -1, 2] def test_check_output(self): - self.check_output() + self.check_output(check_dygraph=True, check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) class TestSliceOp_decs_dim_5_starts_ListTensor( @@ -289,10 +301,12 @@ def config(self): self.out = self.input[1, 0:3, 2:4, :] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 4: starts(tensor), ends(tensor) @@ -325,10 +339,12 @@ def config(self): self.out = self.input[1:3, 0:3, 2:4, :] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 5: starts(tensor), ends(tensor) @@ -362,10 +378,12 @@ def config(self): self.out = self.input[1, 0, 2:4, :] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 6: starts(tensor), ends(list, have tensor) @@ -406,10 +424,12 @@ def config(self): self.ends_infer = [-1, 3, 4] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) class TestSliceOp_ZeroDim(OpTest): @@ -448,10 +468,10 @@ def config(self): self.out = self.input[0:20, 1:3, 1:3] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out') + self.check_grad(['Input'], 'Out', check_new_ir=True) # Test CUDA float16 @@ -499,6 +519,7 @@ def test_check_grad_normal(self): ['Input'], 'Out', check_prim=True, + check_new_ir=True, ) @@ -546,6 +567,7 @@ def test_check_grad_normal(self): 'Out', numeric_grad_delta=0.5, check_prim=True, + check_new_ir=True, ) @@ -578,7 +600,7 @@ def test_check_output(self): self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', check_prim=True) + self.check_grad(['Input'], 'Out', check_prim=True, check_new_ir=True) # Test python API From 164abf27d2ae1d8e90691b26bc01789002535d46 Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:49:37 +0800 Subject: [PATCH 08/39] Support control flow for static build [Step 2: support conditional_block] (#56696) * add conditional_block to OperatorBasesHandledInStaticBuild * run op in FakeInitializeOutputsForOperatorBase * add init_success judge * fix 
build error * fix * add SetSubBlockCore func * add PreStaticRun func * add PreStaticRun to interpreter_base and new_ir_inter * recover codes * add PreStaticBuild and BlockCanBeStaticBuilt * fix logic about RunPreStaticBuild * change CreateOpFromOpDesc type * fix build error * fix build error * remove IsOperatorBasesHandledInStaticBuild * recover BlockCanBeStaticBuilt * add logic about conditional_block run static build * recover codes * recover BlockCanBeStaticBuilt * support static build condational block op when condational block is the last op in the block * fix error * fix logic about last op * fit for sub block can't open static build * add IsStaticBuild * fix build error * fit logic when sub block can't open static build * close static build when sub_block don't support static_build * recover third party * add is_skil_fake_init logic * set the backend of the lamb * change start index * add if conditional for cal is_skip_fake_init * change name * close static_build for test_conditional_block * add static buiild support for conditional block in case of the output's dtype/place is changed but the following op is not use this output * fix logic error * fix timeout error * fix * remove useless codes * fix * fix * fix build error * move GetVarsInfo and RunPreStaticBuild from opeartor to static_build * fix lamb backend registe * fix build error * fix build error * remove lamp op test from new_ir_op_test_white_list * fix * move generating following_input_vars logic to static_build.cc * remove HasInfo * fix build error * recover codes and turn off the flag --- .../interpreter/interpreter_util.cc | 26 +- .../new_executor/interpreter/static_build.cc | 222 ++++++++++++++++-- .../new_executor/interpreter/static_build.h | 38 ++- .../new_executor/interpreter_base_impl.h | 6 + .../framework/new_executor/interpretercore.cc | 8 + .../framework/new_executor/interpretercore.h | 5 + .../new_executor/new_ir_interpreter.cc | 7 + .../new_executor/new_ir_interpreter.h | 6 + .../new_executor/program_interpreter.cc | 57 +++-- .../new_executor/program_interpreter.h | 6 +- paddle/phi/kernels/gpu/lamb_kernel.cu | 2 + test/legacy_test/CMakeLists.txt | 4 + test/white_list/new_ir_op_test_white_list | 2 - 13 files changed, 332 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 67106932169a3..8015a50545e69 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -527,11 +527,13 @@ platform::DeviceContext* ConstructDeviceContext(const OperatorBase* op, return default_dev_ctx; } -void HandleOperatorBase(const platform::Place& place, - std::shared_ptr op, - OpFuncNode* op_func_node, - Scope* scope, - bool static_build) { +void HandleOperatorBase( + const platform::Place& place, + std::shared_ptr op, + OpFuncNode* op_func_node, + Scope* scope, + bool static_build, + std::vector> following_ops) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); // input, output is prepared. set the other attributes. 
@@ -542,7 +544,8 @@ void HandleOperatorBase(const platform::Place& place, if (OperatorBasesMustRunInStaticBuild.count(op->Type())) { op->Run(*scope, place); } - FakeInitializeOutputsForOperatorBase(*op, place, scope); + + FakeInitializeOutputsForOperatorBase(*op, place, scope, following_ops); } else { op->Run(*scope, place); // Run without data transformer. } @@ -690,8 +693,15 @@ void BuildOpFuncList(const platform::Place& place, if (dynamic_cast(op) == nullptr) { VLOG(4) << "HandleOperatorBase"; // op is not a operatorwithkernel, so direcly run OperatorBase::Run() - HandleOperatorBase( - place, ops[i], &op_func_node, local_scope, static_build); + + std::vector> following_ops( + ops.begin() + i + 1, ops.end()); + HandleOperatorBase(place, + ops[i], + &op_func_node, + local_scope, + static_build, + following_ops); vec_func_list->emplace_back(op_func_node); } else { VLOG(4) << "OP is not null"; diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 69b4920050925..0f9bd3f387a92 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -15,11 +15,18 @@ #include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h" +#ifdef PADDLE_WITH_DNNL +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + // These Ops is OperatorBase, but we have been handle them in static build -std::set OperatorBasesHandledInStaticBuild = {"read"}; +std::set OperatorBasesHandledInStaticBuild = {"read", + "conditional_block"}; std::set OperatorBasesMustRunInStaticBuild = { "create_double_buffer_reader", "create_py_reader"}; @@ -53,11 +60,68 @@ namespace paddle { namespace framework { namespace interpreter { +using InterpreterCore = framework::InterpreterCore; + +static VarMetaInfo GetVarMetaInfo(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + phi::DataType dtype = phi::DataType::UNDEFINED; + phi::Place place = phi::Place(); + if (var == nullptr) { + return VarMetaInfo(name, dtype, place); + } + + if (var->IsType()) { + const phi::DenseTensor& tensor = var->Get(); + if (!UNLIKELY(!tensor.IsInitialized())) { + dtype = tensor.dtype(); + place = tensor.place(); + } + } else if (var->IsType()) { + auto tensor = var->Get().value(); + if (!UNLIKELY(!tensor.IsInitialized())) { + dtype = tensor.dtype(); + place = tensor.place(); + } + } + return VarMetaInfo(name, dtype, place); +} + +std::vector GetVarsInfo(const Scope* scope, + VariableNameMap var_map, + const OperatorBase& op) { + std::vector var_info; + + const std::unordered_set* no_need_buffer_vars = nullptr; + if (op.Info().NoNeedBufferVarsInferer()) { + no_need_buffer_vars = &(op.Info().NoNeedBufferVarsInferer()( + op.Inputs(), op.Outputs(), op.Attrs())); + if (no_need_buffer_vars->empty()) no_need_buffer_vars = nullptr; + } + for (auto it = var_map.begin(); it != var_map.end();) { + auto& var = *it; + bool is_no_need_buffer_var = + (no_need_buffer_vars && no_need_buffer_vars->count(var.first) > 0); + std::string var_name; + var_info.reserve(var_info.size() + var.second.size()); + for (size_t i = 0; i < var.second.size(); ++i) { + auto var_name = 
var.second[i]; + if (scope && is_no_need_buffer_var) { + var_info.emplace_back(GetVarMetaInfo(*scope, var_name)); + } else { + var_info.emplace_back(var_name); + } + } + ++it; + } + return var_info; +} + bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { - // in_black_list = (kernelCode >> 7) & 1 - // is_operator_base = (kernelCode >> 6) & 1 - // is_custom_op = (kernelCode >> 5) & 1 - // use_mkldnn = (kernelCode >> 4) & 1 + // in_black_list = (kernelCode >> 5) & 1 + // is_operator_base = (kernelCode >> 4) & 1 + // is_custom_op = (kernelCode >> 3) & 1 + // use_mkldnn = (kernelCode >> 2) & 1 + // sub_block_can_not_static_build = (kernelCode >> 1) & 1 using KernelCode = int8_t; std::set> invalid_ops; for (auto& op : block.AllOps()) { @@ -77,17 +141,22 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { use_mkldnn = attr.index() == 1 ? PADDLE_GET_CONST(int, attr) : PADDLE_GET_CONST(bool, attr); } - bool has_structured_kernel = - phi::KernelFactory::Instance().HasStructuredKernel(op_type); + + bool sub_block_can_not_static_build = false; + if (op->HasAttr("sub_block")) { + auto* sub_block = + PADDLE_GET_CONST(framework::BlockDesc*, op->GetAttr("sub_block")); + sub_block_can_not_static_build = !BlockCanBeStaticBuilt(*sub_block); + } KernelCode kernel_code = static_cast( - (in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) + - (use_mkldnn << 4) + (has_structured_kernel << 2)); + (in_black_list << 5) + (is_operator_base << 4) + (is_custom_op << 3) + + (use_mkldnn << 2) + (sub_block_can_not_static_build << 1)); if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { if (in_black_list || (is_operator_base && !OperatorBasesHandledInStaticBuild.count(op_type)) || - is_custom_op || use_mkldnn) { + is_custom_op || use_mkldnn || sub_block_can_not_static_build) { invalid_ops.insert(std::make_pair(op_type, kernel_code)); } } @@ -97,11 +166,12 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { std::stringstream ss; ss << "The following OPs are unable to static build:\n"; for (auto& item : invalid_ops) { - ss << item.first << " [in_black_list = " << (item.second >> 7 & 1) - << ", is_operator_base = " << (item.second >> 6 & 1) - << ", is_custom_op = " << (item.second >> 5 & 1) - << ", use_mkldnn = " << (item.second >> 4 & 1) - << (item.second >> 2 & 1) << "]\n"; + ss << item.first << " [in_black_list = " << (item.second >> 6 & 1) + << ", is_operator_base = " << (item.second >> 5 & 1) + << ", is_custom_op = " << (item.second >> 4 & 1) + << ", use_mkldnn = " << (item.second >> 3 & 1) + << ", sub_block_can_not_static_build = " << (item.second >> 1 & 1) + << "]\n"; } VLOG(1) << ss.str(); } @@ -318,9 +388,59 @@ void FakeInitializeTensorBase(const platform::DeviceContext& dev_ctx, } } -void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, - const phi::Place& place, - Scope* scope) { +void RunPreStaticBuild(const framework::Scope& scope, + const platform::Place& dev_place, + const OperatorBase& op) { + auto* scope_var = scope.FindVar(op.Output("Scope")); + PADDLE_ENFORCE_NOT_NULL( + scope_var, + platform::errors::PreconditionNotMet( + "Expect Scope variable to be set in conditional_block_op, but " + "got a null Scope variable. Please set the Scope variable.")); + + auto* scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + + auto& cur_scope = *scopes->front(); +#ifdef PADDLE_WITH_DNNL + // Executor on being destroyed clears oneDNN cache and resets + // registered model data layout. 
This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif + auto* block = op.Attr("sub_block"); + VLOG(3) << "Conditional block.idx = " << block->ID() + << ", scope = " << &cur_scope; + + auto& skip_vars = + op.Attr>("skip_eager_deletion_vars"); + + std::unique_ptr core; + LOG_FIRST_N(INFO, 1) + << "[ControlFlow][ConditionalBlock] New Executor is Running."; + + VLOG(10) << "[interpreterCore cache]" << core.get(); + VLOG_IF(10, core) << platform::is_same_place(core->GetPlace(), dev_place); + + framework::interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_control_flow_op = true; + execution_config.skip_gc_vars = + std::set(skip_vars.begin(), skip_vars.end()); + + core.reset( + new InterpreterCore(dev_place, *block, &cur_scope, execution_config)); + + std::vector op_func_nodes; + core->Build({}, &op_func_nodes); +} + +void FakeInitializeOutputsForOperatorBase( + const OperatorBase& op, + const phi::Place& place, + Scope* scope, + std::vector> following_ops) { const std::string& op_type = op.Type(); if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { return; @@ -329,7 +449,59 @@ void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, phi::DeviceContext* dev_ctx = platform::DeviceContextPool::Instance().Get(place); - if (op_type == "read") { + if (op_type == "conditional_block") { + // Note(sonder): skip fake init for conditional_block when there is no + // op with kernel after it. + bool skip_fake_init = true; + std::unordered_set following_input_vars; + + for (size_t i = 0; i < following_ops.size(); ++i) { + if (dynamic_cast( + following_ops[i].get()) != nullptr) { + VLOG(4) << "Find op with kernel after conditional_block : " + << following_ops[i]->Type(); + skip_fake_init = false; + auto input_vars_info = GetVarsInfo( + scope, following_ops[i]->Inputs(), *following_ops[i].get()); + for (auto& input_var_info : input_vars_info) { + following_input_vars.insert(input_var_info.name_); + } + } + } + + if (skip_fake_init) { + return; + } + + const std::vector out_var_info_before_build = + GetVarsInfo(scope, op.Outputs(), op); + + RunPreStaticBuild(*scope, place, op); + const std::vector out_var_info_after_build = + GetVarsInfo(scope, op.Outputs(), op); + + // Note(sonder): static_build is not supported if the output of + // conditional_block is changed after static build. + for (size_t i = 0; i < out_var_info_before_build.size(); ++i) { + // static build is supported in case of the output's dtype/place + // is changed but the following op is not use this output + if (out_var_info_before_build[i] != out_var_info_after_build[i]) { + auto var_name = out_var_info_before_build[i].name_; + if (following_input_vars.count(var_name)) { + PADDLE_THROW(phi::errors::PreconditionNotMet( + "The output %s s' dtype/place of conditional_block is " + "changed after static build. Befer static build, the " + "dtype is %s, place is %s. 
After static " + "build, the dtype is %s, place is %s.", + var_name, + out_var_info_before_build[i].dtype_, + out_var_info_before_build[i].place_, + out_var_info_after_build[i].dtype_, + out_var_info_after_build[i].place_)); + } + } + } + } else if (op_type == "read") { const std::string& reader_name = op.Input("Reader"); framework::ReaderHolder* reader = GET_DATA_SAFELY(scope->FindVar(reader_name), "Input", "Reader", "Read") @@ -448,6 +620,18 @@ void FakeInitializeOutputsForFunctionKernel( if (beta1_pow->place() == beta2_pow->place()) { backend = phi::TransToPhiBackend(beta1_pow->place()); } + } else if (op_type == "lamb") { + phi::TensorBase* beta1_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta1Pow")->second.at(0)); + phi::TensorBase* beta2_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta2Pow")->second.at(0)); + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU && + beta1_pow->place().GetType() == AllocationType::CPU && + beta2_pow->place().GetType() == AllocationType::CPU) { + backend = phi::Backend::CPU; + } else { + backend = phi::TransToPhiBackend(dev_ctx.GetPlace()); + } } else if (op_type == "reshape2") { phi::TensorBase* x = GetTensorFormVar(runtime_ctx.inputs.find("X")->second.at(0)); diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.h b/paddle/fluid/framework/new_executor/interpreter/static_build.h index e070f66b02549..302d612bc0311 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.h +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.h @@ -23,11 +23,39 @@ namespace paddle { namespace framework { namespace interpreter { +struct VarMetaInfo { + std::string name_; + phi::DataType dtype_; + phi::Place place_; + + explicit VarMetaInfo(const std::string& name) : name_(name) { + dtype_ = phi::DataType::UNDEFINED; + place_ = phi::Place(); + } + + VarMetaInfo(const std::string& name, + const phi::DataType& dtype, + const platform::Place& place) + : name_(name), dtype_(dtype), place_(place) {} + + bool operator==(const VarMetaInfo& other) const { + return name_ == other.name_ && dtype_ == other.dtype_ && + place_ == other.place_; + } + + bool operator!=(const VarMetaInfo& other) const { + return name_ != other.name_ || dtype_ != other.dtype_ || + place_ != other.place_; + } +}; + bool BlockCanBeStaticBuilt(const framework::BlockDesc& block); -void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, - const platform::Place& place, - Scope* scope); +void FakeInitializeOutputsForOperatorBase( + const OperatorBase& op, + const phi::Place& place, + Scope* scope, + std::vector> following_ops); void FakeInitializeOutputsForFunctionKernel( const framework::OperatorBase& op, @@ -40,6 +68,10 @@ void FakeInitializeOutputsForStructureKernel( const framework::OpKernelType& op_kernel_type, ExecutionContext* execution_context); +std::vector GetVarsInfo(const Scope* scope, + VariableNameMap var_map, + const OperatorBase& op); + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 2c030ef1dc264..369216e0078c4 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -97,6 +97,12 @@ class InterpreterBaseImpl { virtual std::shared_ptr> GetDependencyCount() const = 0; virtual bool IsSharedResultsBuild() const = 0; + + virtual void Build( + const std::vector& 
feed_names, + std::vector* op_func_nodes) = 0; + + virtual bool IsStaticBuild() const = 0; }; inline void SetDeviceId(const platform::Place& place) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index dc8110331a176..8e052d3b2685e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -121,5 +121,13 @@ void InterpreterCore::SetOutputHooks(const std::vector& hookfuncs) { impl_->SetOutputHooks(hookfuncs); } +void InterpreterCore::Build( + const std::vector& feed_names, + std::vector* op_func_nodes) { + impl_->Build(feed_names, op_func_nodes); +} + +bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 47f2d9c6a3378..d21bd9e1fc378 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -74,6 +74,11 @@ class InterpreterCore { void SetOutputHooks(const std::vector& hookfuncs); + void Build(const std::vector& feed_names, + std::vector* op_func_nodes); + + bool IsStaticBuild() const; + private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 6b6cabb991382..55f70a573a1bc 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1315,6 +1315,13 @@ void NewIRInterpreter::PreAnalysis() { VLOG(4) << "Done UpdateNcclOpNum"; } +void NewIRInterpreter::Build( + const std::vector& feed_names, + std::vector* op_func_nodes) { + PADDLE_THROW(platform::errors::Unimplemented( + "Build is not implemented in NewIRInterpreter.")); +} + ::pir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) { for (auto kv : value_2_var_name_) { if (kv.second == var_name) { diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index cf5cb21ce81aa..c05eb6770b2ba 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -100,6 +100,12 @@ class NewIRInterpreter : public InterpreterBaseImpl { void CheckCUDAGraphBeforeRun(const std::vector& feed_names); void PrepareForCUDAGraphCapture(); + void Build( + const std::vector& feed_names, + std::vector* op_func_nodes) override; + + bool IsStaticBuild() const override { return static_build_; } + // workqueue std::shared_ptr GetWorkQueue(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index a29e45515d894..1384a9fb487de 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -52,10 +52,6 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, var_scope_(scope) { VLOG(4) << "ProgramInterpreter(): " << this << " on " << place_; - static_build_ = FLAGS_new_executor_static_build && - !FLAGS_new_executor_use_cuda_graph && - interpreter::BlockCanBeStaticBuilt(block); - exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = 
main_thread_blocker_.RegisterEvent(kTaskCompletion); @@ -73,6 +69,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, } var_scope_.SetLocalScope(local_scope_); + static_build_ = FLAGS_new_executor_static_build && + !FLAGS_new_executor_use_cuda_graph && + interpreter::BlockCanBeStaticBuilt(block); + instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) { SchedulingPriority lhs_scheduling_priority = vec_instruction_[lhs].GetSchedulingPriority(); @@ -129,28 +129,10 @@ void ProgramInterpreter::RunImpl() { FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch) { - SetDeviceId(place_); - CheckCUDAGraphBeforeRun(feed_names); - -#ifdef PADDLE_WITH_DNNL - platform::AttachPointerHashToMKLDNNKey(this, place_); -#endif + std::vector op_func_nodes; + Build(feed_names, &op_func_nodes); if (!is_build_) { - LOG_FIRST_N(INFO, 1) << "New Executor is Running."; - paddle::framework::interpreter::BuildVariableScope( - block_, execution_config_, &var_scope_); - - std::vector op_func_nodes; - paddle::framework::interpreter::BuildOpFuncList( - place_, - block_, - execution_config_.skip_gc_vars, - &op_func_nodes, - &var_scope_, - execution_config_, - HasLocalScope(), - static_build_); SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); @@ -189,6 +171,33 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, } } +void ProgramInterpreter::Build( + const std::vector& feed_names, + std::vector* op_func_nodes) { + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_DNNL + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + if (!is_build_) { + LOG_FIRST_N(INFO, 1) << "New Executor is Running."; + paddle::framework::interpreter::BuildVariableScope( + block_, execution_config_, &var_scope_); + + paddle::framework::interpreter::BuildOpFuncList( + place_, + block_, + execution_config_.skip_gc_vars, + op_func_nodes, + &var_scope_, + execution_config_, + HasLocalScope(), + static_build_); + } +} + FetchList ProgramInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors) { diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 27348d57fcd17..bef6385c211fb 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -48,6 +48,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true) override; + void Build( + const std::vector& feed_names, + std::vector* op_func_nodes) override; + void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; void ShareBuildResultsFrom(const InterpreterBaseImpl& src) override; @@ -92,7 +96,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { force_evnets_to_wait_ = force_evnets_to_wait; } - bool IsStaticBuild() const { return static_build_; } + bool IsStaticBuild() const override { return static_build_; } private: // build graph diff --git a/paddle/phi/kernels/gpu/lamb_kernel.cu b/paddle/phi/kernels/gpu/lamb_kernel.cu index 220fa97a0e107..c1d1a812a881e 100644 --- a/paddle/phi/kernels/gpu/lamb_kernel.cu +++ b/paddle/phi/kernels/gpu/lamb_kernel.cu @@ -33,4 +33,6 @@ PD_REGISTER_KERNEL(lamb, kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(4).SetDataType(phi::DataType::UNDEFINED); 
kernel->OutputAt(5).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); } diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 5e000112784aa..9e7adef0a634f 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1272,6 +1272,10 @@ set_tests_properties( set_tests_properties( test_cuda_graph_static_mode_error PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1") +# In test_conditional_block, the sub block changes the dtype and place of the output variable. +# The changed variable is used in the following op. Static build is not supported for this case. +set_tests_properties(test_conditional_block + PROPERTIES ENVIRONMENT "FLAGS_new_executor_static_build=0") # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default. set(STATIC_BUILD_TESTS diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list index 613769ec5b657..b85c88fa6bb18 100644 --- a/test/white_list/new_ir_op_test_white_list +++ b/test/white_list/new_ir_op_test_white_list @@ -116,8 +116,6 @@ test_kron_op test_kthvalue_op test_label_smooth_op test_label_smooth_op_new_ir -test_lamb_op -test_lamb_op_static_build test_lerp_op test_lgamma_op test_linear_interp_v2_op From 33d8ee204897a27ccbbb81a052b81cd1dbdf04fe Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:55:35 +0800 Subject: [PATCH 09/39] [Pir] Support Run with feed_tensor (#57497) * refine * add flag * add ut --- .../new_executor/new_ir_interpreter.cc | 115 +++++++++++++++++- test/cpp/new_executor/CMakeLists.txt | 1 + .../standalone_executor_new_ir_test.cc | 81 ++++++++++++ 3 files changed, 195 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 55f70a573a1bc..47823eb82b428 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -935,8 +935,108 @@ void NewIRInterpreter::ConstructEventForJitInput() { paddle::framework::FetchList NewIRInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors) { - PADDLE_THROW(platform::errors::Unimplemented( - "Run with feed_tensors is not implemented in NewIRInterpreter.")); + auto FeedInput = [&] { + VLOG(4) << "Feed inputs"; + for (size_t i = 0; i < feed_names.size(); ++i) { + auto* feed_var = InnerScope()->FindVar(feed_names[i]); + PADDLE_ENFORCE_NOT_NULL( + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); + + auto feed_tensor = feed_var->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + feed_tensor->set_lod(feed_tensors[i].lod()); + } + }; + + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_DNNL + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + FeedInput(); + + if (!is_build_) { + LOG_FIRST_N(INFO, 1) << "New Executor is BetaRunning."; + // Build + VLOG(4) << "Done BuildScope"; + VLOG(4) << DebugValueInfo(); + + SolvePersisableVarNames(); + + VLOG(4) << "Parameter value include: "; + for (auto parameter : parameter_var_names_) { + VLOG(4) << "Parameter value: " << parameter; + } + + BuildInstruction(); + VLOG(4) << "Done BuildInstruction"; + + PreAnalysis(); + 
VLOG(4) << "Done PreAnalysis"; + + // Run + if (FLAGS_enable_new_ir_in_executor_trace_run || nccl_op_num_ > 1 || + ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && + (sync_op_num_ == 0))) { + LOG_FIRST_N(INFO, 1) << "New ir interpreter is running in BetaRun mode " + "with trace version."; + TraceRunImpl(); + } else { + LOG_FIRST_N(INFO, 1) << "New ir interpreter is running in BetaRun mode " + "with multi thread version."; + MultiThreadRunImpl(); + } + + is_build_ = true; + is_shared_results_build_ = true; + } else { + if (FLAGS_enable_new_ir_in_executor_trace_run || nccl_op_num_ > 1 || + ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && + (sync_op_num_ == 0))) { + TraceRunImpl(); + } else { + MultiThreadRunImpl(); + } + } + + if (HasLocalScope()) { + ClearLoDTensorArrayInLocalScope(); + } + // return Fetch Tensors + Scope* inner_scope = InnerScope(); + if (FLAGS_enable_new_ir_in_executor) { + framework::FetchList fetch_res; + + for (auto& var_name : fetch_var_names_) { + auto* var = inner_scope->FindVar(var_name); + VLOG(0) << "fetch " << var_name << "[" << var << "]"; + fetch_res.push_back(var->Get()); + } + + VLOG(4) << "get fetch list size: " << fetch_res.size(); + return fetch_res; + } else { + auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); + if (fetch_var) { + auto fetch_list = + std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } + } } FetchList NewIRInterpreter::Run(const std::vector& feed_names, @@ -1252,6 +1352,16 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { VLOG(4) << "begin to run op " << instr_node->Name(); if (!instr_node->IsArtificial()) { instr_node->Run(); + + if (FLAGS_benchmark) { + instr_node->DeviceContext().Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << instr_node->Name() // NOLINT + << "): context wait and get last error"; +#endif + } + VLOG(4) << __func__ << " OP id:" << instr_node->Id() << " name:" << instr_node->Name() << " type:" << (instr_node->KernelType() == OpFuncType::kCpuSync @@ -1260,6 +1370,7 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { ? 
"kGpuSync" : "kGpuAsync")) << " runs on " << platform::GetCurrentThreadName(); + VLOG(4) << "done instruction node run"; CheckGC(instr_node); VLOG(4) << "done CheckGC"; diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt index 00285e39f518b..af09520b12a54 100644 --- a/test/cpp/new_executor/CMakeLists.txt +++ b/test/cpp/new_executor/CMakeLists.txt @@ -10,6 +10,7 @@ if(NOT WIN32) pd_op_dialect pd_kernel_dialect pir + phi standalone_executor) endif() diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc index d200b2a1052ed..eac996ffebe0f 100644 --- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc +++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc @@ -97,6 +97,87 @@ TEST(StandaloneExecutor, run) { EXPECT_EQ(res3, true); } +TEST(StandaloneExecutor, run_feed_tensor) { + pir::IrContext* ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + + ctx->GetOrRegisterDialect(); + + pir::Builder builder = pir::Builder(ctx, program.block()); + + pir::OpInfo feed_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::FeedOp::name()); + + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {1}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0}}; + size_t offset = 0; + pir::Type dense_tensor_dtype = paddle::dialect::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset); + + pir::AttributeMap attr_map1; + attr_map1.insert(std::pair( + "name", pir::StrAttribute::get(ctx, "x"))); + attr_map1.insert(std::pair( + "col", pir::Int32Attribute::get(ctx, 0))); + pir::Operation* feed_op1 = + pir::Operation::Create({}, attr_map1, {dense_tensor_dtype}, feed_op_info); + program.block()->push_back(feed_op1); + + pir::AttributeMap attr_map2; + attr_map2.insert(std::pair( + "name", pir::StrAttribute::get(ctx, "y"))); + attr_map2.insert(std::pair( + "col", pir::Int32Attribute::get(ctx, 0))); + pir::Operation* feed_op2 = + pir::Operation::Create({}, attr_map2, {dense_tensor_dtype}, feed_op_info); + program.block()->push_back(feed_op2); + + builder.Build(feed_op1->result(0), + feed_op2->result(0)); + + auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program); + + auto place = platform::CPUPlace(); + Scope scope; + InterpreterCore test_core(place, {}, kernel_program->block(), &scope); + + std::stringstream os; + os << reinterpret_cast( + const_cast(test_core.Impl())); + std::string out_name = os.str() + "_inner_var_2"; + test_core.SetSkipGcVars({out_name}); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, dims, data_layout, lod, offset); + paddle::platform::DeviceContext* dev_ctx = + paddle::platform::DeviceContextPool::Instance().Get( + paddle::platform::CPUPlace()); + + phi::DenseTensor tensor_x; + tensor_x.set_meta(meta); + dev_ctx->Alloc(&tensor_x, phi::DataType::FLOAT32); + float* tensor_x_data = tensor_x.data(); + *tensor_x_data = 1.0; + + phi::DenseTensor tensor_y; + tensor_y.set_meta(meta); + dev_ctx->Alloc(&tensor_y, phi::DataType::FLOAT32); + float* tensor_y_data = tensor_y.data(); + *tensor_y_data = 2.0; + + test_core.Run({"x", "y"}, {tensor_x, tensor_y}); + + auto out_tensor = + test_core.local_scope() == nullptr + ? 
scope.FindVar(out_name)->Get() + : test_core.local_scope()->FindVar(out_name)->Get(); + + bool res0 = simple_cmp(out_tensor.data()[0], 3.0); + EXPECT_EQ(res0, true); +} + TEST(StandaloneExecutor, run_inplace_sqrt) { pir::IrContext* ctx = pir::IrContext::Instance(); pir::Program program((ctx)); From 2e5a6fbadef0fe215f08baba15dcecdf8039c7c6 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:12:09 +0800 Subject: [PATCH 10/39] [Pir] delete support mutable attribute for pow (#57503) * refien * fix bug * fix * refine --- .../fluid/pir/dialect/op_generator/api_gen.py | 8 +++++ .../fluid/pir/dialect/op_generator/op_gen.py | 8 +++++ paddle/fluid/primitive/codegen/gen.py | 2 +- test/legacy_test/test_activation_op.py | 32 ++----------------- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py index 5a3afdf2036a9..d7e74f72b652f 100644 --- a/paddle/fluid/pir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -125,6 +125,14 @@ def _parse_yaml(self, op_yaml_files, op_compat_yaml_file): op_compat_item = op_compat_parser.get_compat( op['forward']['name'] ) + + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items.append(OpInfoParser(op, op_compat_item)) return op_info_items diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 62e746044776d..46949bcb547a7 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -858,6 +858,14 @@ def OpGenerator( and 'forward' in op ): op_compat_item = op_compat_parser.get_compat(op['forward']['name']) + + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items[op['name']] = OpInfoParser(op, op_compat_item) # (3) CodeGen: Traverse op_info_items and generate ops_name_list = [] # all op class name store in this list diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index 0239f3d702e96..f9a920730967d 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -291,7 +291,7 @@ def extend_compat_info(apis, compats): backward_apis.append(apis_dict[backward_op_name]) support_tensor_attrs_names = [] compat_attrs_data_type = {} - if 'scalar' in compat_item: + if 'scalar' in compat_item and compat_item['op'] != "pow": for attr_name, attr_info in compat_item['scalar'].items(): if ( 'support_tensor' in attr_info diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 703cc4174d8f5..8b16ee5750eac 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -3588,33 +3588,7 @@ def init_shape(self): self.shape = [] -class TestPow_factor_tensor(TestActivation): - def setUp(self): - self.op_type = "pow" - self.python_api = paddle.pow - self.enable_cinn = False - self.init_dtype() - - np.random.seed(1024) - x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - out = np.power(x, 3) - - self.inputs = { - 'X': OpTest.np_dtype_to_base_dtype(x), - 'FactorTensor': np.array([3.0]).astype(self.dtype), - } - - self.attrs = {} - self.outputs = {'Out': out} - - def 
test_check_output(self): - self.check_output() - - def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad(['X'], 'Out') - +class TestPow_API(TestActivation): def test_api(self): with static_guard(): input = np.random.uniform(1, 2, [11, 17]).astype("float32") @@ -4526,7 +4500,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestLog1p) create_test_act_fp16_class(TestSquare) create_test_act_fp16_class(TestPow, check_prim=True) -create_test_act_fp16_class(TestPow_factor_tensor) +create_test_act_fp16_class(TestPow_API) create_test_act_fp16_class(TestSTanh) create_test_act_fp16_class(TestSoftplus) create_test_act_fp16_class(TestSoftsign) @@ -4657,7 +4631,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestLog1p) create_test_act_bf16_class(TestSquare) create_test_act_bf16_class(TestPow, check_prim=True) -create_test_act_bf16_class(TestPow_factor_tensor) +create_test_act_bf16_class(TestPow_API) create_test_act_bf16_class(TestSTanh) create_test_act_bf16_class(TestSoftplus) create_test_act_bf16_class(TestSoftsign) From 00bd3aa99f33add638f567998b74e07323e2b2b9 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 21 Sep 2023 10:29:31 +0800 Subject: [PATCH 11/39] update get/set parameter (#57539) --- .../pir/dialect/operator/ir/api_builder.cc | 11 ++++++++++ .../pir/dialect/operator/ir/api_builder.h | 6 +++++ .../pir/dialect/operator/ir/manual_api.cc | 21 ++++++------------ .../pir/dialect/operator/ir/manual_api.h | 4 +--- paddle/fluid/pybind/ir.cc | 15 +++++++++++++ .../fluid/pybind/manual_static_op_function.h | 7 +----- python/paddle/base/data_feeder.py | 2 +- python/paddle/base/executor.py | 11 +++++----- python/paddle/ir/core.py | 22 +++++++++++++------ python/paddle/nn/initializer/constant.py | 7 +++++- python/paddle/nn/initializer/xavier.py | 13 ++++++----- python/paddle/tensor/math.py | 2 -- test/ir/new_ir/test_build_model.py | 12 +++++----- 13 files changed, 82 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc index 893c664b78b08..0662ced1cb40c 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc @@ -48,5 +48,16 @@ void APIBuilder::ResetInsertionPointToEnd() { builder_->SetInsertionPointToEnd(builder_->block()); } +pir::Parameter* APIBuilder::GetParameter(const std::string& name) const { + pir::Program* program = builder_->block()->GetParentOp()->GetParentProgram(); + return program->GetParameter(name); +} + +void APIBuilder::SetParameter(const std::string& name, + std::unique_ptr&& parameter) { + pir::Program* program = builder_->block()->GetParentOp()->GetParentProgram(); + program->SetParameter(name, std::move(parameter)); +} + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.h b/paddle/fluid/pir/dialect/operator/ir/api_builder.h index a06f529d2c5be..060102de4bde0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.h +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.h @@ -17,6 +17,7 @@ #include "paddle/pir/core/builder.h" #include "paddle/pir/core/macros.h" +#include "paddle/pir/core/parameter.h" #include "paddle/pir/core/program.h" namespace paddle { @@ -40,6 +41,11 @@ class APIBuilder { void ResetInsertionPointToEnd(); + pir::Parameter* GetParameter(const std::string& name) const; + + void SetParameter(const std::string& name, + std::unique_ptr&& parameter); + std::shared_ptr GetBuilder() { return 
builder_; } private: diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc index ba8fc47744ed3..24e7a94b66650 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builtin_op.h" - +#include "paddle/pir/core/parameter.h" namespace paddle { namespace dialect { @@ -46,25 +46,18 @@ pir::OpResult zeros_like(pir::Value x, return paddle::dialect::full_like(x, 0, dtype, place); } -pir::OpResult get_parameter(const std::string& name, - phi::DataType dtype, - const std::vector& shape) { - phi::LoD lod; - size_t offset{0}; - pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - pir::IrContext::Instance(), - TransToIrDataType(dtype), - phi::DDim(shape.data(), shape.size()), - phi::DataLayout::UNDEFINED, - lod, - offset); +pir::OpResult get_parameter(const std::string& name) { + pir::Parameter* param = APIBuilder::Instance().GetParameter(name); pir::GetParameterOp get_parameter_op = APIBuilder::Instance().GetBuilder()->Build( - name, out_dense_tensor_type); + name, param->type()); return get_parameter_op.result(0); } void set_parameter(pir::Value parameter, const std::string& name) { + std::unique_ptr param( + new pir::Parameter(nullptr, 0, parameter.type())); + APIBuilder::Instance().SetParameter(name, std::move(param)); APIBuilder::Instance().GetBuilder()->Build(parameter, name); } diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index 7e5aba6fcbaa8..c919448f1ddb0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -32,9 +32,7 @@ pir::OpResult zeros_like(pir::Value x, phi::DataType dtype = phi::DataType::UNDEFINED, const Place& place = {}); -pir::OpResult get_parameter(const std::string& name, - phi::DataType dtype, - const std::vector& shape); +pir::OpResult get_parameter(const std::string& name); void set_parameter(pir::Value parameter, const std::string& name); diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index db3faebb1985b..913d7d6f7aa80 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -153,6 +153,11 @@ void BindProgram(py::module *m) { [](const std::shared_ptr &self) { return self->parameters_num(); }) + .def("move_parameters_from", + [](const std::shared_ptr &self, + const std::shared_ptr &other) { + self->set_parameters(std::move(other->parameters())); + }) .def( "global_block", [](std::shared_ptr self) { return self->block(); }, @@ -375,9 +380,19 @@ void BindOpOperand(py::module *m) { bool GetOpResultBoolAttr(const OpResult &self, const std::string &attr_name) { auto *defining_op = self.owner(); if (defining_op->HasAttribute(attr_name)) { + PADDLE_ENFORCE( + defining_op->attribute(attr_name).isa(), + paddle::platform::errors::InvalidArgument( + "%s: Callstack attributes of %s is not ArrayAttribute type", + attr_name)); auto attrs = defining_op->attribute(attr_name) .dyn_cast() .AsVector(); + PADDLE_ENFORCE(attrs[self.index()].isa(), + paddle::platform::errors::InvalidArgument( + "The index %d in %s is not BoolAttribute type", + self.index(), + attr_name)); return attrs[self.index()].dyn_cast().data(); } else { return true; diff --git a/paddle/fluid/pybind/manual_static_op_function.h 
b/paddle/fluid/pybind/manual_static_op_function.h index 68b9e22ec7f94..7c32b2ab1d4fa 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -35,13 +35,8 @@ static PyObject *static_api_get_parameter(PyObject *self, // Parse Attributes PyObject *name_obj = PyTuple_GET_ITEM(args, 0); std::string name = CastPyArg2String(name_obj, "name", 0); - PyObject *dtype_obj = PyTuple_GET_ITEM(args, 1); - phi::DataType dtype = CastPyArg2DataTypeDirectly(dtype_obj, "dtype", 1); - PyObject *shape_obj = PyTuple_GET_ITEM(args, 2); - phi::IntArray shape = CastPyArg2IntArray(shape_obj, "shape", 2); // Call ir static api - auto static_api_out = - paddle::dialect::get_parameter(name, dtype, shape.GetData()); + auto static_api_out = paddle::dialect::get_parameter(name); return ToPyObject(static_api_out); } catch (...) { diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py index 40154e1a0d429..78781a6856af1 100644 --- a/python/paddle/base/data_feeder.py +++ b/python/paddle/base/data_feeder.py @@ -47,7 +47,7 @@ _PADDLE_NEW_IR_DTYPE_2_NUMPY_DTYPE = { core.DataType.BOOL: 'bool', core.DataType.FLOAT16: 'float16', - core.DataType.UINT16: 'uint16', + core.DataType.BFLOAT16: 'uint16', core.DataType.FLOAT32: 'float32', core.DataType.FLOAT64: 'float64', core.DataType.INT8: 'int8', diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 9ea3d566c824a..e5fddd15329e3 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -515,11 +515,12 @@ def _add_pir_fetch_ops(program, fetch_list, fetch_var_name): if not has_fetch_operations( global_block, fetch_list, fetch_var_name, fetch_op ): - for i, fetch_input in enumerate(fetch_list): - assert isinstance( - fetch_input, OpResult - ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input)) - paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i) + with paddle.static.program_guard(program): + for i, fetch_input in enumerate(fetch_list): + assert isinstance( + fetch_input, OpResult + ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input)) + paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i) def _merge_tensors(tensor, micro_batch_num): diff --git a/python/paddle/ir/core.py b/python/paddle/ir/core.py index 0ce01ebb3f593..908319458ed39 100644 --- a/python/paddle/ir/core.py +++ b/python/paddle/ir/core.py @@ -251,6 +251,12 @@ def program_guard(main_program, startup_program=None): switch_startup_program(startup_program) +class ParameterMeta: + def __init__(self, shape, dtype): + self.shape = shape + self.dtype = dtype + + def create_parameter( dtype, shape, @@ -266,19 +272,21 @@ def create_parameter( op_result_name = unique_name.generate('parameter') startup_program = default_startup_program() main_program = default_main_program() - - with program_guard(default_main_program()): - param = get_parameter(op_result_name, dtype, shape) - trainable = kwargs.get('trainable', True) - param.stop_gradient = not trainable - param.is_persistable = True + parameter_meta = ParameterMeta(shape, dtype) with program_guard(startup_program): initializer = kwargs['initializer'] init_result = initializer( - param, param.get_defining_op().get_parent_block() + parameter_meta, startup_program.global_block() ) init_result.is_persistable = True set_parameter(init_result, op_result_name) + main_program.move_parameters_from(startup_program) + with program_guard(default_main_program()): + param = get_parameter(op_result_name, dtype, shape) + 
trainable = kwargs.get('trainable', True) + param.stop_gradient = not trainable + param.is_persistable = True + return param diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index bc2baf08c9bb1..b4e9ee1df266a 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -58,7 +58,12 @@ def forward(self, var, block=None): assert isinstance( var, - (framework.Variable, framework.EagerParamBase, paddle.ir.OpResult), + ( + framework.Variable, + framework.EagerParamBase, + paddle.ir.OpResult, + paddle.ir.core.ParameterMeta, + ), ) assert isinstance(block, (framework.Block, paddle.ir.Block)) diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 7f479111fba3d..40eb6a874c9da 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -88,12 +88,13 @@ def forward(self, var, block=None): block = self._check_block(block) assert isinstance(block, (framework.Block, paddle.ir.Block)) - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "xavier_init", - ) + if not isinstance(var, paddle.ir.core.ParameterMeta): + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "xavier_init", + ) f_in, f_out = self._compute_fans(var) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 5cdd91b075426..56c553bce797e 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -889,8 +889,6 @@ def divide(x, y, name=None): """ if in_dynamic_or_pir_mode(): return _C_ops.divide(x, y) - elif in_pir_mode(): - return paddle._ir_ops.divide(x, y) else: return _elementwise_op(LayerHelper('elementwise_div', **locals())) diff --git a/test/ir/new_ir/test_build_model.py b/test/ir/new_ir/test_build_model.py index f356cfc24ffdf..a6ddae7c443ea 100644 --- a/test/ir/new_ir/test_build_model.py +++ b/test/ir/new_ir/test_build_model.py @@ -31,12 +31,12 @@ def test_basic_network(self): exe = paddle.static.Executor() x_feed = np.ones([4, 4], dtype=np.float32) * 10 y_feed = np.ones([4, 4], dtype=np.float32) * 2 - (sum_value,) = exe.run( - main_program, - feed={'x': x_feed, 'y': y_feed}, - fetch_list=[sum_out], - ) - self.assertEqual(sum_value, 5 * 4 * 4) + (sum_value,) = exe.run( + main_program, + feed={'x': x_feed, 'y': y_feed}, + fetch_list=[sum_out], + ) + self.assertEqual(sum_value, 5 * 4 * 4) main_program = paddle.static.Program() with paddle.static.program_guard(main_program): From 47040ef6c6df4b95617a58636b3c13ab64112a5a Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:31:27 +0800 Subject: [PATCH 12/39] add all cast newir test (#57527) --- test/legacy_test/test_cast_op.py | 42 ++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py index e24eb6b44b631..47bc23d76f601 100644 --- a/test/legacy_test/test_cast_op.py +++ b/test/legacy_test/test_cast_op.py @@ -78,10 +78,16 @@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) class TestCastOpFp32ToFp16(OpTest): @@ -99,10 +105,16 
@@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) @unittest.skipIf( @@ -128,10 +140,16 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) @unittest.skipIf( @@ -157,20 +175,28 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) class TestCastOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # The input type of cast_op must be Variable. x1 = base.create_lod_tensor( np.array([[-1]]), [[1]], base.CPUPlace() ) self.assertRaises(TypeError, paddle.cast, x1, 'int32') + paddle.disable_static() class TestCastOpEager(unittest.TestCase): From 7bf03d344d53dd45ca23611d9de342e1e95c67d5 Mon Sep 17 00:00:00 2001 From: chen2016013 <111894720+chen2016013@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:43:39 +0800 Subject: [PATCH 13/39] [PIR] Print value info on python (#57471) * fix bug * rewrite __str__ in value and opresult to print info * fix bug * change as reviewed comments * change as reviewed comments * fix print str --- paddle/fluid/pybind/ir.cc | 37 +++++++++++++++++++++++++++++++- paddle/pir/core/ir_printer.cc | 5 +++++ paddle/pir/core/value.h | 2 ++ test/ir/new_ir/test_ir_pybind.py | 10 ++++++++- 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 913d7d6f7aa80..22fd0f40a36b5 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" @@ -91,6 +92,20 @@ inline void SetProgramInt64Attr(std::shared_ptr program, attr_name, pir::Int64Attribute::get(pir::IrContext::Instance(), value)); } +std::string GetValueInfo(Value v) { + std::stringstream ss; + ss << "define_op_name=" << v.dyn_cast().owner()->name(); + ss << ", index=" << v.dyn_cast().index(); + ss << ", dtype=" << v.type(); + if (v.type().isa()) { + ss << ", place=" + << v.type() + .dyn_cast() + .place(); + } + return ss.str(); +} + void BindProgram(py::module *m) { py::class_> program(*m, "Program", R"DOC( Create Python Program. 
Program is an abstraction of model structure, divided into @@ -353,7 +368,14 @@ void BindValue(py::module *m) { return self.impl() == other.Value::impl(); }) .def("__hash__", - [](const Value &self) { return std::hash{}(self); }); + [](const Value &self) { return std::hash{}(self); }) + .def("__str__", [](const Value &self) -> py::str { + std::ostringstream print_stream; + print_stream << "Value("; + print_stream << GetValueInfo(self); + print_stream << ")"; + return print_stream.str(); + }); } void BindOpOperand(py::module *m) { @@ -472,6 +494,19 @@ void BindOpResult(py::module *m) { }) .def("__hash__", [](OpResult &self) { return std::hash{}(self); }) + .def("__str__", + [](OpResult &self) -> py::str { + std::ostringstream print_stream; + print_stream << "OpResult("; + print_stream << GetValueInfo(self); + if (GetOpResultBoolAttr(self, kAttrStopGradients)) { + print_stream << ", stop_gradient=True"; + } else { + print_stream << ", stop_gradient=False"; + } + print_stream << ")"; + return print_stream.str(); + }) .def( "get_defining_op", [](const OpResult &self) -> pir::Operation * { diff --git a/paddle/pir/core/ir_printer.cc b/paddle/pir/core/ir_printer.cc index 52c49be812104..260d42e035e4d 100644 --- a/paddle/pir/core/ir_printer.cc +++ b/paddle/pir/core/ir_printer.cc @@ -317,6 +317,11 @@ void Operation::Print(std::ostream& os) { printer.PrintOperation(this); } +void Value::Print(std::ostream& os) const { + IrPrinter printer(os); + printer.PrintValue(*this); +} + void Type::Print(std::ostream& os) const { BasicIrPrinter printer(os); printer.PrintType(*this); diff --git a/paddle/pir/core/value.h b/paddle/pir/core/value.h index 81a1717540e3d..00c7aa123746e 100644 --- a/paddle/pir/core/value.h +++ b/paddle/pir/core/value.h @@ -72,6 +72,8 @@ class IR_API Value { OpOperand first_use() const; + void Print(std::ostream &os) const; + bool use_empty() const; bool HasOneUse() const; diff --git a/test/ir/new_ir/test_ir_pybind.py b/test/ir/new_ir/test_ir_pybind.py index 34aa4c90c873f..b9a6fb92ac548 100644 --- a/test/ir/new_ir/test_ir_pybind.py +++ b/test/ir/new_ir/test_ir_pybind.py @@ -103,6 +103,11 @@ def test_value(self): ) # test value == opresult self.assertEqual(add_op.operands_source()[0], matmul_op.results()[0]) + # test opresult print + self.assertTrue( + 'dtype=pd_op.tensor<4x4xf32>' + in add_op.operands_source()[0].__str__() + ) # test opresult == value self.assertEqual( add_op.operands()[0].source(), add_op.operands_source()[0] @@ -110,10 +115,13 @@ def test_value(self): # test opresult == opresult self.assertEqual(add_op.operands()[0].source(), matmul_op.results()[0]) + # test opresult print self.assertEqual( tanh_op.operands()[0].source().get_defining_op().name(), "pd_op.add" ) - + self.assertTrue( + 'pd_op.tensor<4x4xf32>' in tanh_op.operands()[0].source().__str__() + ) add_op.replace_all_uses_with(matmul_op.results()) self.assertEqual( tanh_op.operands()[0].source().get_defining_op().name(), From 3fd69fa01736459182576d5c1916766f0e287714 Mon Sep 17 00:00:00 2001 From: Ruibin Cheung Date: Thu, 21 Sep 2023 10:54:50 +0800 Subject: [PATCH 14/39] [NewComm] No.10 compatiable upgrade for distributed_fused_lamb op (#57424) * [NewComm] No.10 compatiable upgrade for distributed_fused_lamb op * fix --- .../optimizers/distributed_fused_lamb_op.cu | 354 ++++++++++++++---- .../phi/core/distributed/nccl_comm_context.cc | 17 + .../phi/core/distributed/nccl_comm_context.h | 20 +- test/legacy_test/CMakeLists.txt | 4 +- .../distributed_fused_lamb_test_base.py | 5 +- 
...est_distributed_fused_lamb_op_with_clip.py | 18 + ...buted_fused_lamb_op_with_gradient_merge.py | 17 + 7 files changed, 359 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index fdec898edbe91..a672f5ac99aa8 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -21,6 +21,7 @@ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/cuda_stream.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" @@ -28,6 +29,14 @@ #include "paddle/phi/kernels/funcs/tensor_to_string.h" #include "paddle/utils/optional.h" +#include "paddle/fluid/distributed/collective/utils.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); +#endif + #ifdef __NVCC__ #include "cub/cub.cuh" #include "math.h" // NOLINT @@ -48,6 +57,19 @@ using MasterT = typename phi::dtype::MPTypeTrait::Type; using phi::funcs::FlattenToString; using phi::funcs::ToVector; +static void CheckCommContextHasRingId( + const distributed::CommContextManager &comm_context_manager, int ring_id) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); +} + template static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); @@ -875,24 +897,68 @@ static void MultiTensorUpdateLambParamAndBetaPows( } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, - ncclComm_t comm, - const void *scale, - ncclRedOp_t *op) { +static bool CreatePreMulScaleOpIfSupported( + ncclDataType_t dtype, + ncclComm_t comm, + const void *scale, + ncclRedOp_t *op, + distributed::NCCLCommContext *comm_ctx = nullptr) { #if NCCL_VERSION_CODE >= 21100 - int ver; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); - if (ver >= 21100) { - VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( - op, const_cast(scale), dtype, ncclScalarDevice, comm)); - return true; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But parameter of comm_ctx should not be nullptr.")); + int ver = comm_ctx->GetNcclVersion(); + if (ver >= 21100) { + VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + comm_ctx->RedOpCreatePreMulSum( + op, const_cast(scale), dtype, ncclScalarDevice); + return true; + } + } else { + int ver; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); + if (ver >= 21100) { + VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + op, const_cast(scale), dtype, ncclScalarDevice, comm)); + return true; + } } #endif VLOG(10) << "ncclRedOpCreatePreMulSum is not supported."; return false; } +static void DestoryOpIfSupported( + ncclRedOp_t op, + ncclComm_t comm, + distributed::NCCLCommContext *comm_ctx = nullptr) { +#if NCCL_VERSION_CODE >= 21100 + VLOG(10) << "ncclRedOpDestroy starts"; + + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But parameter of comm_ctx should not be nullptr.")); + comm_ctx->RedOpDestroy(op); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); + } + VLOG(10) << "ncclRedOpDestroy ends"; + +#endif + VLOG(10) << "ncclRedOpDestroy is not supported."; +} + template static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, const T1 *x, @@ -922,7 +988,18 @@ static void NCCLSumWithScaleBase(const T *sendbuff, ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, + distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But parameter of comm_ctx should not be nullptr.")); + } + static_assert( std::is_same::value || std::is_same::value, "T must be either float32 or float16."); @@ -943,8 +1020,8 @@ static void NCCLSumWithScaleBase(const T *sendbuff, ncclRedOp_t op = ncclSum; ncclDataType_t dtype = std::is_same::value ? ncclFloat32 : ncclFloat16; - bool should_destroy_op = - scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op); + bool should_destroy_op = scale && CreatePreMulScaleOpIfSupported( + dtype, comm, scale, &op, comm_ctx); memory_utils::Buffer buffer(dev_ctx.GetPlace()); if (scale && !should_destroy_op) { T *new_sendbuff = buffer.Alloc(numel); @@ -952,21 +1029,44 @@ static void NCCLSumWithScaleBase(const T *sendbuff, sendbuff = new_sendbuff; } - if (UseReduceScatter) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( - sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + if (comm_ctx) { + // Here assume comm_ctx->GetNcclComm() have higher priority than comm + if (UseReduceScatter) { + // TODO(BeingGod): NCCLCommContext::ReduceScatter only accept DenseTensor, + // but sendbuff or recvbuff maybe allocated by Buffer. + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + dtype, + op, + comm_ctx->GetNcclComm(), + stream)); + } else { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but sendbuff or recvbuff maybe allocated by Buffer. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(sendbuff, + recvbuff, + recvcount, + dtype, + op, + comm_ctx->GetNcclComm(), + stream)); + } } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + if (UseReduceScatter) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( + sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + } } -#if NCCL_VERSION_CODE >= 21100 if (should_destroy_op) { - VLOG(10) << "ncclRedOpDestroy starts"; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); - VLOG(10) << "ncclRedOpDestroy ends"; + DestoryOpIfSupported(op, comm, comm_ctx); } -#endif } template @@ -977,9 +1077,17 @@ static void NCCLReduceScatterWithScale(const T *sendbuff, ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, + distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - NCCLSumWithScaleBase( - sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); + NCCLSumWithScaleBase(sendbuff, + recvbuff, + recvcount, + nranks, + comm, + stream, + dev_ctx, + comm_ctx, + scale); } template @@ -990,9 +1098,17 @@ static void NCCLAllReduceWithScale(const T *sendbuff, ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, + distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - NCCLSumWithScaleBase( - sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); + NCCLSumWithScaleBase(sendbuff, + recvbuff, + recvcount, + nranks, + comm, + stream, + dev_ctx, + comm_ctx, + scale); } #endif @@ -1643,26 +1759,71 @@ void DistributedFusedLambKernel( int64_t global_rank = 0, local_rank = 0; ncclComm_t global_comm = nullptr, local_comm = nullptr, external_comm = nullptr; - if (nranks > 1) { - auto *nccl_comm_handle = - paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place); - global_comm = nccl_comm_handle->comm(); - global_rank = nccl_comm_handle->rank(); + paddle::platform::NCCLComm *nccl_comm_handle = nullptr, + *local_nccl_comm_handle = nullptr; + distributed::NCCLCommContext *comm_ctx = nullptr, *local_comm_ctx = nullptr, + *external_comm_ctx = nullptr; + + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + CheckCommContextHasRingId(comm_context_manager, ring_ids[0]); + + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_ids[0]))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + global_comm = comm_ctx->GetNcclComm(); + global_rank = comm_ctx->GetRank(); if (local_shard) { - auto *local_nccl_comm_handle = - paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1], place); - local_comm = local_nccl_comm_handle->comm(); - local_rank = local_nccl_comm_handle->rank(); + CheckCommContextHasRingId(comm_context_manager, ring_ids[1]); + + local_comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_ids[1]))); + local_comm = local_comm_ctx->GetNcclComm(); + local_rank = local_comm_ctx->GetRank(); if (use_hierarchical_allreduce) { - external_comm = paddle::platform::NCCLCommContext::Instance() - .Get(ring_ids[2], place) - ->comm(); + CheckCommContextHasRingId(comm_context_manager, ring_ids[2]); + + external_comm_ctx = static_cast( + 
comm_context_manager.Get(std::to_string(ring_ids[2]))); + external_comm = external_comm_ctx->GetNcclComm(); } } else { local_comm = global_comm; local_rank = global_rank; } + + VLOG(3) << "new comm_context_manager has ring_id " << ring_ids[0]; + } else { + if (nranks > 1) { + nccl_comm_handle = + paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place); + global_comm = nccl_comm_handle->comm(); + global_rank = nccl_comm_handle->rank(); + if (local_shard) { + local_nccl_comm_handle = + paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1], + place); + local_comm = local_nccl_comm_handle->comm(); + local_rank = local_nccl_comm_handle->rank(); + if (use_hierarchical_allreduce) { + external_comm = paddle::platform::NCCLCommContext::Instance() + .Get(ring_ids[2], place) + ->comm(); + } + } else { + local_comm = global_comm; + local_rank = global_rank; + } + } } + memory_utils::Buffer grad_norm_square_buffer(place); auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc(2); memory_utils::Buffer cub_tmp_buffer(place); @@ -1715,7 +1876,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, @@ -1723,7 +1885,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); NCCLReduceScatterWithScale( fp16_grad_data, @@ -1732,7 +1895,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1740,7 +1904,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); } else { NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, @@ -1748,14 +1913,16 @@ void DistributedFusedLambKernel( nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); @@ -1766,14 +1933,16 @@ void DistributedFusedLambKernel( nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } // (2) Calculate the global grad norm GetSquareGradNorm(fp32_sum_grad, @@ -1786,6 +1955,8 @@ void DistributedFusedLambKernel( VLOG(1) << "Grad square norm before all reduce: " << FlattenToString(fp32_square_grad_norm, 1, place); if (num_devices > 1) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, @@ -1852,6 +2023,7 @@ void DistributedFusedLambKernel( local_comm, stream, dev_ctx, + local_comm_ctx, fp32_scale); NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, @@ -1860,8 +2032,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); - + dev_ctx, + external_comm_ctx); NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1870,6 +2042,7 @@ void DistributedFusedLambKernel( local_comm, stream, dev_ctx, + local_comm_ctx, fp16_scale); NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1878,7 +2051,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); } else { NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, @@ -1887,6 +2061,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp32_scale); NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, @@ -1895,6 +2070,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp16_scale); } fp32_sum_grad += (local_rank * fp32_numel_each_device); @@ -1907,6 +2083,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp32_scale); NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, @@ -1915,6 +2092,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp16_scale); } VLOG(1) << "FP32 HasNanInf after all reduce: " @@ -1929,6 +2107,8 @@ void DistributedFusedLambKernel( stream, &cub_tmp_buffer); if (num_devices > 1) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, @@ -1954,7 +2134,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, @@ -1962,7 +2143,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1970,7 +2152,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1978,7 +2161,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); } else { NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, @@ -1986,14 +2170,16 @@ void DistributedFusedLambKernel( nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); @@ -2004,14 +2190,16 @@ void DistributedFusedLambKernel( num_devices, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, num_devices, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } CheckHasNanInfGrad(fp32_sum_grad, fp32_numel_each_device, @@ -2021,6 +2209,8 @@ void DistributedFusedLambKernel( stream, &cub_tmp_buffer); if (num_devices > 1) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but fp32_square_grad_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, @@ -2165,6 +2355,8 @@ void DistributedFusedLambKernel( << FlattenToString(trust_ratio_div_square_norm, param_num, place); if (num_devices > 1) { if (use_master_param_norm) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but param_square_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(param_square_norm + fp32_global_param_num, param_square_norm + fp32_global_param_num, @@ -2174,6 +2366,8 @@ void DistributedFusedLambKernel( local_comm, stream)); } else { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but trust_ratio_div_square_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(trust_ratio_div_square_norm, trust_ratio_div_square_norm, @@ -2209,13 +2403,21 @@ void DistributedFusedLambKernel( beta2); if (num_devices > 1) { // ncclAllGather - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, - fp32_param_data, - fp32_numel_each_device, - ncclFloat32, - local_comm, - stream)); + if (local_comm_ctx) { + auto send_buf = paddle::distributed::GetPartialTensor( + *fp32_param_out, fp32_offset, fp32_numel_each_device); + auto recv_buf = paddle::distributed::GetPartialTensor( + *fp32_param_out, 0, fp32_numel_each_device); + local_comm_ctx->AllGather(&recv_buf, send_buf, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, + fp32_param_data, + fp32_numel_each_device, + ncclFloat32, + local_comm, + stream)); + } } beta1_pow_data = nullptr; @@ -2239,13 +2441,21 @@ void DistributedFusedLambKernel( beta2); if (num_devices > 1) { // ncclAllGather - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, - fp16_param_data, - fp16_numel_each_device, - ncclFloat16, - local_comm, - stream)); + if (local_comm_ctx) { + auto send_buf = paddle::distributed::GetPartialTensor( + *fp16_param_out, fp16_offset, fp16_numel_each_device); + auto recv_buf = paddle::distributed::GetPartialTensor( + *fp16_param_out, 0, fp16_numel_each_device); + local_comm_ctx->AllGather(&recv_buf, send_buf, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, + fp16_param_data, + fp16_numel_each_device, + ncclFloat16, + local_comm, + stream)); + } } } VLOG(10) << "Update Param done"; diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index 90b6a4c447c92..bd49f0cff1708 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -33,8 +33,11 @@ NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) : CommContext(rank, size) { PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&nccl_version_)); } +int NCCLCommContext::GetNcclVersion() { return nccl_version_; } + ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); } @@ -228,5 +231,19 @@ void NCCLCommContext::GroupStart() { } void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } +#if NCCL_VERSION_CODE >= 21100 +void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op, + void* scalar, + ncclDataType_t dtype, + ncclScalarResidence_t residence) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + op, scalar, dtype, residence, nccl_comm_)); +} + +void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_)); +} +#endif + } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index fdd45793a6387..b9fdce02f4b5f 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -40,7 +40,9 @@ namespace distributed { class NCCLCommContext final : public CommContext { public: NCCLCommContext(int rank, int size, ncclUniqueId nccl_id); - ~NCCLCommContext() 
{} + ~NCCLCommContext() override = default; + + int GetNcclVersion(); ncclComm_t GetNcclComm(); @@ -65,6 +67,7 @@ class NCCLCommContext final : public CommContext { const phi::DenseTensor& in_tensor, int root, gpuStream_t stream); + void Send(const phi::DenseTensor& in_tensor, const int64_t& count, const int& peer, @@ -99,9 +102,24 @@ class NCCLCommContext final : public CommContext { void GroupEnd(); +#if NCCL_VERSION_CODE >= 21100 + // Creates a new reduction operator which pre-multiplies input values by a + // given scalar locally before reducing them with peer values via summation. + void RedOpCreatePreMulSum(ncclRedOp_t* op, + void* scalar, + ncclDataType_t dtype, + ncclScalarResidence_t residence); + + // Destroys the reduction operator op. The operator must have been created by + // ncclRedOpCreatePreMul with the matching communicator comm. + void RedOpDestroy(ncclRedOp_t op); +#endif + private: DISABLE_COPY_AND_ASSIGN(NCCLCommContext); + int nccl_version_; + ncclComm_t nccl_comm_; std::unique_ptr dev_ctx_; diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 9e7adef0a634f..e6a060c7369a9 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1013,11 +1013,11 @@ set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT - 120) + 240) set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge - PROPERTIES TIMEOUT 120) + PROPERTIES TIMEOUT 240) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) diff --git a/test/legacy_test/distributed_fused_lamb_test_base.py b/test/legacy_test/distributed_fused_lamb_test_base.py index baffc7dd5e546..ea011becc9090 100644 --- a/test/legacy_test/distributed_fused_lamb_test_base.py +++ b/test/legacy_test/distributed_fused_lamb_test_base.py @@ -270,7 +270,10 @@ def setUpClass(cls): paddle.enable_static() paddle.set_flags({'FLAGS_cudnn_deterministic': True}) _clip_by_global_norm_using_mp_type(True) - fleet.init(role_maker=get_role_maker()) + if os.environ.get("FLAGS_dynamic_static_unified_comm") == "1": + paddle.distributed.collective._init_parallel_env("nccl") + else: + fleet.init(role_maker=get_role_maker()) def config(self): clip_after_allreduce = bool( diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 671e11e7702fe..32ee6fd8b3958 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -41,6 +41,7 @@ def run_test( max_global_norm=-1.0, gradient_merge_steps=1, use_master_acc_grad=True, + need_env={}, ): temp_dir = tempfile.TemporaryDirectory() if not paddle.is_compiled_with_cuda(): @@ -54,6 +55,8 @@ def run_test( '-u', '-m', 'paddle.distributed.launch', + '--devices', + '0,1', '--log_dir', log_dir, get_test_file(), @@ -65,6 +68,7 @@ def run_test( os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0) + os.environ.update(need_env) touch_file_env = 
'SUCCESS_TOUCH_FILE' touch_file_name = os.path.join( @@ -87,6 +91,20 @@ def test_1(self): def test_2(self): run_test(clip_after_allreduce=False, max_global_norm=0.01) + def test_1_new_comm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=0.01, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + + def test_2_new_comm(self): + run_test( + clip_after_allreduce=False, + max_global_norm=0.01, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py b/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py index 0c7096f5dae1a..f236be3a8d150 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py @@ -33,6 +33,23 @@ def test_gm_with_fp16_acc_grad(self): use_master_acc_grad=False, ) + def test_gm_new_comm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + + def test_gm_with_fp16_acc_grad_new_comm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2, + use_master_acc_grad=False, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + if __name__ == "__main__": unittest.main() From 892dee35a525f1c752f2cbeff1a72df38b569155 Mon Sep 17 00:00:00 2001 From: iLeGend <824040212@qq.com> Date: Thu, 21 Sep 2023 10:56:43 +0800 Subject: [PATCH 15/39] [NewComm] No.2 compatiable upgrade for partial_recv op (#57548) * [NewComm] No.2 compatiable upgrade for partial_recv op * fix * add header * fix typo --- .../collective/partial_recv_op.cu.cc | 91 +++++++++++++++---- 1 file changed, 74 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 0c33ca7c25c32..2a6aea1c7a13a 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -18,15 +18,21 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#include "paddle/fluid/distributed/collective/utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" + namespace paddle { namespace operators { template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { #if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ NCCL_VERSION_CODE >= 2703 auto out = ctx.Output("Out"); @@ -74,35 +80,86 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { auto map = distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(rid)) { // Use ProcessGroup - distributed::ProcessGroup *pg = map->get(rid); + distributed::ProcessGroup* pg = map->get(rid); auto task = pg->Recv(out, peer, offset, recv_numel, /*sync_op*/ true); task->Wait(); } else { gpuStream_t stream = nullptr; - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + platform::NCCLComm* comm = nullptr; + phi::distributed::NCCLCommContext* comm_ctx = nullptr; + + int nranks = 0; + int rank = 0; + + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + // Use New Communication Library + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(rid)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + stream = comm_ctx->GetStream(); + nranks = comm_ctx->GetSize(); + rank = comm_ctx->GetRank(); + + VLOG(3) << "new comm_context_manager has ring_id " << rid; + } else { + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + stream = comm->stream(); + nranks = comm->nranks(); + rank = comm->rank(); + + VLOG(3) << "old NCCLCommContext has ring_id" << rid; + } + if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. 
stream = ctx.cuda_device_context().stream(); - } else { - stream = comm->stream(); } + PADDLE_ENFORCE_LT(peer, - comm->nranks(), + nranks, platform::errors::InvalidArgument( "The value of peer (%d) you set must " - "be less than comm->nranks (%d).", + "be less than nranks (%d).", peer, - comm->nranks())); + nranks)); + ncclDataType_t dtype = platform::ToNCCLDataType(type); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclRecv(out->data() + offset, - recv_numel, - dtype, - peer, - comm->comm(), - stream)); - VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel - << " from offset[" << offset << "] from " << peer; + + if (comm_ctx) { + auto recv_buf = distributed::GetPartialTensor(*out, offset, recv_numel); + + comm_ctx->Recv(&recv_buf, recv_numel, peer, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclRecv(out->data() + offset, + recv_numel, + dtype, + peer, + comm->comm(), + stream)); + } + VLOG(3) << "rank " << rank << " recv " << recv_numel << " from offset[" + << offset << "] from " << peer; } #else PADDLE_THROW(platform::errors::Unavailable( From 431a791a2c7626dcc669efba9bd77a880c625123 Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:56:53 +0800 Subject: [PATCH 16/39] Enhanced tuple support I (#57469) * bugs_fix:tuple_support * bug_fixes * bug_fixes * bug_fixes * bug_fixes * bug_fixes * bug_fixes --- python/paddle/nn/functional/common.py | 30 +++++---- python/paddle/nn/layer/common.py | 8 +-- python/paddle/vision/ops.py | 26 +++++--- test/legacy_test/test_box_coder_op.py | 92 +++++++++++++++++++++------ test/legacy_test/test_min_op.py | 9 +++ test/legacy_test/test_unfold_op.py | 11 ++++ 6 files changed, 133 insertions(+), 43 deletions(-) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 5ef8e40d921b6..9b1da0dd36802 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -69,19 +69,19 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): Parameters: x(Tensor): 4-D Tensor, input tensor of format [N, C, H, W], data type can be float32 or float64 - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list, optional): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list, optional): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list, optional): the dilations of convolution kernel, should be + dilations(int|list|tuple, optional): the dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. 
@@ -116,38 +116,42 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): if isinstance(kernel_sizes, int): kernel_sizes = [kernel_sizes, kernel_sizes] else: - assert isinstance(kernel_sizes, list) and ( + assert isinstance(kernel_sizes, (list, tuple)) and ( len(kernel_sizes) == 2 - ), "kernel_sizes should either be an integer or a list of two integers" + ), "kernel_sizes should either be an integer or a list/tuple of two integers" + kernel_sizes = list(kernel_sizes) if isinstance(strides, int): strides = [strides, strides] else: - assert isinstance(strides, list) and ( + assert isinstance(strides, (list, tuple)) and ( len(strides) == 2 - ), "strides should either be an integer or a list of two integers" + ), "strides should either be an integer or a list/tuple of two integers" + strides = list(strides) if isinstance(dilations, int): dilations = [dilations, dilations] else: - assert isinstance(dilations, list) and ( + assert isinstance(dilations, (list, tuple)) and ( len(dilations) == 2 - ), "dilations should either be an integer or a list of two integers" + ), "dilations should either be an integer or a list/tuple of two integers" + dilations = list(dilations) if isinstance(paddings, int): paddings = [paddings] * 4 - elif isinstance(paddings, list): + elif isinstance(paddings, (list, tuple)): + paddings = list(paddings) if len(paddings) == 2: paddings = paddings * 2 elif len(paddings) == 4: pass else: raise ValueError( - "paddings should either be an integer or a list of 2 or 4 integers" + "paddings should either be an integer or a list/tuple of 2 or 4 integers" ) else: raise ValueError( - "Unexpected type of paddings, it should be either an integer or a list" + "Unexpected type of paddings, it should be either an integer or a list/tuple" "of 2 or 4 integers" ) diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index db11591db5fe7..0c55895d21253 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1551,17 +1551,17 @@ class Unfold(Layer): Parameters: - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list, optional): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list, optional): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0]. - dilations(int|list, optional): The dilations of convolution kernel, should be + dilations(int|list|tuple, optional): The dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. 
Normally there is no need for user to diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 677fd7602bcfa..d38f81a57ede9 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -616,10 +616,10 @@ def box_coder( left top coordinate of the anchor box, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var (List|Tensor|None): prior_box_var supports three types + prior_box_var (Tensor|List|tuple|None): prior_box_var supports four types of input. One is Tensor with shape [M, 4] which holds M group and - data type is float32 or float64. The second is list consist of - 4 elements shared by all boxes and data type is float32 or float64. + data type is float32 or float64. The second is list or tuple consist + of 4 elements shared by all boxes and data type is float32 or float64. Other is None and not involved in calculation. target_box (Tensor): This input can be a 2-D LoDTensor with shape [N, 4] when code_type is 'encode_center_size'. This input also can @@ -685,7 +685,11 @@ def box_coder( axis, [], ) - elif isinstance(prior_box_var, list): + elif isinstance(prior_box_var, (list, tuple)): + prior_box_var = list(prior_box_var) + assert ( + len(prior_box_var) == 4 + ), "Input prior_box_var must be Variable or list|tuple with 4 elements." output_box = _C_ops.box_coder( prior_box, None, @@ -696,9 +700,10 @@ def box_coder( prior_box_var, ) else: - raise TypeError("Input prior_box_var must be Variable or list") + raise TypeError( + "Input prior_box_var must be Variable or list|tuple" + ) return output_box - else: check_variable_and_dtype( prior_box, 'prior_box', ['float32', 'float64'], 'box_coder' @@ -720,10 +725,15 @@ def box_coder( } if isinstance(prior_box_var, Variable): inputs['PriorBoxVar'] = prior_box_var - elif isinstance(prior_box_var, list): + elif isinstance(prior_box_var, (list, tuple)): attrs['variance'] = prior_box_var + assert ( + len(attrs['variance']) == 4 + ), "Input prior_box_var must be Variable or list|tuple with 4 elements." 
else: - raise TypeError("Input prior_box_var must be Variable or list") + raise TypeError( + "Input prior_box_var must be Variable or list|tuple" + ) helper.append_op( type="box_coder", inputs=inputs, diff --git a/test/legacy_test/test_box_coder_op.py b/test/legacy_test/test_box_coder_op.py index 7221fb2ba73f6..72ef401aa5fb7 100644 --- a/test/legacy_test/test_box_coder_op.py +++ b/test/legacy_test/test_box_coder_op.py @@ -372,27 +372,30 @@ def setUp(self): def test_dygraph_with_static(self): paddle.enable_static() - prior_box = paddle.static.data( - name='prior_box', shape=[80, 4], dtype='float32' - ) - prior_box_var = paddle.static.data( - name='prior_box_var', shape=[80, 4], dtype='float32' - ) - target_box = paddle.static.data( - name='target_box', shape=[20, 80, 4], dtype='float32' - ) + exe = paddle.static.Executor() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + prior_box = paddle.static.data( + name='prior_box', shape=[80, 4], dtype='float32' + ) + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[80, 4], dtype='float32' + ) + target_box = paddle.static.data( + name='target_box', shape=[20, 80, 4], dtype='float32' + ) - boxes = paddle.vision.ops.box_coder( - prior_box=prior_box, - prior_box_var=prior_box_var, - target_box=target_box, - code_type="decode_center_size", - box_normalized=False, - ) + boxes = paddle.vision.ops.box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type="decode_center_size", + box_normalized=False, + ) - exe = paddle.static.Executor() boxes_np = exe.run( - paddle.static.default_main_program(), + main, feed={ 'prior_box': self.prior_box_np, 'prior_box_var': self.prior_box_var_np, @@ -419,6 +422,59 @@ def test_dygraph_with_static(self): paddle.enable_static() +class TestBoxCoderSupporttuple(unittest.TestCase): + def setUp(self): + np.random.seed(678) + self.prior_box_np = np.random.random((80, 4)).astype('float32') + self.target_box_np = np.random.random((20, 80, 4)).astype('float32') + + def test_support_tuple(self): + paddle.enable_static() + exe = paddle.static.Executor() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + prior_box = paddle.static.data( + name='prior_box', shape=[80, 4], dtype='float32' + ) + target_box = paddle.static.data( + name='target_box', shape=[20, 80, 4], dtype='float32' + ) + + boxes = paddle.vision.ops.box_coder( + prior_box=prior_box, + prior_box_var=(1, 2, 3, 4), + target_box=target_box, + code_type="decode_center_size", + box_normalized=False, + ) + + boxes_np = exe.run( + main, + feed={ + 'prior_box': self.prior_box_np, + 'target_box': self.target_box_np, + }, + fetch_list=[boxes], + )[0] + + paddle.disable_static() + prior_box_dy = paddle.to_tensor(self.prior_box_np) + target_box_dy = paddle.to_tensor(self.target_box_np) + + boxes_dy = paddle.vision.ops.box_coder( + prior_box=prior_box_dy, + prior_box_var=(1, 2, 3, 4), + target_box=target_box_dy, + code_type="decode_center_size", + box_normalized=False, + ) + boxes_dy_np = boxes_dy.numpy() + + np.testing.assert_allclose(boxes_np, boxes_dy_np) + paddle.enable_static() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/test/legacy_test/test_min_op.py b/test/legacy_test/test_min_op.py index 7de7108d7d1ad..e24471b20dca8 100644 --- a/test/legacy_test/test_min_op.py +++ b/test/legacy_test/test_min_op.py @@ -83,6 +83,15 @@ def 
test_imperative_api(self): z_expected = np.array(np.min(np_x, axis=0)) self.assertEqual((np_z == z_expected).all(), True) + def test_support_tuple(self): + paddle.disable_static() + np_x = np.array([10, 10]).astype('float64') + x = paddle.to_tensor(np_x) + z = paddle.min(x, axis=(0,)) + np_z = z.numpy() + z_expected = np.array(np.min(np_x, axis=0)) + self.assertEqual((np_z == z_expected).all(), True) + class TestOutDtype(unittest.TestCase): def test_min(self): diff --git a/test/legacy_test/test_unfold_op.py b/test/legacy_test/test_unfold_op.py index 8a7f2aaf199f3..ef8174256e5cb 100644 --- a/test/legacy_test/test_unfold_op.py +++ b/test/legacy_test/test_unfold_op.py @@ -144,6 +144,17 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(['X'], 'Y') + def test_support_tuple(self): + paddle.disable_static() + x = paddle.randn((10, 3, 64, 64)) + paddle.nn.functional.unfold(x, 3, (1, 1), 1, 1) + paddle.nn.functional.unfold(x, 3, 1, (1, 1), 1) + paddle.nn.functional.unfold(x, 3, 1, 1, (1, 1)) + out1 = paddle.nn.functional.unfold(x, 3, (1, 1), (1, 1), (1, 1)) + out2 = paddle.nn.functional.unfold(x, (3, 3), (1, 1), (1, 1), (1, 1)) + self.assertTrue(np.allclose(out1.numpy(), out2.numpy())) + paddle.enable_static() + class TestUnfoldFP16Op(TestUnfoldOp): def init_dtype(self): From 20893b0b10df7602c597fcfc920eaec015701860 Mon Sep 17 00:00:00 2001 From: Ligoml <39876205+Ligoml@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:57:05 +0800 Subject: [PATCH 17/39] Update CI api_docs_approval (#57542) * Don't Merge * make conflict * reset * updata check_api_approvals.sh --- tools/check_api_approvals.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 3989a0cceff1b..5f05b3cf6f080 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -43,22 +43,18 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n" - echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" - echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, Ligoml/LiMengLiu for general APIs.\n" check_approval 1 XiaoguangHu01 jeff41404 lanxianghit qingqing01 - check_approval 1 jzhang533 sunzhongkai588 dingjiaweiww Ligoml liuTINA0907 leiqing1 + check_approval 1 jzhang533 sunzhongkai588 Ligoml fi api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` if [ "$api_doc_spec_diff" != "" ]; then echo_line="You must have one TPM approval for API documents change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general API docs.\n" - echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n" - echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, Ligoml/LiMengLiu for general API docs.\n" - check_approval 1 
jzhang533 sunzhongkai588 dingjiaweiww Ligoml liuTINA0907 leiqing1 + check_approval 1 jzhang533 sunzhongkai588 Ligoml fi api_yaml_diff=`python ${PADDLE_ROOT}/tools/check_api_yaml_same.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec ${BRANCH} ${PADDLE_ROOT}` From 69fa09a223fbcbd668099d425655f141dc5c1883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 10:59:19 +0800 Subject: [PATCH 18/39] add API for ir_compare and move it from namespace optim to ir_utils (#57531) --- .../auto_schedule/search_space/search_state.cc | 7 +++---- .../auto_schedule/search_space/search_state.h | 4 ++-- paddle/cinn/ir/test/ir_compare_test.cc | 18 +++++++----------- paddle/cinn/ir/utils/ir_compare.cc | 8 ++++++++ paddle/cinn/ir/utils/ir_compare.h | 6 ++++++ paddle/cinn/ir/utils/ir_visitor.cc | 3 +-- 6 files changed, 27 insertions(+), 19 deletions(-) diff --git a/paddle/cinn/auto_schedule/search_space/search_state.cc b/paddle/cinn/auto_schedule/search_space/search_state.cc index 96ace0f505d7f..c16bf62840291 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state.cc +++ b/paddle/cinn/auto_schedule/search_space/search_state.cc @@ -133,11 +133,10 @@ bool SearchStateEqual::operator()(const SearchState& lhs, // compare exprs size firstly if (lhs_exprs.size() != rhs_exprs.size()) return false; - // compare every expr one by one with ir::IrEqualVisitor + // compare every expr one by one with ir::ir_utils::IrEqualVisitor for (int i = 0; i < lhs_exprs.size(); ++i) { - ir::IrEqualVisitor compartor( - /*allow_name_suffix_diff=*/true); // ignore suffix difference in name - if (!compartor.Compare(lhs_exprs[i], rhs_exprs[i])) return false; + if (!ir::ir_utils::IRCompare(lhs_exprs[i], rhs_exprs[i], true)) + return false; } return true; } diff --git a/paddle/cinn/auto_schedule/search_space/search_state.h b/paddle/cinn/auto_schedule/search_space/search_state.h index 7991fb9540188..b3f45c5cd746c 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state.h +++ b/paddle/cinn/auto_schedule/search_space/search_state.h @@ -70,8 +70,8 @@ struct SearchStateHash { size_t operator()(const SearchState& s) const; }; -// SearchStateHash equal functor, use ir::IrEqualVisitor to compare their AST -// struct and fields +// SearchStateHash equal functor, use ir::ir_utils::IrEqualVisitor to compare +// their AST struct and fields struct SearchStateEqual { bool operator()(const SearchState& lhs, const SearchState& rhs) const; }; diff --git a/paddle/cinn/ir/test/ir_compare_test.cc b/paddle/cinn/ir/test/ir_compare_test.cc index a1bca0cd5373f..cc9ce438221a2 100644 --- a/paddle/cinn/ir/test/ir_compare_test.cc +++ b/paddle/cinn/ir/test/ir_compare_test.cc @@ -23,7 +23,7 @@ namespace cinn { namespace ir { - +namespace ir_utils { TEST(TestIrCompare, SingleFunction) { Target target = common::DefaultHostTarget(); @@ -128,20 +128,16 @@ TEST(TestIrCompare, SingleFunction) { ASSERT_EQ(func2_str, utils::GetStreamCnt(funcs_2.front())); ASSERT_EQ(func3_str, utils::GetStreamCnt(funcs_3.front())); - IrEqualVisitor compartor; // they are different at the name of root ScheduleBlock - ASSERT_TRUE(compartor.Compare(funcs_1.front(), funcs_2.front())); + ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front())); // compare with itself - ASSERT_TRUE(compartor.Compare(funcs_1.front(), funcs_1.front())); - IrEqualVisitor compartor_allow_suffix_diff(true); + ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_1.front())); // they are euqal if allowing suffix of name different - ASSERT_TRUE( - 
compartor_allow_suffix_diff.Compare(funcs_1.front(), funcs_2.front())); + ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front(), true)); - ASSERT_FALSE(compartor.Compare(funcs_1.front(), funcs_3.front())); - ASSERT_FALSE( - compartor_allow_suffix_diff.Compare(funcs_1.front(), funcs_3.front())); + ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front())); + ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front(), true)); } - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_compare.cc b/paddle/cinn/ir/utils/ir_compare.cc index c303262d04fbd..87324be608048 100644 --- a/paddle/cinn/ir/utils/ir_compare.cc +++ b/paddle/cinn/ir/utils/ir_compare.cc @@ -22,6 +22,8 @@ namespace cinn { namespace ir { +namespace ir_utils { + bool IrEqualVisitor::Compare(const Expr& lhs, const Expr& rhs) { if (lhs.get() == rhs.get()) { // the same object, including both are null return true; @@ -358,5 +360,11 @@ bool IrEqualVisitor::Visit(const ScheduleBlockRealize* lhs, const Expr* other) { Compare(lhs->schedule_block, rhs->schedule_block); } +bool IRCompare(const Expr& lhs, const Expr& rhs, bool allow_name_suffix_diff) { + IrEqualVisitor ir_equal_visitor(allow_name_suffix_diff); + return ir_equal_visitor.Compare(lhs, rhs); +} + +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_compare.h b/paddle/cinn/ir/utils/ir_compare.h index 9e4b335857b98..d41e6db0441a7 100644 --- a/paddle/cinn/ir/utils/ir_compare.h +++ b/paddle/cinn/ir/utils/ir_compare.h @@ -20,6 +20,7 @@ namespace cinn { namespace ir { +namespace ir_utils { // Determine whether two ir AST trees are euqal by comparing their struct and // fields of each node through dfs visitor @@ -47,5 +48,10 @@ class IrEqualVisitor : public IRVisitorRequireReImpl { bool allow_name_suffix_diff_ = false; }; +bool IRCompare(const Expr& lhs, + const Expr& rhs, + bool allow_name_suffix_diff = false); + +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_visitor.cc b/paddle/cinn/ir/utils/ir_visitor.cc index 9ef6a78df1fcd..f55259be2c641 100644 --- a/paddle/cinn/ir/utils/ir_visitor.cc +++ b/paddle/cinn/ir/utils/ir_visitor.cc @@ -23,8 +23,7 @@ namespace ir { bool operator==(Expr a, Expr b) { if (a.get() == b.get()) return true; - IrEqualVisitor cmp; - return cmp.Compare(a, b); + return ir_utils::IRCompare(a, b); } bool operator!=(Expr a, Expr b) { return !(a == b); } From b718b1be52e67f72974de7db42fc0fecf070ac18 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:00:31 +0800 Subject: [PATCH 19/39] pir support pixel unshuffle op (#57521) --- paddle/fluid/operators/pixel_unshuffle_op.cc | 105 ------------------- paddle/phi/api/yaml/backward.yaml | 9 ++ paddle/phi/api/yaml/op_compat.yaml | 7 ++ paddle/phi/api/yaml/ops.yaml | 9 ++ paddle/phi/ops/compat/pixel_unshuffle_sig.cc | 30 ------ test/legacy_test/test_pixel_unshuffle.py | 4 +- 6 files changed, 27 insertions(+), 137 deletions(-) delete mode 100644 paddle/fluid/operators/pixel_unshuffle_op.cc delete mode 100644 paddle/phi/ops/compat/pixel_unshuffle_sig.cc diff --git a/paddle/fluid/operators/pixel_unshuffle_op.cc b/paddle/fluid/operators/pixel_unshuffle_op.cc deleted file mode 100644 index 52b7452d7a8cc..0000000000000 --- a/paddle/fluid/operators/pixel_unshuffle_op.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/backward.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class PixelUnshuffleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-class PixelUnshuffleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor), "
-             "the input feature data of PixelUnshuffleOp, the layout is "
-             "[N, C, H, W] or [N, H, W, C].");
-    AddOutput("Out",
-              "(Tensor, default Tensor), the output of "
-              "PixelUnshuffleOp. The layout is [N, C*factor^2, H/factor, "
-              "W/factor] or [N, H/factor, W/factor, C*factor^2].");
-    AddAttr("downscale_factor",
-            "the factor to decrease spatial resolution by.")
-        .SetDefault(1);
-    AddAttr(
-        "data_format",
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\", Specify the data format of the input data.")
-        .SetDefault("NCHW");
-
-    AddComment(R"DOC(
-    Pixel Unshuffle operator
-    This operator rearranges elements in a tensor of shape :math:`(*, C, H, W)`
-    to a tensor of shape :math:`(*, C\times r^2, H / r, W / r)`.
-
-    This operation is the reversion of PixelShuffle operation.
-
-    Please refer to the paper:
-    `Real-Time Single Image and Video Super-Resolution Using an Efficient
-    Sub-Pixel Convolutional Neural Network `_
-    by Shi et. al (2016) for more details.
- )DOC"); - } -}; - -template -class PixelUnshuffleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("pixel_unshuffle_grad"); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -class PixelUnshuffleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle, - PixelUnshuffleInferShapeFunctor, - PD_INFER_META(phi::PixelUnshuffleInferMeta)); - -REGISTER_OPERATOR(pixel_unshuffle, - ops::PixelUnshuffleOp, - ops::PixelUnshuffleOpMaker, - ops::PixelUnshuffleGradOpMaker, - ops::PixelUnshuffleGradOpMaker, - PixelUnshuffleInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle_grad, - PixelUnshuffleGradInferShapeFunctor, - PD_INFER_META(phi::PixelUnshuffleGradInferMeta)); - -REGISTER_OPERATOR(pixel_unshuffle_grad, - ops::PixelUnshuffleGradOp, - PixelUnshuffleGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index b6eeb5e07005c..2f48bb80478e6 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1646,6 +1646,15 @@ kernel : func : pixel_shuffle_grad +- backward_op : pixel_unshuffle_grad + forward : pixel_unshuffle (Tensor x, int downscale_factor=1, str data_format="NCHW") -> Tensor(out) + args : (Tensor out_grad, int downscale_factor, str data_format) + output : Tensor(x_grad) + infer_meta : + func : PixelUnshuffleGradInferMeta + kernel : + func : pixel_unshuffle_grad + - backward_op : poisson_grad forward : poisson (Tensor x) -> Tensor(out) args : (Tensor out_grad) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 31125b8df0ce7..8a85147a66da0 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2103,6 +2103,13 @@ outputs : out : Out +- op : pixel_unshuffle + backward : pixel_unshuffle_grad + inputs : + x : X + outputs : + out : Out + - op : poisson inputs : x : X diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 4e67144ba8a89..c93f94c2b3320 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1934,6 +1934,15 @@ func : pixel_shuffle backward : pixel_shuffle_grad +- op : pixel_unshuffle + args : (Tensor x, int downscale_factor=1, str data_format="NCHW") + output : Tensor + infer_meta : + func : PixelUnshuffleInferMeta + kernel : + func : pixel_unshuffle + backward : pixel_unshuffle_grad + - op : poisson args : (Tensor x) output : Tensor diff --git a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc b/paddle/phi/ops/compat/pixel_unshuffle_sig.cc deleted file mode 100644 index 6c983c1e24c28..0000000000000 --- a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
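A quick usage sketch of the signature registered in ops.yaml above, pixel_unshuffle(Tensor x, int downscale_factor=1, str data_format="NCHW"). The input shape below is an arbitrary assumption; only the [N, C*r^2, H/r, W/r] output layout comes from the removed OpMaker doc, and the functional API is the one the updated test wrapper below calls:

    import paddle

    x = paddle.rand([2, 4, 12, 12])  # NCHW; shape chosen only for illustration
    out = paddle.nn.functional.pixel_unshuffle(x, downscale_factor=2, data_format="NCHW")
    print(out.shape)  # [2, 16, 6, 6]: C * r^2, H / r, W / r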
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature PixelUnshuffleGradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("pixel_unshuffle_grad", - {"Out@GRAD"}, - {"downscale_factor", "data_format"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle_grad, - phi::PixelUnshuffleGradOpArgumentMapping); diff --git a/test/legacy_test/test_pixel_unshuffle.py b/test/legacy_test/test_pixel_unshuffle.py index ec6ce803d1277..eb2c287b3f886 100644 --- a/test/legacy_test/test_pixel_unshuffle.py +++ b/test/legacy_test/test_pixel_unshuffle.py @@ -69,8 +69,8 @@ def pixel_unshuffle_np(x, down_factor, data_format="NCHW"): def pixel_unshuffle_wrapper(x, downscale_factor, data_format): - return paddle._legacy_C_ops.pixel_unshuffle( - x, "downscale_factor", downscale_factor, "data_format", data_format + return paddle.nn.functional.pixel_unshuffle( + x, downscale_factor, data_format ) From c4dd10935231f0cf4253225e912a195435dd2d2b Mon Sep 17 00:00:00 2001 From: Zhenghai Zhang <65210872+ccsuzzh@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:07:18 +0800 Subject: [PATCH 20/39] [CodeStyle][task 39] enable isort in `python/paddle/base` (part1) (#57413) * enable isort rule in python/paddle/base * fix bug * fix bug * fix bug --- pyproject.toml | 3 +- python/paddle/base/backward.py | 19 ++++---- python/paddle/base/compiler.py | 19 ++++---- python/paddle/base/data_feed_desc.py | 3 +- python/paddle/base/data_feeder.py | 13 +++--- python/paddle/base/dataset.py | 6 ++- python/paddle/base/default_scope_funcs.py | 3 +- python/paddle/base/dygraph/base.py | 16 ++++--- python/paddle/base/dygraph/math_op_patch.py | 10 ++--- .../base/dygraph/tensor_patch_methods.py | 34 +++++++------- python/paddle/base/dygraph/tracer.py | 3 +- python/paddle/base/dygraph_utils.py | 3 +- python/paddle/base/executor.py | 33 ++++++-------- .../incubate/checkpoint/auto_checkpoint.py | 11 ++--- python/paddle/base/initializer.py | 3 +- python/paddle/base/io.py | 1 + python/paddle/base/layer_helper.py | 11 ++--- python/paddle/base/layer_helper_base.py | 9 ++-- python/paddle/base/layers/io.py | 6 +-- .../base/layers/layer_function_generator.py | 13 +++--- python/paddle/base/layers/math_op_patch.py | 11 +++-- python/paddle/base/lod_tensor.py | 3 +- python/paddle/base/multiprocess_utils.py | 7 ++- python/paddle/base/param_attr.py | 2 +- python/paddle/base/reader.py | 44 +++++++++---------- python/paddle/base/trainer_desc.py | 2 +- python/paddle/base/trainer_factory.py | 26 ++++++----- python/paddle/base/unique_name.py | 3 +- python/paddle/base/variable_index.py | 24 +++++----- python/paddle/base/wrapped_decorator.py | 3 +- 30 files changed, 172 insertions(+), 172 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e11ab2108c2be..8dd98b65873aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,9 @@ skip = ["build", "third_party", "__init__.py"] extend_skip_glob = [ # These files do not need to be formatted, # see .flake8 for more details - "python/paddle/base/**", "python/paddle/utils/gast/**", + 
"python/paddle/base/core.py", + "python/paddle/base/framework.py", ] [tool.ruff] diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 563e423e0c7ea..1f3f67a98b640 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -12,23 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .proto import framework_pb2 - -from paddle.base import framework as framework -from paddle.base import program_guard -from . import core import collections import copy import logging -from . import unique_name -from . import log_helper -import paddle.base -from .data_feeder import check_type +import re import warnings - from collections.abc import Sequence -import re +import paddle.base +from paddle.base import framework as framework +from paddle.base import program_guard + +from . import core, log_helper, unique_name +from .data_feeder import check_type +from .proto import framework_pb2 __all__ = [ 'append_backward', diff --git a/python/paddle/base/compiler.py b/python/paddle/base/compiler.py index 69ae6f1d31344..3ee939920dc2b 100644 --- a/python/paddle/base/compiler.py +++ b/python/paddle/base/compiler.py @@ -14,9 +14,9 @@ import sys import warnings -from . import framework -from .framework import cuda_places, cpu_places, xpu_places -from . import core + +from . import core, framework +from .framework import cpu_places, cuda_places, xpu_places __all__ = [ 'CompiledProgram', @@ -399,10 +399,11 @@ def convert_concrete_program( """ Convert the ConcreteProgram to IPUConcreteProgram. """ - from ..base.dygraph.base import switch_to_static_graph + import paddle + from ..base import backward + from ..base.dygraph.base import switch_to_static_graph from ..base.framework import device_guard - import paddle inputs = concrete_program.inputs outputs = concrete_program.outputs @@ -508,14 +509,12 @@ def patch_program_cache(ipu_strategy): Returns: None """ + from paddle.jit.dy2static import logging_utils + from paddle.jit.dy2static.partial_program import partial_program_from from paddle.jit.dy2static.program_translator import ( + MAX_TRACED_PROGRAM_COUNT, CacheKey, ProgramCache, - MAX_TRACED_PROGRAM_COUNT, - ) - from paddle.jit.dy2static import logging_utils - from paddle.jit.dy2static.partial_program import ( - partial_program_from, ) old_getter = ProgramCache.__getitem__ diff --git a/python/paddle/base/data_feed_desc.py b/python/paddle/base/data_feed_desc.py index 8aa69890f1933..de1b00d090bb1 100644 --- a/python/paddle/base/data_feed_desc.py +++ b/python/paddle/base/data_feed_desc.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.base.proto import data_feed_pb2 from google.protobuf import text_format +from paddle.base.proto import data_feed_pb2 + __all__ = ['DataFeedDesc'] diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py index 78781a6856af1..52ed983ffa729 100644 --- a/python/paddle/base/data_feeder.py +++ b/python/paddle/base/data_feeder.py @@ -12,20 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import core -import numpy as np -import warnings import struct +import warnings + +import numpy as np +from ..ir import OpResult +from . 
import core from .framework import ( Variable, + _cpu_num, + _cuda_ids, default_main_program, in_dygraph_mode, in_pir_mode, ) -from .framework import _cpu_num, _cuda_ids - -from ..ir import OpResult __all__ = ['DataFeeder'] diff --git a/python/paddle/base/dataset.py b/python/paddle/base/dataset.py index 533fb69a6621b..961a392349707 100644 --- a/python/paddle/base/dataset.py +++ b/python/paddle/base/dataset.py @@ -13,10 +13,12 @@ # limitations under the License. """This is definition of dataset class, which is high performance IO.""" -from paddle.base.proto import data_feed_pb2 from google.protobuf import text_format -from . import core + +from paddle.base.proto import data_feed_pb2 + from ..utils import deprecated +from . import core __all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset'] diff --git a/python/paddle/base/default_scope_funcs.py b/python/paddle/base/default_scope_funcs.py index 80cfe40db57ad..992714e6cd409 100644 --- a/python/paddle/base/default_scope_funcs.py +++ b/python/paddle/base/default_scope_funcs.py @@ -26,9 +26,10 @@ invoked in a new local scope. """ -import paddle.base.core import threading +import paddle.base.core + __tl_scope__ = threading.local() __all__ = [ diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py index 7edb748026d84..d85fc8ca25bf7 100644 --- a/python/paddle/base/dygraph/base.py +++ b/python/paddle/base/dygraph/base.py @@ -11,20 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator -import decorator import inspect import sys +import warnings + +import decorator import numpy as np -from paddle.base import core -from paddle.base import framework + +import paddle +from paddle.base import core, framework from paddle.base.framework import global_var from paddle.base.multiprocess_utils import CleanupFuncRegistrar -from .tracer import Tracer + from ..data_feeder import convert_dtype -import warnings from ..framework import _get_paddle_place -import paddle +from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator +from .tracer import Tracer __all__ = [ 'no_grad', diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py index 9448d7d9de9dd..5972b545f93e2 100644 --- a/python/paddle/base/dygraph/math_op_patch.py +++ b/python/paddle/base/dygraph/math_op_patch.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .. import core -from ..framework import ( - convert_np_dtype_to_dtype_, -) -from .. import framework - import numpy as np + from paddle import _C_ops, _legacy_C_ops +from .. import core, framework +from ..framework import convert_np_dtype_to_dtype_ + _supported_int_dtype_ = [ core.VarDesc.VarType.UINT8, core.VarDesc.VarType.INT8, diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index 8026884c34fc8..4f1b138abaae4 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -13,33 +13,33 @@ # limitations under the License. import inspect -import numpy as np -import warnings import sys +import warnings + +import numpy as np import paddle -from .. import framework -from ..framework import convert_np_dtype_to_dtype_ -from .. 
import core -from .. import unique_name +import paddle.profiler as profiler +import paddle.utils.deprecated as deprecated +from paddle import _C_ops +from paddle.base.data_feeder import ( + _PADDLE_DTYPE_2_NUMPY_DTYPE, + convert_uint16_to_float, +) +from paddle.profiler.utils import in_profiler_mode + +from .. import core, framework, unique_name from ..framework import ( - Variable, + EagerParamBase, Parameter, + Variable, _getitem_static, - _setitem_static, _setitem_impl_, - EagerParamBase, + _setitem_static, + convert_np_dtype_to_dtype_, ) from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_tensor -from paddle.base.data_feeder import ( - convert_uint16_to_float, - _PADDLE_DTYPE_2_NUMPY_DTYPE, -) -import paddle.utils.deprecated as deprecated -import paddle.profiler as profiler -from paddle.profiler.utils import in_profiler_mode -from paddle import _C_ops _grad_scalar = None diff --git a/python/paddle/base/dygraph/tracer.py b/python/paddle/base/dygraph/tracer.py index 35cbe88f91f87..4df9517073c66 100644 --- a/python/paddle/base/dygraph/tracer.py +++ b/python/paddle/base/dygraph/tracer.py @@ -13,9 +13,8 @@ # limitations under the License. -from paddle.base import core -from paddle.base import framework from paddle import _C_ops, _legacy_C_ops +from paddle.base import core, framework name_mapping = { "graph_send_recv": { diff --git a/python/paddle/base/dygraph_utils.py b/python/paddle/base/dygraph_utils.py index 655a5f4f8b773..926c4680017ce 100644 --- a/python/paddle/base/dygraph_utils.py +++ b/python/paddle/base/dygraph_utils.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .framework import dygraph_only from paddle import _legacy_C_ops +from .framework import dygraph_only + @dygraph_only def _append_activation_in_dygraph( diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index e5fddd15329e3..0921d7b79d14b 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -12,36 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging import os import sys import warnings -import numpy as np +from functools import lru_cache -from . import set_flags, get_flags -from .framework import Program, default_main_program +import numpy as np from ..ir import OpResult -from .wrapped_decorator import signature_safe_contextmanager +from . import compiler, core, framework, get_flags, set_flags, unique_name from .data_feeder import convert_dtype -from .framework import Variable, Operator, in_pir_mode - from .framework import ( - convert_np_dtype_to_dtype_, + Operator, + Program, + Variable, _apply_pass, + convert_np_dtype_to_dtype_, + default_main_program, + in_pir_mode, paddle_type_to_proto_type, ) - -from . import core -from . import unique_name -from . import compiler -from .trainer_factory import TrainerFactory -from .trainer_factory import FetchHandlerMonitor -import copy -from . 
import framework from .incubate.checkpoint import auto_checkpoint as acp - -from functools import lru_cache +from .trainer_factory import FetchHandlerMonitor, TrainerFactory +from .wrapped_decorator import signature_safe_contextmanager __all__ = ['Executor', 'global_scope', 'scope_guard'] @@ -614,8 +609,8 @@ def _to_str(var): def _prepare_fleet_executor(): - from ..distributed.fleet.proto import fleet_executor_desc_pb2 from ..distributed.backup_env import getenv_or_backup + from ..distributed.fleet.proto import fleet_executor_desc_pb2 trainer_endpoints_str = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS", "") trainer_endpoints = trainer_endpoints_str.split(',') @@ -945,7 +940,7 @@ def _get_program_and_executor(self, cached_data): # print(f"Program after convert:\n {inner_program}", flush=True) else: build_strategy = None - from paddle.incubate.autograd import prim_enabled, prim2orig + from paddle.incubate.autograd import prim2orig, prim_enabled if prim_enabled() and program == default_main_program(): prim2orig() diff --git a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py index 23239b692c975..e8f75f3a4ed55 100644 --- a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py +++ b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import logging import json +import logging import os +import sys import time from threading import current_thread -from paddle.base import unique_name, compiler -from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel -from paddle.base.framework import in_dygraph_mode, Program +from paddle.base import compiler, unique_name +from paddle.base.framework import Program, in_dygraph_mode + +from .checkpoint_saver import CheckpointSaver, PaddleModel, SerializableBase g_train_epoch_range = None g_checker = None diff --git a/python/paddle/base/initializer.py b/python/paddle/base/initializer.py index 3902281721eac..7443e63b13e52 100644 --- a/python/paddle/base/initializer.py +++ b/python/paddle/base/initializer.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .data_feeder import check_type import paddle +from .data_feeder import check_type + __all__ = ['set_global_initializer'] _global_weight_initializer_ = None diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py index 89468f88648e8..a2c7d02ede349 100644 --- a/python/paddle/base/io.py +++ b/python/paddle/base/io.py @@ -15,6 +15,7 @@ import logging from paddle.base.log_helper import get_logger + from . import reader from .reader import * diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py index e6be93e777b75..312eaf67a3320 100644 --- a/python/paddle/base/layer_helper.py +++ b/python/paddle/base/layer_helper.py @@ -13,18 +13,19 @@ # limitations under the License. import copy + import paddle + +from . import unique_name +from .dygraph_utils import _append_activation_in_dygraph from .framework import ( Parameter, + _global_flags, dtype_is_floating, in_dygraph_mode, - _global_flags, ) -from . 
import unique_name -from .param_attr import ParamAttr - from .layer_helper_base import LayerHelperBase -from .dygraph_utils import _append_activation_in_dygraph +from .param_attr import ParamAttr class LayerHelper(LayerHelperBase): diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py index b7bc6c6b8585e..6c047c08766fe 100644 --- a/python/paddle/base/layer_helper_base.py +++ b/python/paddle/base/layer_helper_base.py @@ -13,21 +13,22 @@ # limitations under the License. import copy + import numpy as np + import paddle +from . import core, unique_name from .framework import ( Variable, + _current_expected_place, default_main_program, default_startup_program, in_dygraph_mode, in_pir_mode, - _current_expected_place, ) -from . import unique_name +from .initializer import _global_bias_initializer, _global_weight_initializer from .param_attr import ParamAttr, WeightNormParamAttr -from . import core -from .initializer import _global_weight_initializer, _global_bias_initializer __all__ = ['LayerHelperBase'] diff --git a/python/paddle/base/layers/io.py b/python/paddle/base/layers/io.py index d4aa7734aee6f..51f5b10fe0618 100644 --- a/python/paddle/base/layers/io.py +++ b/python/paddle/base/layers/io.py @@ -14,13 +14,9 @@ from .. import core from ..executor import global_scope -from ..framework import ( - default_main_program, - default_startup_program, -) +from ..framework import default_main_program, default_startup_program from ..unique_name import generate as unique_name - __all__ = [] diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py index 1b1b85d00ea42..bd11a412ffc5b 100644 --- a/python/paddle/base/layers/layer_function_generator.py +++ b/python/paddle/base/layers/layer_function_generator.py @@ -13,21 +13,22 @@ # limitations under the License. import re -import warnings import string - +import warnings from io import StringIO -from ..proto import framework_pb2 + +from paddle import _C_ops, _legacy_C_ops + +from ..data_feeder import check_variable_and_dtype from ..framework import ( OpProtoHolder, Variable, - core, convert_np_dtype_to_dtype_, + core, in_dygraph_mode, ) from ..layer_helper import LayerHelper -from ..data_feeder import check_variable_and_dtype -from paddle import _C_ops, _legacy_C_ops +from ..proto import framework_pb2 __all__ = [ 'generate_layer_fn', diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index 06f384eae23d1..53f35939b1f3a 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings import inspect +import warnings + +from paddle.base.dygraph.base import in_to_static_mode from .. 
import core -from ..framework import Variable, unique_name, static_only +from ..framework import Variable, static_only, unique_name from .layer_function_generator import OpProtoHolder -from paddle.base.dygraph.base import in_to_static_mode _supported_int_dtype_ = [ core.VarDesc.VarType.BOOL, @@ -354,9 +355,7 @@ def pop(self, *args): Returns: Variable: self[index] """ - from paddle.jit.dy2static.convert_operators import ( - _run_paddle_pop, - ) + from paddle.jit.dy2static.convert_operators import _run_paddle_pop if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: raise TypeError( diff --git a/python/paddle/base/lod_tensor.py b/python/paddle/base/lod_tensor.py index 96e18ec8f3bde..4be41d5cc6adc 100644 --- a/python/paddle/base/lod_tensor.py +++ b/python/paddle/base/lod_tensor.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np + from . import core from .data_feeder import DataToLoDTensorConverter -import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] diff --git a/python/paddle/base/multiprocess_utils.py b/python/paddle/base/multiprocess_utils.py index b763446930fdb..8d18db0bb3ea8 100644 --- a/python/paddle/base/multiprocess_utils.py +++ b/python/paddle/base/multiprocess_utils.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import signal import atexit +import queue +import signal +import sys from . import core -import queue - # multi-process worker check indices queue interval, avoid # hanging in subprocess data loading MP_STATUS_CHECK_INTERVAL = 5.0 diff --git a/python/paddle/base/param_attr.py b/python/paddle/base/param_attr.py index a17432fcc3df2..674c4ad4328c5 100644 --- a/python/paddle/base/param_attr.py +++ b/python/paddle/base/param_attr.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -from paddle.regularizer import WeightDecayRegularizer from paddle.base.data_feeder import check_type +from paddle.regularizer import WeightDecayRegularizer __all__ = [ 'ParamAttr', diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py index 63b97ee2bd495..c3a65721db275 100644 --- a/python/paddle/base/reader.py +++ b/python/paddle/base/reader.py @@ -12,44 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import core +import logging +import multiprocessing +import queue import sys -import numpy as np import threading +import warnings + +import numpy as np + import paddle +from paddle.base.framework import _set_expected_place +from . 
import core +from .data_feeder import BatchedTensorProvider, DataFeeder +from .executor import global_scope from .framework import ( Program, - program_guard, + _current_expected_place, + _get_paddle_place, + _get_paddle_place_list, default_main_program, default_startup_program, in_dygraph_mode, - _current_expected_place, + program_guard, ) -from .executor import global_scope -from .data_feeder import DataFeeder, BatchedTensorProvider +from .layers.io import ( + __create_unshared_decorated_reader__, + _copy_reader_var_, + monkey_patch_reader_methods, +) +from .multiprocess_utils import _cleanup # noqa: F401 +from .multiprocess_utils import multiprocess_queue_set # noqa: F401 from .multiprocess_utils import ( - multiprocess_queue_set, # noqa: F401 CleanupFuncRegistrar, _cleanup_mmap, - _cleanup, # noqa: F401 _set_SIGCHLD_handler, ) -from .layers.io import ( - monkey_patch_reader_methods, - _copy_reader_var_, - __create_unshared_decorated_reader__, -) from .unique_name import UniqueNameGenerator -from .framework import _get_paddle_place, _get_paddle_place_list -from paddle.base.framework import _set_expected_place -import logging -import warnings - -### Dygraph DataLoader configs ### -import multiprocessing - -import queue # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process QUEUE_GET_TIMEOUT = 60 diff --git a/python/paddle/base/trainer_desc.py b/python/paddle/base/trainer_desc.py index 48cc427ac8e7e..f64530ec02353 100644 --- a/python/paddle/base/trainer_desc.py +++ b/python/paddle/base/trainer_desc.py @@ -13,8 +13,8 @@ # limitations under the License. """Definition of trainers.""" -import sys import os +import sys __all__ = [ 'TrainerDesc', diff --git a/python/paddle/base/trainer_factory.py b/python/paddle/base/trainer_factory.py index cf197fab524e0..75351872d73d6 100644 --- a/python/paddle/base/trainer_factory.py +++ b/python/paddle/base/trainer_factory.py @@ -13,33 +13,35 @@ # limitations under the License. """Definition of TrainerFactory.""" +import logging import threading import time -import logging + import numpy as np + from paddle.base.log_helper import get_logger local_logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) -from .trainer_desc import ( # noqa: F401 - MultiTrainer, - DistMultiTrainer, - PipelineTrainer, - HeterXpuTrainer, - PSGPUTrainer, - HeterPipelineTrainer, -) from .device_worker import ( # noqa: F401 - Hogwild, - DownpourSGD, DownpourLite, - Section, + DownpourSGD, DownpourSGDOPT, HeterSection, + Hogwild, + Section, ) from .framework import Variable +from .trainer_desc import ( # noqa: F401 + DistMultiTrainer, + HeterPipelineTrainer, + HeterXpuTrainer, + MultiTrainer, + PipelineTrainer, + PSGPUTrainer, +) __all__ = ["TrainerFactory", "FetchHandlerMonitor"] diff --git a/python/paddle/base/unique_name.py b/python/paddle/base/unique_name.py index 745675767f150..c240273da890d 100644 --- a/python/paddle/base/unique_name.py +++ b/python/paddle/base/unique_name.py @@ -13,6 +13,7 @@ # limitations under the License. import collections + from .wrapped_decorator import signature_safe_contextmanager __all__ = ['generate', 'switch', 'guard'] @@ -121,7 +122,7 @@ def generate(key): # NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode, # in order to keep name consistency. 
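The hunks in this patch are mechanical re-orderings produced by isort rather than hand edits. A small sketch of the same grouping through isort's Python API (default settings assumed here, not the repository's actual pre-commit configuration):

    import isort

    messy = "from . import core\nimport sys\nimport paddle\n"
    print(isort.code(messy))
    # stdlib first, then the paddle import, then relative imports,
    # matching the ordering applied throughout this patch.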
def generate_with_ignorable_key(key): - from .framework import in_dygraph_mode, _dygraph_tracer + from .framework import _dygraph_tracer, in_dygraph_mode if in_dygraph_mode(): return _dygraph_tracer()._generate_unique_name() diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index 1b3039c5a8cbe..dcc87b74ea658 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools +import warnings + import numpy as np -from . import unique_name -from . import core + import paddle -import warnings -import itertools +from . import core, unique_name MAX_INTEGER = 2**31 - 1 @@ -370,9 +371,7 @@ def _setitem_for_tensor_array(var, item, value): not paddle.in_dynamic_mode() ), "setitem for tensor_array must be called in static graph mode." if isinstance(item, (Variable, int)): - from paddle.jit.dy2static.variable_trans_func import ( - to_static_variable, - ) + from paddle.jit.dy2static.variable_trans_func import to_static_variable from paddle.tensor import array_write item = paddle.cast(to_static_variable(item), dtype='int64') @@ -388,7 +387,8 @@ def _setitem_for_tensor_array(var, item, value): def _setitem_impl_(var, item, value): from paddle.base import core - from .framework import default_main_program, Variable + + from .framework import Variable, default_main_program if var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: return _setitem_for_tensor_array(var, item, value) @@ -572,9 +572,7 @@ def _setitem_impl_(var, item, value): if not paddle.in_dynamic_mode(): # map var to the new output - from paddle.jit.dy2static.program_translator import ( - ProgramTranslator, - ) + from paddle.jit.dy2static.program_translator import ProgramTranslator ProgramTranslator.get_instance()._inplace_map.add( cur_block.program, var.desc.id(), output @@ -601,8 +599,8 @@ def set_value_for_bool_tensor(var, item, value): ) def idx_not_empty(var, item, value): - from .framework import Variable from ..tensor import gather_nd, scatter_nd_add + from .framework import Variable if not isinstance(value, Variable): value = paddle.assign(value).cast(var.dtype) @@ -826,7 +824,7 @@ def _setitem_static(x, indices, values): indices(int|slice|None|Tensor|List|Tuple...): Indices, used to indicate the position of the element to be fetched. values(Tensor|Number|Ndarray): values to be assigned to the x. """ - from .framework import default_main_program, Variable + from .framework import Variable, default_main_program if x.type == paddle.base.core.VarDesc.VarType.LOD_TENSOR_ARRAY: return _setitem_for_tensor_array(x, indices, values) diff --git a/python/paddle/base/wrapped_decorator.py b/python/paddle/base/wrapped_decorator.py index 7e7dbff65611e..1567bb0d4c55c 100644 --- a/python/paddle/base/wrapped_decorator.py +++ b/python/paddle/base/wrapped_decorator.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import decorator import contextlib +import decorator + __all__ = ['wrap_decorator', 'signature_safe_contextmanager'] From eccee58b71d66c041b7c6c2554f1b83976eb4d9b Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Thu, 21 Sep 2023 11:10:14 +0800 Subject: [PATCH 21/39] [AutoParallel] Support new communication library for hogwild_worker, graph_helper, data_norm_op and margin_cross_entropy_op. 
(#57519) --- paddle/fluid/framework/hogwild_worker.cc | 69 +++++++-- paddle/fluid/framework/ir/graph_helper.cc | 17 +- paddle/fluid/operators/data_norm_op.cu | 115 +++++++++++--- .../operators/margin_cross_entropy_op.cu | 145 +++++++++++++----- .../core/distributed/comm_context_manager.cc | 14 ++ .../core/distributed/comm_context_manager.h | 8 + 6 files changed, 292 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cc2c70506a34c..e638fbcb8a54d 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -22,6 +22,13 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/lodtensor_printer.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/flags.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/core/distributed/nccl_comm_context.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); +#endif #if defined PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" @@ -30,7 +37,6 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif -#include "paddle/phi/core/flags.h" PHI_DECLARE_bool(enable_exit_when_partial_worker); @@ -152,16 +158,59 @@ bool HogwildWorker::CheckBatchNum(int flag) { } g_barrier.wait(); float *stat_ptr = sync_stat_.data(); - auto comm = - platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId()); + int nranks = 0; + int ring_id = 0; + platform::NCCLComm *comm = nullptr; + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext *comm_ctx = nullptr; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + nranks = comm_ctx->GetSize(); + } else { + comm = platform::NCCLCommContext::Instance().Get(ring_id, + place_.GetDeviceId()); + nranks = comm->nranks(); + } + auto stream = static_cast(dev_ctx_)->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], - &stat_ptr[2], - 1, - ncclFloat32, - ncclProd, - comm->comm(), - stream)); + if (comm_ctx) { + // comm_ctx->AllReduce only support allreduce on the whole tensor, + // single element is not supported now. 
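The branch above is the pattern this patch repeats in every touched op: with FLAGS_dynamic_static_unified_comm set, the NCCL communicator comes from CommContextManager keyed by ring id, otherwise the legacy NCCLCommContext singleton is used. From Python the switch is just an environment flag, as the tests added later in this series do; a minimal sketch, with everything around the flag being a placeholder:

    import os

    # Opt in to the new communication library before building the program;
    # the flag name is the one exercised by the tests in this series.
    os.environ["FLAGS_dynamic_static_unified_comm"] = "1"

    # ... set up and run the distributed program as usual; the code paths
    # touched here (hogwild worker, data_norm, margin_cross_entropy, ...)
    # will then fetch their communicator via CommContextManager instead of
    # the legacy NCCLCommContext.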
+ PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllReduce(&stat_ptr[flag], + &stat_ptr[2], + 1, + ncclFloat32, + ncclProd, + comm_ctx->GetNcclComm(), + stream)); + + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], + &stat_ptr[2], + 1, + ncclFloat32, + ncclProd, + comm->comm(), + stream)); + } + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output &stat_ptr[2], sizeof(float), diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b322e3f8bce28..5d7054721db53 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -23,10 +23,14 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/fluid/platform/flags.h" PD_DECLARE_bool(convert_all_blocks); @@ -564,9 +568,16 @@ void ReplaceAllReduceOp(const Node &node, all_reduce_op_desc.SetType("c_allreduce_sum"); all_reduce_op_desc.SetInput("X", {all_reduce_var_name}); all_reduce_op_desc.SetOutput("Out", {all_reduce_var_name}); - - int ring_id = platform::NCCLCommContext::Instance().GetRingId( - dynamic_cast(&op_handle)->GetComm()); + int ring_id = -1; + if (FLAGS_dynamic_static_unified_comm) { + ring_id = phi::distributed::CommContextManager::GetInstance().GetRingId( + dynamic_cast(&op_handle)->GetComm()); + VLOG(3) << "New CommContextManager gets ring_id: " << ring_id; + } else { + ring_id = platform::NCCLCommContext::Instance().GetRingId( + dynamic_cast(&op_handle)->GetComm()); + VLOG(3) << "Old NCCLCommContext gets ring_id: " << ring_id; + } all_reduce_op_desc.SetAttr("ring_id", ring_id); all_reduce_op_desc.SetAttr("use_calc_stream", false); all_reduce_op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index a212bc0ee9478..509c067e24e42 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -21,6 +21,10 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle { @@ -213,31 +217,92 @@ class DataNormGradKernel : public framework::OpKernel { if (need_sync_stats) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - reinterpret_cast(d_batch_size), - reinterpret_cast(d_batch_size), - C, - platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())), - ncclSum, - comm->comm(), - stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - reinterpret_cast(d_batch_sum), - reinterpret_cast(d_batch_sum), - C, - platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())), - ncclSum, - comm->comm(), - stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - reinterpret_cast(d_batch_square_sum), - reinterpret_cast(d_batch_square_sum), - C, - platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())), - ncclSum, - comm->comm(), - stream)); + int rid = 0; + platform::NCCLComm *comm = nullptr; + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext *comm_ctx = nullptr; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(rid)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get( + rid, ctx.GetPlace()); + } + + if (comm_ctx) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_size), + reinterpret_cast(d_batch_size), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm_ctx->GetNcclComm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_sum), + reinterpret_cast(d_batch_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm_ctx->GetNcclComm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_square_sum), + reinterpret_cast(d_batch_square_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm_ctx->GetNcclComm(), + stream)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_size), + reinterpret_cast(d_batch_size), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm->comm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_sum), + reinterpret_cast(d_batch_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm->comm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_square_sum), + reinterpret_cast(d_batch_square_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm->comm(), + stream)); + } platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d741bc5b42549..75ef56accb10b 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -30,6 +30,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h" #include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" @@ -39,6 +40,9 @@ namespace cub = hipcub; #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/phi/backends/gpu/gpu_context.h" @@ -87,21 +91,50 @@ void GetClassInterval(const gpuStream_t& stream, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - const auto& comm = - paddle::platform::NCCLCommContext::Instance().Get(rid, place); + paddle::platform::NCCLComm* comm = nullptr; + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext* comm_ctx = nullptr; 
+ if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get(rid, place); + } + // use global calculate stream const auto calcu_stream = static_cast(phi::DeviceContextPool::Instance().Get(place)) ->stream(); - - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - num_classes_per_device_ptr, - num_classes_per_device_ptr, - num_classes_per_device.numel(), - phi::ToNCCLDataType(num_classes_per_device.dtype()), - ncclSum, - comm->comm(), - calcu_stream)); + if (comm_ctx) { + comm_ctx->AllReduce(&num_classes_per_device, + num_classes_per_device, + ncclSum, + calcu_stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + num_classes_per_device_ptr, + num_classes_per_device_ptr, + num_classes_per_device.numel(), + phi::ToNCCLDataType(num_classes_per_device.dtype()), + ncclSum, + comm->comm(), + calcu_stream)); + } } class_interval->Resize({nranks + 1}); @@ -238,7 +271,10 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, const auto& place = dev_ctx.GetPlace(); // old code #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - paddle::platform::NCCLComm* comm; + paddle::platform::NCCLComm* comm = nullptr; + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext* comm_ctx = nullptr; paddle::distributed::ProcessGroup* pg = nullptr; gpuStream_t stream; if (nranks > 1) { @@ -247,8 +283,29 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, // Use ProcessGroup pg = map->get(ring_id); } else { - comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); - + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(ring_id)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + } else { + comm = + paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); + } // use global calculate stream stream = static_cast( phi::DeviceContextPool::Instance().Get(place)) @@ -361,14 +418,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(logits_max_buff, - logits_max_buff, - logits_max.numel(), - phi::ToNCCLDataType(logits_max.dtype()), - ncclMax, - comm->comm(), - stream)); + if (comm_ctx) { + comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(logits_max_buff, + logits_max_buff, + logits_max.numel(), + phi::ToNCCLDataType(logits_max.dtype()), + ncclMax, + comm->comm(), + stream)); + } } } #endif @@ -402,14 +463,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sum_exp_logits_buff, - sum_exp_logits_buff, - sum_exp_logits.numel(), - phi::ToNCCLDataType(sum_exp_logits.dtype()), - ncclSum, - comm->comm(), - stream)); + if (comm_ctx) { + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sum_exp_logits_buff, + sum_exp_logits_buff, + sum_exp_logits.numel(), + phi::ToNCCLDataType(sum_exp_logits.dtype()), + ncclSum, + comm->comm(), + stream)); + } } } #endif @@ -460,14 +525,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(loss_ptr, - loss_ptr, - loss->numel(), - phi::ToNCCLDataType(loss->dtype()), - ncclSum, - comm->comm(), - stream)); + if (comm_ctx) { + comm_ctx->AllReduce(loss, *loss, ncclSum, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(loss_ptr, + loss_ptr, + loss->numel(), + phi::ToNCCLDataType(loss->dtype()), + ncclSum, + comm->comm(), + stream)); + } } } #endif diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index e7931282724ab..342a86313bf3f 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -176,6 +176,20 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const { return id_to_comm_context_.at(unique_comm_key).get(); } +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +int CommContextManager::GetRingId(const ncclComm_t& comm) const { + for (auto iter = id_to_comm_context_.begin(); + iter != id_to_comm_context_.end(); + ++iter) { + if (static_cast(iter->second.get()) + ->GetNcclComm() == comm) { + return std::stoi(iter->first); + } + } + return -1; +} +#endif + bool CommContextManager::Has(const std::string& unique_comm_key) const { return id_to_comm_context_.find(unique_comm_key) != id_to_comm_context_.end(); } diff --git a/paddle/phi/core/distributed/comm_context_manager.h 
b/paddle/phi/core/distributed/comm_context_manager.h index e2cb298a0984b..dcbfaab55af90 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -22,6 +22,10 @@ #include "paddle/phi/core/distributed/comm_context.h" #include "paddle/phi/core/macros.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/backends/gpu/forwards.h" +#endif + namespace phi { namespace distributed { @@ -44,6 +48,10 @@ class CommContextManager { CommContext* Get(const std::string& unique_comm_key) const; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + int GetRingId(const ncclComm_t& comm) const; +#endif + bool Has(const std::string& unique_comm_key) const; static void SetDeviceId(int dev_id); From b1536e78833f22d1833cfb1171c3e6cb364e7a09 Mon Sep 17 00:00:00 2001 From: iLeGend <824040212@qq.com> Date: Thu, 21 Sep 2023 11:11:08 +0800 Subject: [PATCH 22/39] [NewComm] No.9 compatiable upgrade for fused_attention op (#57560) * [NewComm] No.9 compatiable upgrade for fused_attention op * fix error * fix error --- .../operators/fused/fused_attention_utils.h | 50 +++++++++++++++++-- test/legacy_test/test_fused_attention_op.py | 10 ++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h index 26cab895f0dfc..c059a194d0ea5 100644 --- a/paddle/fluid/operators/fused/fused_attention_utils.h +++ b/paddle/fluid/operators/fused/fused_attention_utils.h @@ -18,8 +18,13 @@ #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#include "paddle/fluid/distributed/collective/utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/errors.h" namespace phi { @@ -47,11 +52,46 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT auto place = dev_ctx.GetPlace(); void *recvbuff = dev_ctx.template Alloc(&tensor, tensor.numel() * sizeof(T)); - auto comm = - paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + gpuStream_t stream = nullptr; + paddle::platform::NCCLComm *comm = nullptr; + phi::distributed::NCCLCommContext *comm_ctx = nullptr; + + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + // Use New Communication Library + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has ring_id" << ring_id; + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); + + stream = dev_ctx.stream(); + VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; + } + if (comm_ctx) { + comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } } #else PADDLE_THROW(phi::errors::Unimplemented( diff --git a/test/legacy_test/test_fused_attention_op.py b/test/legacy_test/test_fused_attention_op.py index af734c96d19d8..0e012659f95f6 100644 --- a/test/legacy_test/test_fused_attention_op.py +++ b/test/legacy_test/test_fused_attention_op.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import unittest import numpy as np @@ -31,6 +32,7 @@ class TestFusedAttentionOp(OpTest): def setUp(self): + self.with_new_comm() self.config() self.generate_input_data() @@ -79,6 +81,9 @@ def setUp(self): paddle.set_default_dtype(self.x_type) self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + def with_new_comm(self): + os.environ["FLAGS_dynamic_static_unified_comm"] = "0" + def config(self): self.x_type = np.float32 self.attn_mask_type = np.float64 @@ -350,6 +355,11 @@ def test_fused_attention_op(self): ) +class TestFusedAttentionOpWithNewComm(TestFusedAttentionOp): + def with_new_comm(self): + os.environ["FLAGS_dynamic_static_unified_comm"] = "1" + + class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp): def config(self): super().config() From 6e9143181a8c4ba7253be9690f198cec8326e5a4 Mon Sep 17 00:00:00 2001 From: ooo oo <106524776+ooooo-create@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:12:43 +0800 Subject: [PATCH 23/39] [CodeStyle][task 11] enable Ruff F403 rule in `python/paddle/base/__init__.py` (#57501) --- pyproject.toml | 2 -- python/paddle/base/__init__.py | 47 +++++++++++++++++++++++--- python/paddle/base/core.py | 2 +- python/paddle/base/dygraph/__init__.py | 13 +++++-- python/paddle/base/io.py | 5 ++- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8dd98b65873aa..eca2770cb1b4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,8 +109,6 @@ ignore = [ "UP031", "C408", "UP030", - "F522", - "F403", "C405", "C417", "PLR0402", diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py index 6eec276eee03d..acc6f9f51ae2f 100644 --- a/python/paddle/base/__init__.py +++ b/python/paddle/base/__init__.py @@ -34,17 +34,48 @@ # import all class inside framework into base module from . 
import framework -from .framework import * # noqa: F403 +from .framework import ( + Program, + default_startup_program, + default_main_program, + program_guard, + name_scope, + ipu_shard_guard, + set_ipu_shard, + cuda_places, + cpu_places, + xpu_places, + cuda_pinned_places, + in_dygraph_mode, + in_pir_mode, + in_dynamic_or_pir_mode, + is_compiled_with_cinn, + is_compiled_with_cuda, + is_compiled_with_rocm, + is_compiled_with_xpu, + Variable, + require_version, + device_guard, + set_flags, + get_flags, +) # import all class inside executor into base module from . import executor -from .executor import * # noqa: F403 +from .executor import ( + Executor, + global_scope, + scope_guard, +) from . import data_feed_desc -from .data_feed_desc import * # noqa: F403 +from .data_feed_desc import DataFeedDesc from . import dataset -from .dataset import * # noqa: F403 +from .dataset import ( + DatasetFactory, + InMemoryDataset, +) from . import trainer_desc @@ -72,7 +103,13 @@ from . import unique_name from . import compiler -from .compiler import * # noqa: F403 +from .compiler import ( + CompiledProgram, + ExecutionStrategy, + BuildStrategy, + IpuCompiledProgram, + IpuStrategy, +) from paddle.base.layers.math_op_patch import monkey_patch_variable from .dygraph.base import enable_dygraph, disable_dygraph from .dygraph.tensor_patch_methods import monkey_patch_tensor diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index df90a6ace8582..285a9f1b1a61b 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -278,7 +278,7 @@ def to_list(s): # assign tensor alias libpaddle.LoDTensor = libpaddle.Tensor - from .libpaddle import * + from .libpaddle import * # noqa: F403 from .libpaddle import ( # noqa: F401 __doc__, __file__, diff --git a/python/paddle/base/dygraph/__init__.py b/python/paddle/base/dygraph/__init__.py index 6355ca337b9f8..2ac4df711681c 100644 --- a/python/paddle/base/dygraph/__init__.py +++ b/python/paddle/base/dygraph/__init__.py @@ -13,10 +13,19 @@ # limitations under the License. from . import base -from .base import * # noqa: F403 +from .base import ( + no_grad, + no_grad_, + grad, + guard, + enable_dygraph, + disable_dygraph, + enabled, + to_variable, +) from . import tracer -from .tracer import * # noqa: F403 +from .tracer import Tracer __all__ = [] diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py index a2c7d02ede349..55f5c072f4e27 100644 --- a/python/paddle/base/io.py +++ b/python/paddle/base/io.py @@ -17,7 +17,10 @@ from paddle.base.log_helper import get_logger from . 
import reader -from .reader import * +from .reader import ( # noqa: F401 + PyReader, + DataLoader, +) __all__ = reader.__all__ From 9650cf907fe3d574215e2949785075478096b8d9 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 21 Sep 2023 11:20:02 +0800 Subject: [PATCH 24/39] [PIR]Rename flags (#57496) * rename flag * fix py3 bugs * modify demo code --- paddle/fluid/framework/feed_fetch_method.cc | 2 +- .../new_executor/standalone_executor.cc | 8 +++--- .../tensor_operants_gen.py | 24 ++++++++--------- paddle/phi/core/flags.cc | 6 ++--- python/paddle/base/framework.py | 8 +++--- python/paddle/pir_utils.py | 26 +++++++++---------- test/ir/new_ir/CMakeLists.txt | 2 +- test/ir/new_ir/test_ir_backward.py | 6 ++--- test/prim/new_ir_prim/CMakeLists.txt | 2 +- 9 files changed, 41 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 1f2f645f97dc8..7a62b5563f30a 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "glog/logging.h" PHI_DECLARE_bool(enable_new_ir_in_executor); -PHI_DECLARE_bool(enable_new_ir_api); +PHI_DECLARE_bool(enable_pir_api); namespace phi { class DenseTensor; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 99b42bee8b73f..f06bee2c884e3 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -28,7 +28,7 @@ #include "paddle/pir/pass/pass_manager.h" PHI_DECLARE_bool(enable_new_ir_in_executor); -PHI_DECLARE_bool(enable_new_ir_api); +PHI_DECLARE_bool(enable_pir_api); PHI_DECLARE_bool(new_ir_apply_inplace_pass); namespace paddle { @@ -55,7 +55,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, const std::string& job_type = job->Type(); std::shared_ptr program = nullptr; std::shared_ptr<::pir::Program> ir_program = nullptr; - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { ir_program = plan_.IrProgram(job_type); } else { program = std::make_shared(*(plan_.Program(job_type))); @@ -69,7 +69,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, micro_batch_id, micro_batch_num)); - if (micro_batch_num > 1 && !FLAGS_enable_new_ir_api) { + if (micro_batch_num > 1 && !FLAGS_enable_pir_api) { SetColAttrForFeedFetchOps(program, micro_batch_num, micro_batch_id); } @@ -80,7 +80,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, // TODO(phlrain) we only support cpu for now if (FLAGS_enable_new_ir_in_executor) { std::shared_ptr<::pir::Program> base_program = ir_program; - if (!FLAGS_enable_new_ir_api) { + if (!FLAGS_enable_pir_api) { VLOG(6) << "begin to translate" << std::endl; base_program = paddle::TranslateLegacyProgramToProgram(*program); } diff --git a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py index 783066f0fc906..0bc050f00d4a0 100644 --- a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py +++ b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py @@ -214,7 +214,7 @@ class StaticTensorOperants : public TensorOperantsBase { #include "paddle/fluid/primitive/backend/backend.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" -PHI_DECLARE_bool(enable_new_ir_api); +PHI_DECLARE_bool(enable_pir_api); """ @@ -227,7 +227,7 @@ class 
StaticTensorOperants : public TensorOperantsBase { using LazyTensor = paddle::primitive::LazyTensor; Tensor StaticTensorOperants::add(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::add(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::add(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -235,7 +235,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::subtract(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::subtract(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::subtract(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -243,7 +243,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::multiply(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::scale(x, y, 0.0f, true); } else { return paddle::prim::scale(x, y, 0.0f, true); @@ -251,7 +251,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::divide(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::divide(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::divide(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -259,7 +259,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::add(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::add(paddle::primitive::backend::full(y.shape(), x, y.dtype(), y.place()), y); } else { return paddle::prim::add(paddle::prim::full(y.shape(), x, y.dtype(), y.place()), y); @@ -268,7 +268,7 @@ class StaticTensorOperants : public TensorOperantsBase { Tensor StaticTensorOperants::subtract(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::subtract(paddle::primitive::backend::full(y.shape(), x, y.dtype(), y.place()), y); } else { return paddle::prim::subtract(paddle::prim::full(y.shape(), x, y.dtype(), y.place()), y); @@ -276,7 +276,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::multiply(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::scale(y, x, 0.0f, true); } else { return paddle::prim::scale(y, x, 0.0f, true); @@ -284,7 +284,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::divide(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::divide(paddle::primitive::backend::full(y.shape(), x, y.dtype(), y.place()), y); } else { return paddle::prim::divide(paddle::prim::full(y.shape(), x, y.dtype(), y.place()), y); @@ -292,7 +292,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::pow(const Tensor& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::elementwise_pow(x, y); } else { return 
paddle::prim::elementwise_pow(x, y); @@ -300,7 +300,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::pow(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::elementwise_pow(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::elementwise_pow(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -393,7 +393,7 @@ def gene_static_tensor_func_call(self): ) static_func_parameters = self.get_func_args() - static_tensor_func_call = f"""if (FLAGS_enable_new_ir_api) {{ + static_tensor_func_call = f"""if (FLAGS_enable_pir_api) {{ return {backend_static_func_name}({static_func_parameters}); }} else {{ return {prim_static_func_name}({static_func_parameters}); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index e02868d5e2c1b..ce03cdb3f4d69 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1278,15 +1278,13 @@ PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor, /** * Using new IR API in Python - * Name: enable_new_ir_api + * Name: enable_pir_api * Since Version: 2.6.0 * Value Range: bool, default=false * Example: * Note: If Ture, New IR API will be used in Python */ -PHI_DEFINE_EXPORTED_bool(enable_new_ir_api, - false, - "Enable new IR API in Python"); +PHI_DEFINE_EXPORTED_bool(enable_pir_api, false, "Enable new IR API in Python"); /** * Using new IR in executor FLAG diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 0440af415a7d0..d3f17ea6435e9 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -162,8 +162,8 @@ def __init__(self): self._in_to_static_mode_ = False self._functional_dygraph_context_manager = None self._dygraph_tracer_ = _dygraph_tracer_ - self._use_pir_api_ = get_flags("FLAGS_enable_new_ir_api")[ - 'FLAGS_enable_new_ir_api' + self._use_pir_api_ = get_flags("FLAGS_enable_pir_api")[ + 'FLAGS_enable_pir_api' ] def __str__(self): @@ -340,8 +340,8 @@ def in_dynamic_or_pir_mode(): >>> print(paddle.framework.in_dynamic_or_pir_mode()) False - >>> paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True}) - >>> print(paddle.framework.in_dynamic_or_pir_mode()) + >>> with paddle.pir_utils.IrGuard(): + ... 
print(paddle.framework.in_dynamic_or_pir_mode()) True """ diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py index a62fe6f61a924..9af825cfcd88b 100644 --- a/python/paddle/pir_utils.py +++ b/python/paddle/pir_utils.py @@ -19,11 +19,11 @@ class IrGuard: def __init__(self): self.in_dygraph_outside = False - old_flag = paddle.base.framework.get_flags("FLAGS_enable_new_ir_api") - paddle.base.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + old_flag = paddle.base.framework.get_flags("FLAGS_enable_pir_api") + paddle.base.framework.set_flags({"FLAGS_enable_pir_api": False}) paddle.base.framework.global_var._use_pir_api_ = False - if not paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[ - "FLAGS_enable_new_ir_api" + if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[ + "FLAGS_enable_pir_api" ]: self.old_Program = paddle.static.Program self.old_program_guard = paddle.base.program_guard @@ -34,31 +34,31 @@ def __init__(self): else: raise RuntimeError( "IrGuard only init when paddle.framework.in_pir_mode(): is false, \ - please set FLAGS_enable_new_ir_api = false" + please set FLAGS_enable_pir_api = false" ) paddle.base.framework.set_flags(old_flag) paddle.base.framework.global_var._use_pir_api_ = old_flag[ - "FLAGS_enable_new_ir_api" + "FLAGS_enable_pir_api" ] def __enter__(self): self.in_dygraph_outside = paddle.base.framework.in_dygraph_mode() if self.in_dygraph_outside: paddle.enable_static() - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": True}) paddle.base.framework.global_var._use_pir_api_ = True self._switch_to_pir() def __exit__(self, exc_type, exc_val, exc_tb): - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) paddle.base.framework.global_var._use_pir_api_ = False self._switch_to_old_ir() if self.in_dygraph_outside: paddle.disable_static() def _switch_to_pir(self): - if paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[ - "FLAGS_enable_new_ir_api" + if paddle.base.framework.get_flags("FLAGS_enable_pir_api")[ + "FLAGS_enable_pir_api" ]: paddle.framework.set_flags( {"FLAGS_enable_new_ir_in_executor": True} @@ -76,8 +76,8 @@ def _switch_to_pir(self): ) def _switch_to_old_ir(self): - if not paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[ - "FLAGS_enable_new_ir_api" + if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[ + "FLAGS_enable_pir_api" ]: paddle.framework.set_flags( {"FLAGS_enable_new_ir_in_executor": False} @@ -93,5 +93,5 @@ def _switch_to_old_ir(self): else: raise RuntimeError( "IrGuard._switch_to_old_ir only work when paddle.framework.in_pir_mode() is false, \ - please set FLAGS_enable_new_ir_api = false" + please set FLAGS_enable_pir_api = false" ) diff --git a/test/ir/new_ir/CMakeLists.txt b/test/ir/new_ir/CMakeLists.txt index e213eaba4c53c..75587db97c088 100644 --- a/test/ir/new_ir/CMakeLists.txt +++ b/test/ir/new_ir/CMakeLists.txt @@ -15,7 +15,7 @@ foreach(target ${TEST_INTERP_CASES}) endforeach() foreach(target ${TEST_IR_SYSTEM_CASES}) - py_test_modules(${target} MODULES ${target} ENVS FLAGS_enable_new_ir_api=true) + py_test_modules(${target} MODULES ${target} ENVS FLAGS_enable_pir_api=true) endforeach() set_tests_properties(test_pd_inplace_pass PROPERTIES TIMEOUT 60) diff --git a/test/ir/new_ir/test_ir_backward.py b/test/ir/new_ir/test_ir_backward.py index acffcf4ee28d6..c604290d34cad 100644 --- a/test/ir/new_ir/test_ir_backward.py +++ 
b/test/ir/new_ir/test_ir_backward.py @@ -38,7 +38,7 @@ def get_ir_program_0(): class TesBackward_1(unittest.TestCase): def tearDown(self) -> None: - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) def test_grad(self): newir_program = get_ir_program_0() @@ -155,7 +155,7 @@ def get_ir_program_1(): class TesBackward_2(unittest.TestCase): def tearDown(self) -> None: - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) def test_add_n(self): newir_program = get_ir_program_1() @@ -231,7 +231,7 @@ def get_ir_program_2(): class TestBackward_3(unittest.TestCase): def tearDown(self) -> None: - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) def test_basic_network(self): newir_program = get_ir_program_2() diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt index 1b37b432d2052..a36e905e0c9f4 100644 --- a/test/prim/new_ir_prim/CMakeLists.txt +++ b/test/prim/new_ir_prim/CMakeLists.txt @@ -3,7 +3,7 @@ set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES}) py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1 - FLAGS_enable_new_ir_api=true) + FLAGS_enable_pir_api=true) endforeach() file( From c882037892eaa80250a2e06b3f032326a1629661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:24:42 +0800 Subject: [PATCH 25/39] remove SetTensorDynamicRange in softmax (#57538) --- paddle/fluid/inference/tensorrt/convert/softmax_op.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 8e101075768e0..9aefd7fb28b39 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -84,8 +84,6 @@ class SoftMaxOpConverter : public OpConverter { } layer->setAxes(1 << axes); - // The trt will not run int for softmax. 
- engine_->SetTensorDynamicRange(input1, 1.0); auto output_name = op_desc.Output("Out")[0]; // support 0 or 1 dims input From 5be4e463cde24dec8cd0cb60833224022f24f90e Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 21 Sep 2023 12:13:30 +0800 Subject: [PATCH 26/39] [PIR]Fix arange op and assign op bug (#57494) * fix arange kernel selected bug * revert some code * fix compile bug --- .../fluid/ir_adaptor/translator/op_translator.cc | 4 ++-- .../fluid/pir/transforms/pd_op_to_kernel_pass.cc | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index e3eeaab4f7d48..b11101de616b8 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -754,8 +754,8 @@ struct AssignValueOpTranscriber : public OpTranscriber { attribute_translator(attr_info_maps.at("dtype").type_name, legacy_attr); attribute_map["dtype"] = attr_dtype; - pir::Attribute attr_place = - dialect::PlaceAttribute::get(ctx, phi::CPUPlace()); + pir::Attribute attr_place = dialect::PlaceAttribute::get( + ctx, phi::Place(phi::AllocationType::UNDEFINED)); attribute_map["place"] = attr_place; int dtype = paddle::get(op_desc.GetAttr("dtype")); diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index d77161992c311..79e6bbe71230e 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -401,7 +401,8 @@ phi::DataType GetKernelDataTypeByYamlInfo( phi::Backend GetKernelBackendByYamlInfo( const pir::Operation* op, const std::unordered_map& map_value_pair, - const dialect::OpYamlInfoParser* op_info_parser) { + const dialect::OpYamlInfoParser* op_info_parser, + const phi::Place& place) { auto& attr_map = op->attributes(); auto& backend_info = op_info_parser->OpRuntimeInfo().kernel_key_backend; phi::Backend kernel_backend = phi::Backend::UNDEFINED; @@ -465,6 +466,10 @@ phi::Backend GetKernelBackendByYamlInfo( } } + if (backend_info.size() > 0 && kernel_backend == phi::Backend::UNDEFINED) { + kernel_backend = paddle::experimental::ParseBackend(place); + } + return kernel_backend; } @@ -518,7 +523,7 @@ phi::KernelKey GetKernelKey( kernel_data_type = GetKernelDataTypeByYamlInfo(op, map_value_pair, op_info_parser); kernel_backend = - GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser); + GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser, place); // parse all the input tensor if (tensor_input_number == 0 || op->isa()) { @@ -550,7 +555,9 @@ phi::KernelKey GetKernelKey( } } - if (op->num_operands() > 0) { + if ((kernel_backend == phi::Backend::UNDEFINED || + kernel_data_type == phi::DataType::UNDEFINED) && + op->num_operands() > 0) { paddle::experimental::detail::KernelKeyParser kernel_key_parser; for (size_t i = 0; i < op->num_operands(); ++i) { @@ -724,7 +731,7 @@ void HandleForSpecialOp( pir::IrContext* ctx, std::unordered_map* map_op_pair, std::unordered_map* map_value_pair) { - if (op_item->name() == "pd_op.if") { + if (op_item->isa()) { HandleForIfOp(place, op_item, block, ctx, map_op_pair, map_value_pair); return; } From b13dcb85918bb467ebe557093e22bc2482479c93 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:13:23 +0800 Subject: [PATCH 27/39] support pir jit prim (#57561) --- 
.../jit/dy2static/newir_partial_program.py | 32 +++++++--- .../jit/dy2static/program_translator.py | 60 +++++++++++++++++-- test/prim/new_ir_prim/CMakeLists.txt | 2 +- test/prim/new_ir_prim/test_prim_jit.py | 58 ++++++++++++++++++ 4 files changed, 138 insertions(+), 14 deletions(-) create mode 100644 test/prim/new_ir_prim/test_prim_jit.py diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py index 83cb5eed92534..c0da8f35c822a 100644 --- a/python/paddle/jit/dy2static/newir_partial_program.py +++ b/python/paddle/jit/dy2static/newir_partial_program.py @@ -642,11 +642,15 @@ def _insert_aggregation_ops_for_var(target_program, var): @switch_to_static_graph def _append_backward_desc(self, main_program): program = main_program - # if self._hooker: - # program = self._hooker.before_append_backward(program) + targets = list( filter(lambda x: isinstance(x, OpResult), self._outputs.tolist()) ) + if self._hooker: + program, targets = self._hooker.before_append_backward( + program, targets + ) + self._outputs = NestSequence(targets, need_check=True) inputs = list( filter(lambda x: isinstance(x, OpResult), self._inputs.tolist()) ) @@ -676,11 +680,15 @@ def _append_backward_desc(self, main_program): forward_outputs_grads.append(opres) not_stop_gradient_num += 1 - # TODO: add later. - # if self._hooker: - # program, start_idx = self._hooker.after_append_backward( - # program, start_idx - # ) + if self._hooker: + ( + program, + forward_end_idx, + targets, + ) = self._hooker.after_append_backward( + program, targets, forward_end_idx + ) + self._outputs = NestSequence(targets, need_check=True) # TODO: add later # self.prepare_gradient_aggregation( @@ -692,6 +700,8 @@ def _append_backward_desc(self, main_program): ) hash_id = paddle.utils._hash_with_id(program, self) extra_info = self._program_extra_info.get(hash_id, {}) + extra_info['forward_inputs'] = inputs + extra_info['forward_outputs'] = targets extra_info['forward_end_op_idx'] = forward_end_idx extra_info['forward_inputs_grads'] = list( map(mapping_op_result, grad_info_map) @@ -791,8 +801,10 @@ def _get_forward_backward_program_form( forward_inputs_grads = self.get_program_extra(whole_program)[ 'forward_inputs_grads' ] - forward_inputs = self._inputs.tolist() - forward_outputs = self._outputs.tolist() + forward_inputs = self.get_program_extra(whole_program)['forward_inputs'] + forward_outputs = self.get_program_extra(whole_program)[ + 'forward_outputs' + ] forward_outputs_grads = self.get_program_extra(whole_program)[ 'forward_outputs_grads' ] @@ -947,9 +959,11 @@ def create_out(var_id): tensor_type = paddle.dtype(8) # SELECT ROW TENSOR # TODO(xiongkun): more elegent way to do it. 
+ ir_dtype_2_tensor_dtype = { 10: paddle.dtype(5), } + out = core.eager.Tensor( ir_dtype_2_tensor_dtype[int(var.dtype)], var.shape, diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 592665596cfef..8eb118852a764 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -21,6 +21,7 @@ import weakref import paddle.ir.core as ir_static +from paddle import decomposition from paddle.base import core, framework from paddle.base.data_feeder import check_type from paddle.base.dygraph.base import ( @@ -42,6 +43,9 @@ get_buffers, get_parameters, ) +from .newir_partial_program import ( + PartialProgramLayerHook as PirPartialProgramLayerHook, +) from .origin_info import ( attach_origin_info, create_and_update_origin_info_map, @@ -1473,6 +1477,46 @@ def __setattr__(self, key, value): return super().__setattr__(key, value) +class PirPrimHooker(PirPartialProgramLayerHook): + def __init__(self, original_program, backend): + self.backend = backend + self.custom_vjps = set() + with backend_guard(self.backend): + if core._is_all_prim_enabled(): + self.custom_vjps = { + op.name() + for op in original_program.global_block().ops + if core.has_custom_vjp(op) + } + + def before_append_backward(self, forward_program, src_vars): + with backend_guard(self.backend): + if core._is_fwd_prim_enabled(): + dst_vars = decomposition.decompose( + forward_program, src_vars, blacklist=self.custom_vjps + ) + return forward_program, dst_vars + + def after_append_backward(self, whole_program, src_vars, forward_end_idx): + with backend_guard(self.backend): + backward_length = ( + len(whole_program.global_block().ops) - forward_end_idx + ) + if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0: + # only process backward part of block + dst_vars = decomposition.decompose(whole_program, src_vars) + new_start_index = ( + len(whole_program.global_block().ops) - backward_length + ) + return whole_program, new_start_index, dst_vars + + def after_infer(self, infer_program, src_vars): + with backend_guard(self.backend): + if core._is_fwd_prim_enabled(): + dst_vars = decomposition.decompose(infer_program, src_vars) + return infer_program, dst_vars + + class ProgramCache: """ Wrapper class for the program functions defined by dygraph function. 
@@ -1530,7 +1574,10 @@ def _build_once(self, cache_key): raise backend = cache_key.kwargs['backend'] - if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend): + if ( + prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend) + and not use_pir_api() + ): for var in concrete_program.main_program.list_vars(): if var.type not in NO_SHAPE_VAR_TYPE and -1 in var.shape: warnings.warn( @@ -1553,9 +1600,14 @@ def _build_once(self, cache_key): ) with backend_guard(backend): if core._is_fwd_prim_enabled(): - partial_program.set_hooker( - PrimHooker(concrete_program.main_program, backend) - ) + if use_pir_api(): + partial_program.set_hooker( + PirPrimHooker(concrete_program.main_program, backend) + ) + else: + partial_program.set_hooker( + PrimHooker(concrete_program.main_program, backend) + ) return concrete_program, partial_program def __getitem__(self, item): diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt index a36e905e0c9f4..e1cbcd60f8ee4 100644 --- a/test/prim/new_ir_prim/CMakeLists.txt +++ b/test/prim/new_ir_prim/CMakeLists.txt @@ -1,5 +1,5 @@ set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet - test_prim_custom_vjp) + test_prim_custom_vjp test_prim_jit) foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES}) py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1 diff --git a/test/prim/new_ir_prim/test_prim_jit.py b/test/prim/new_ir_prim/test_prim_jit.py new file mode 100644 index 0000000000000..72958eff9a1d7 --- /dev/null +++ b/test/prim/new_ir_prim/test_prim_jit.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.framework import core + + +class TestDy2staticNewIR(unittest.TestCase): + def test_basic_network_backward(self): + core._set_prim_all_enabled(True) + + def func(x): + x1 = paddle.mean(x) + out = paddle.nn.functional.gelu(x1, False) + return out + + # ==== dygraph computation ==== + static_func = paddle.jit.to_static(func) + x = paddle.randn((8, 16, 64)) + x.stop_gradient = False + ref_out = func(x) * 2 + ref_out.backward() + ref_grad = x.grad.numpy() + x.clear_gradient() + + # ==== to static compuatation ==== + out = static_func(x) + actual_out = out * 2 + actual_out.backward() + actual_grad = x.grad + core._set_prim_all_enabled(True) + + np.testing.assert_allclose( + ref_out, actual_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + np.testing.assert_allclose( + ref_grad, actual_grad.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == "__main__": + unittest.main() From e24119c3e6ac49486f83fcdafad0ae6844a7633a Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:14:02 +0800 Subject: [PATCH 28/39] [Fix] fix multi device compile error (#57530) Add device_id directory when dumping information. Reduce threads during multi card compilation. 
--- paddle/cinn/backends/compiler.cc | 41 ++++++++++++++----- paddle/cinn/backends/compiler.h | 19 ++++++--- paddle/cinn/hlir/framework/graph.cc | 10 ++++- .../cinn/hlir/framework/parallel_compiler.cc | 38 ++++++++++++----- .../cinn/hlir/framework/parallel_compiler.h | 11 ++++- 5 files changed, 88 insertions(+), 31 deletions(-) diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc index 448bef2392a9f..a913a3de86692 100644 --- a/paddle/cinn/backends/compiler.cc +++ b/paddle/cinn/backends/compiler.cc @@ -45,7 +45,7 @@ using CompilationStatus = hlir::framework::CompilationStatus; static constexpr int DebugLogMaxLen = 30000; void CompilationInfoDumper::DumpLoweredFuncByGroupIndex( - const ir::LoweredFunc& lowered_func, const int gidx) { + const ir::LoweredFunc& lowered_func, const int gidx, const int device_id) { if (FLAGS_cinn_dump_group_lowered_func.empty() || lowered_func.get() == nullptr) { return; @@ -54,34 +54,42 @@ void CompilationInfoDumper::DumpLoweredFuncByGroupIndex( content << lowered_func; Dump(FLAGS_cinn_dump_group_lowered_func, gidx, + device_id, "lowered_function.txt", content.str()); } void CompilationInfoDumper::DumpSourceCodeByGroupIndex( - const std::string& source_code, const int gidx) { + const std::string& source_code, const int gidx, const int device_id) { if (FLAGS_cinn_dump_group_source_code.empty()) { return; } - Dump(FLAGS_cinn_dump_group_source_code, gidx, "source_code.cu", source_code); + Dump(FLAGS_cinn_dump_group_source_code, + gidx, + device_id, + "source_code.cu", + source_code); } void CompilationInfoDumper::DumpPtxCodeByGroupIndex( - const std::string& source_ptx, const int gidx) { + const std::string& source_ptx, const int gidx, const int device_id) { if (FLAGS_cinn_dump_group_ptx.empty()) { return; } - Dump(FLAGS_cinn_dump_group_ptx, gidx, "source_ptx.ptx", source_ptx); + Dump( + FLAGS_cinn_dump_group_ptx, gidx, device_id, "source_ptx.ptx", source_ptx); } void CompilationInfoDumper::DumpInstructionByGroupIndex( const std::unique_ptr& instr, - const int gidx) { + const int gidx, + const int device_id) { if (FLAGS_cinn_dump_group_instruction.empty() || instr.get() == nullptr) { return; } Dump(FLAGS_cinn_dump_group_instruction, gidx, + device_id, "instruction.txt", instr->DumpInstruction()); } @@ -99,6 +107,7 @@ void CompilationInfoDumper::DumpLoweredFunc() { } Dump(FLAGS_cinn_dump_group_lowered_func, idx, + device_id_, "lowered_function.txt", content.str()); } @@ -115,7 +124,11 @@ void CompilationInfoDumper::DumpSourceCode() { } else { dump_str = "[No source code generated]\n\n" + info_.Message(idx); } - Dump(FLAGS_cinn_dump_group_source_code, idx, "source_code.cu", dump_str); + Dump(FLAGS_cinn_dump_group_source_code, + idx, + device_id_, + "source_code.cu", + dump_str); } } @@ -130,7 +143,8 @@ void CompilationInfoDumper::DumpPtxCode() { } else { dump_str = "[No source ptxs generated]\n\n" + info_.Message(idx); } - Dump(FLAGS_cinn_dump_group_ptx, idx, "source_ptx.ptx", dump_str); + Dump( + FLAGS_cinn_dump_group_ptx, idx, device_id_, "source_ptx.ptx", dump_str); } } @@ -145,16 +159,21 @@ void CompilationInfoDumper::DumpInstruction() { } else { dump_str = "[No instruction generated]\n\n" + info_.Message(idx); } - Dump(FLAGS_cinn_dump_group_instruction, idx, "instruction.txt", dump_str); + Dump(FLAGS_cinn_dump_group_instruction, + idx, + device_id_, + "instruction.txt", + dump_str); } } void CompilationInfoDumper::Dump(const std::string& base_path, const int idx, + const int device_id, const std::string& file_name, const std::string& 
content) { - auto dump_path = - utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx); + auto dump_path = utils::StringFormat( + "%s/device_%d/fusion_group_%d", base_path.c_str(), device_id, idx); if (!hlir::framework::MakeDirectory( dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { LOG(WARNING) << "Failed to make directory: \"" << dump_path diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h index 8b09573b522e4..a468193d4d85a 100644 --- a/paddle/cinn/backends/compiler.h +++ b/paddle/cinn/backends/compiler.h @@ -43,8 +43,9 @@ namespace backends { */ class CompilationInfoDumper { public: - explicit CompilationInfoDumper(const hlir::framework::CompilationResult& info) - : info_(info) { + explicit CompilationInfoDumper(const hlir::framework::CompilationResult& info, + const int device_id) + : info_(info), device_id_(device_id) { DumpLoweredFunc(); DumpSourceCode(); DumpPtxCode(); @@ -52,14 +53,18 @@ class CompilationInfoDumper { } static void DumpLoweredFuncByGroupIndex(const ir::LoweredFunc& lowered_func, - const int gidx); + const int gidx, + const int device_id); static void DumpSourceCodeByGroupIndex(const std::string& source_code, - const int gidx); + const int gidx, + const int device_id); static void DumpPtxCodeByGroupIndex(const std::string& source_ptx, - const int gidx); + const int gidx, + const int device_id); static void DumpInstructionByGroupIndex( const std::unique_ptr& instr, - const int gidx); + const int gidx, + const int device_id); private: void DumpLoweredFunc(); @@ -68,10 +73,12 @@ class CompilationInfoDumper { void DumpInstruction(); static void Dump(const std::string& base_path, const int idx, + const int device_id, const std::string& file_name, const std::string& content); const hlir::framework::CompilationResult& info_; + const int device_id_; }; class SourceCodePrint { diff --git a/paddle/cinn/hlir/framework/graph.cc b/paddle/cinn/hlir/framework/graph.cc index 3f81b8b91906d..4c8d166e4cc4a 100644 --- a/paddle/cinn/hlir/framework/graph.cc +++ b/paddle/cinn/hlir/framework/graph.cc @@ -18,6 +18,9 @@ #include #include "paddle/cinn/hlir/framework/visualize_helper.h" +#ifdef CINN_WITH_CUDA +#include "paddle/cinn/runtime/cuda/cuda_util.h" +#endif #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" @@ -315,9 +318,14 @@ void Graph::VisualizeGroupedGraph( const auto& group_dots = VisualizeGroups(groups, fetch_var_ids); for (int idx = 0; idx < groups.size(); ++idx) { // Create fusion_group_x folder + int device_id = 0; +#ifdef CINN_WITH_CUDA + cudaGetDevice(&device_id); +#endif auto group_path = - utils::StringFormat("%s/fusion_group_%d", + utils::StringFormat("%s/device_%d/fusion_group_%d", FLAGS_cinn_fusion_groups_graphviz_dir.c_str(), + device_id, idx); if (!MakeDirectory(group_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { diff --git a/paddle/cinn/hlir/framework/parallel_compiler.cc b/paddle/cinn/hlir/framework/parallel_compiler.cc index bae6048477623..3a15f7c42bef0 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.cc +++ b/paddle/cinn/hlir/framework/parallel_compiler.cc @@ -80,8 +80,13 @@ void ParallelCompiler::SplitTask() { CHECK(context_->lowered_funcs.empty() || context_->graph->fusion_groups.size() == context_->lowered_funcs.size()); - for (int i = 0; i < context_->graph->fusion_groups.size(); ++i) { - tasks_.emplace_back(i, this, context_); + int device_id = 0; +#ifdef CINN_WITH_CUDA + CUDA_CALL(cudaGetDevice(&device_id)); +#endif + for (int group_id = 0; group_id < 
context_->graph->fusion_groups.size(); + ++group_id) { + tasks_.emplace_back(device_id, group_id, this, context_); } } @@ -126,11 +131,20 @@ void ParallelCompiler::RunTask() { } void ParallelCompiler::LaunchTask() { + int device_id = 0; +#ifdef CINN_WITH_CUDA + CUDA_CALL(cudaGetDevice(&device_id)); +#endif + int num_threads = FLAGS_cinn_parallel_compile_thread; +#if defined(PADDLE_WITH_DISTRIBUTE) + if (device_id > 0) { + num_threads = 1; + } +#endif // multi thread compilation std::vector threads; - VLOG(4) << "Compile with " << FLAGS_cinn_parallel_compile_thread - << " threads"; - for (int idx = 1; idx < FLAGS_cinn_parallel_compile_thread; ++idx) { + VLOG(4) << "Compile with " << num_threads << " threads"; + for (int idx = 1; idx < num_threads; ++idx) { threads.emplace_back(&ParallelCompiler::RunTask, this); } @@ -208,7 +222,7 @@ void ParallelCompiler::Task::Lowering() { pcompiler->result_.SetLoweredFuncs(group_id, lowered_funcs); } backends::CompilationInfoDumper::DumpLoweredFuncByGroupIndex( - pcompiler->result_.LoweredFuncs(group_id).front(), group_id); + pcompiler->result_.LoweredFuncs(group_id).front(), group_id, device_id); } void ParallelCompiler::Task::CodegenAndJit() { @@ -239,8 +253,8 @@ void ParallelCompiler::Task::CodegenAndJit() { } CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n" << dmodule; - backends::CompilationInfoDumper::DumpSourceCodeByGroupIndex(cuda_c, - group_id); + backends::CompilationInfoDumper::DumpSourceCodeByGroupIndex( + cuda_c, group_id, device_id); pcompiler->result_.SetSourceCode(group_id, cuda_c); cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c); @@ -249,7 +263,8 @@ void ParallelCompiler::Task::CodegenAndJit() { backends::nvrtc::Compiler compiler; auto ptx = compiler(cuda_c); CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; - backends::CompilationInfoDumper::DumpPtxCodeByGroupIndex(ptx, group_id); + backends::CompilationInfoDumper::DumpPtxCodeByGroupIndex( + ptx, group_id, device_id); pcompiler->result_.SetSourcePtx(group_id, ptx); // load cumodule cumodule = std::make_unique(ptx, @@ -260,7 +275,7 @@ void ParallelCompiler::Task::CodegenAndJit() { // register kernel backends::RuntimeSymbols symbols; for (auto& fn : dmodule.functions()) { - auto cufunc = cumodule->GetFunction(0, fn->name); + auto cufunc = cumodule->GetFunction(device_id, fn->name); CHECK(cufunc); symbols.RegisterVar(fn->name + "_ptr_", reinterpret_cast(cufunc)); } @@ -291,7 +306,8 @@ void ParallelCompiler::Task::BuildInstruction() { instr->SetLoweredFunc(reinterpret_cast(fn_ptr), group->GetFuncName()); instr->Finalize(); - backends::CompilationInfoDumper::DumpInstructionByGroupIndex(instr, group_id); + backends::CompilationInfoDumper::DumpInstructionByGroupIndex( + instr, group_id, device_id); pcompiler->result_.SetInstruction(group_id, std::move(instr)); } diff --git a/paddle/cinn/hlir/framework/parallel_compiler.h b/paddle/cinn/hlir/framework/parallel_compiler.h index e78ee99404867..df0d39ebe2afc 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.h +++ b/paddle/cinn/hlir/framework/parallel_compiler.h @@ -36,8 +36,14 @@ namespace framework { class ParallelCompiler { public: struct Task { - Task(int group_id, ParallelCompiler* compiler, CompilationContext* context) - : group_id(group_id), pcompiler(compiler), context(context) {} + Task(int device_id, + int group_id, + ParallelCompiler* compiler, + CompilationContext* context) + : device_id(device_id), + group_id(group_id), + pcompiler(compiler), + context(context) {} 
void Lowering(); void CodegenAndJit(); void BuildInstruction(); @@ -48,6 +54,7 @@ class ParallelCompiler { CompilationStatus status = CompilationStatus::SUCCESS; std::string message; + const int device_id; int group_id; std::unique_ptr engine; From be463d319530ec7ae1b5d4d5ecb7f1d3d0dbb445 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:17:21 +0800 Subject: [PATCH 29/39] [PIR]add all add , mul newir optest (#57533) * add all add mul newir optest * add sub optest * delete sub --- test/legacy_test/test_elementwise_add_op.py | 21 +++-------- test/legacy_test/test_elementwise_mul_op.py | 42 ++++++--------------- 2 files changed, 17 insertions(+), 46 deletions(-) diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index 279d1997f160e..8bacfc9a45cfd 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -212,7 +212,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_new_ir=True) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -738,27 +738,16 @@ def init_input_output(self): self.out = self.x + self.y def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=False) def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp): diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index 86f4e764916e0..8013eb0baaf15 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -128,24 +128,13 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) class TestElementwiseMulOp_ZeroDim1(ElementwiseMulOp): @@ -196,7 +185,7 @@ def setUp(self): self.if_enable_cinn() def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', check_prim=True, check_new_ir=True) @@ -274,6 +263,7 @@ def test_check_output(self): self.check_output( check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def test_check_grad_normal(self): @@ -282,6 +272,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def test_check_grad_ingore_x(self): @@ -291,6 +282,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), 
check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def test_check_grad_ingore_y(self): @@ -300,6 +292,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def init_input_attr_output(self): @@ -527,27 +520,16 @@ def init_input_output(self): self.out = self.x * self.y def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=False) def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) class TestRealComplexElementwiseMulOp(TestComplexElementwiseMulOp): From 6d9d73a230d65c871da4487c30a5c82558056833 Mon Sep 17 00:00:00 2001 From: Ruibin Cheung Date: Thu, 21 Sep 2023 14:35:35 +0800 Subject: [PATCH 30/39] [Custom Device] change the dlopen flag of custom device dylibs (#57544) --- paddle/fluid/platform/init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index eae360c146df5..a3fff528f7903 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -142,7 +142,7 @@ void LoadCustomDevice(const std::string &library_dir) { LOG(INFO) << "Try loading custom device libs from: [" << library_dir << "]"; std::vector libs = phi::ListAllLibraries(library_dir); for (const auto &lib_path : libs) { - auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW); + auto dso_handle = dlopen(lib_path.c_str(), RTLD_LAZY); PADDLE_ENFORCE_NOT_NULL( dso_handle, platform::errors::InvalidArgument( From c5a70065ac0baa817903595749e5b5e425bccc1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 14:46:08 +0800 Subject: [PATCH 31/39] move ir_nodes_collector from namespace optim to ir_utils (#57535) --- paddle/cinn/ast_gen_ius/tensor_group.cc | 2 +- .../cinn/auto_schedule/analysis/analyze_ir.cc | 37 ++-- .../search_space/auto_gen_rule/auto_bind.cc | 4 +- .../search_space/auto_gen_rule/auto_inline.cc | 16 +- .../search_space/auto_gen_rule/auto_unroll.cc | 2 +- paddle/cinn/backends/codegen_cuda_dev.cc | 2 +- paddle/cinn/backends/llvm/codegen_x86.cc | 2 +- paddle/cinn/common/arithmatic.cc | 4 +- paddle/cinn/common/cas.cc | 2 +- paddle/cinn/common/ir_util.cc | 10 +- .../cinn/hlir/framework/op_lowering_util.cc | 7 +- paddle/cinn/hlir/pe/ir_schedule_pe.cc | 6 +- paddle/cinn/ir/ir.cc | 2 +- paddle/cinn/ir/lowered_func.cc | 14 +- paddle/cinn/ir/schedule/ir_schedule.cc | 61 ++++--- paddle/cinn/ir/schedule/ir_schedule_util.cc | 172 +++++++++--------- paddle/cinn/ir/tensor.cc | 6 +- paddle/cinn/ir/test/collect_ir_nodes_test.cc | 3 +- paddle/cinn/ir/utils/ir_nodes_collector.cc | 4 +- paddle/cinn/ir/utils/ir_nodes_collector.h | 4 +- paddle/cinn/lang/lower.cc | 56 +++--- paddle/cinn/lang/lower_impl.cc | 19 +- paddle/cinn/lang/lower_tensor_group.cc | 4 +- paddle/cinn/optim/buffer_assign.cc | 2 +- paddle/cinn/optim/compute_inline_expand.cc | 9 +- .../optim/eliminate_broadcast_in_forloop.cc | 8 +- paddle/cinn/optim/transform_gpu_forloop.cc | 2 +- paddle/cinn/optim/vectorize_loops.cc | 8 +- 
paddle/cinn/poly/domain.cc | 4 +- paddle/cinn/poly/stage.cc | 15 +- 30 files changed, 253 insertions(+), 234 deletions(-) diff --git a/paddle/cinn/ast_gen_ius/tensor_group.cc b/paddle/cinn/ast_gen_ius/tensor_group.cc index 2b604f2c383cb..e8b9c6a345e72 100644 --- a/paddle/cinn/ast_gen_ius/tensor_group.cc +++ b/paddle/cinn/ast_gen_ius/tensor_group.cc @@ -30,7 +30,7 @@ TensorGroup::TensorGroup(const std::vector& tensors) { for (auto& tensor : tensors) { output_tensor_names_.insert(tensor->name); - std::set used_tensors = ir::CollectIRNodes( + std::set used_tensors = ir::ir_utils::CollectIRNodes( tensor->body(), [](const Expr* x) { return x->as_tensor(); }); for (const Expr& x : used_tensors) { const ir::Tensor to_dep = x.as_tensor_ref(); diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index 17aad495b246a..da2c063d9c00d 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -54,29 +54,30 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) { return; } - ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) { - const ir::Load* load_expr = x->As(); - if (load_expr != nullptr) { - const ir::Tensor t = load_expr->tensor.as_tensor_ref(); - sche_block->read_buffers.emplace_back( - ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices))); - return false; - } - const ir::Store* store_expr = x->As(); - if (store_expr != nullptr) { - const ir::Tensor t = store_expr->tensor.as_tensor_ref(); - sche_block->write_buffers.emplace_back( - ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices))); - return false; - } - return false; - }); + ir::ir_utils::CollectIRNodesWithoutTensor( + sche_block->body, [&](const Expr* x) { + const ir::Load* load_expr = x->As(); + if (load_expr != nullptr) { + const ir::Tensor t = load_expr->tensor.as_tensor_ref(); + sche_block->read_buffers.emplace_back( + ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices))); + return false; + } + const ir::Store* store_expr = x->As(); + if (store_expr != nullptr) { + const ir::Tensor t = store_expr->tensor.as_tensor_ref(); + sche_block->write_buffers.emplace_back( + ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices))); + return false; + } + return false; + }); } bool ContainsNodeType(ir::Expr expr, const std::unordered_set& node_types) { std::set collection = - ir::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) { return node_types.find(x->node_type()) != node_types.end(); }); return !collection.empty(); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc index 06215d98d8b27..62c92c9e38fca 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -31,7 +31,7 @@ bool IsSpatialLoop(const ir::For* for_node) { const auto& loop_var = for_node->loop_var; // collect cases where the loop_var used in one of reduce axis in underneath // ScheduleBlock - auto used_for_reduce_axis = ir::CollectIRNodesWithoutTensor( + auto used_for_reduce_axis = ir::ir_utils::CollectIRNodesWithoutTensor( for_node->body, [&loop_var](const Expr* x) { const auto* block_realize = x->As(); if (!block_realize) return false; @@ -46,7 +46,7 @@ bool IsSpatialLoop(const ir::For* for_node) { const ir::Expr& binding = 
block_realize->iter_values[i]; if (iter_var->is_reduce_axis || iter_var->name.substr(0, 6) == "reduce") { - auto used_exprs = ir::CollectIRNodesWithoutTensor( + auto used_exprs = ir::ir_utils::CollectIRNodesWithoutTensor( binding, [&loop_var](const Expr* x) { const ir::_Var_* var = x->As(); if (var && diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc index 946947611f35d..16eca6d677b89 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc @@ -49,7 +49,7 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr, ir::Expr root = ir_sch->GetRootBlock(sche_block_realize_expr); // Check the schedule block to be inlined is not a reduce tensor. - std::set find_store = ir::CollectIRNodesWithoutTensor( + std::set find_store = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }); if (find_store.size() != 1UL) { return false; @@ -76,17 +76,19 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr, } // Check this schedule block is the only writer of the tensor. - find_store = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && - (x->As()->tensor).as_tensor_ref()->name == tensor->name; - }); + find_store = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && + (x->As()->tensor).as_tensor_ref()->name == + tensor->name; + }); if (find_store.size() != 1UL) { return false; } // Check there is no overlap between the buffers the schedule block reads and // writes. - std::set find_load = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + std::set find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor_expr; }); if (!find_load.empty()) { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc index 946bd9e9d7730..000203306c1a1 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -56,7 +56,7 @@ bool AutoUnroll::MeetCondition(const ir::ScheduleBlock* schedule_block) const { return false; }; - auto find_target_exprs = ir::CollectIRNodesWithoutTensor( + auto find_target_exprs = ir::ir_utils::CollectIRNodesWithoutTensor( schedule_block->body, [&has_reduce_iter, &has_nonserial_loop](const Expr* x) { return has_reduce_iter(x) || has_nonserial_loop(x); diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index e33154f0c0129..1f6f5bba154aa 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -90,7 +90,7 @@ std::vector CodeGenCUDA_Dev::GenerateBufferAliasExprs( temp_buffers.end()); // prepare temp buffer alias std::vector buffer_alias; - auto tensors = ir::CollectIRNodes(op->body, [&](const Expr *x) { + auto tensors = ir::ir_utils::CollectIRNodes(op->body, [&](const Expr *x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && temp_buffer_set.count(x->as_tensor()->buffer); }); diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc index ccae02ac5746b..9de0603e2c9e2 100644 --- a/paddle/cinn/backends/llvm/codegen_x86.cc +++ 
b/paddle/cinn/backends/llvm/codegen_x86.cc @@ -98,7 +98,7 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) { llvm::Function::PrivateLinkage, "__parallel_lambda", m_); - std::vector vars = ir::CollectUndefinedVars(&body); + std::vector vars = ir::ir_utils::CollectUndefinedVars(&body); uint64_t nbytes; auto* data = PackVars(vars, &nbytes); diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmatic.cc index 16b1d9cb8e8a5..af6656317aa11 100644 --- a/paddle/cinn/common/arithmatic.cc +++ b/paddle/cinn/common/arithmatic.cc @@ -126,7 +126,7 @@ GiNaC::ex ExprToGinacConverter::BuildHelper(ir::Expr expr) { GiNaC::ex ExprToGinacConverter::operator()(Expr expr) { // TODO(Superjomn) Replace this with common::IsPureMath( - auto complex_nodes = CollectIRNodes(expr, [](const Expr* n) { + auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [](const Expr* n) { return n->As() || // n->As() || // n->As() || // @@ -262,7 +262,7 @@ bool IsPureMath(Expr expr) { IrNodeTy ::Minus, }); - auto complex_nodes = ir::CollectIRNodes(expr, [&](const Expr* n) { + auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [&](const Expr* n) { return !valid_node_tys.count(n->node_type()); }); #ifdef CINN_DEBUG diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc index 6264c5b12d453..bf1c9092ed5eb 100644 --- a/paddle/cinn/common/cas.cc +++ b/paddle/cinn/common/cas.cc @@ -1868,7 +1868,7 @@ bool IsExprCasCompatible(Expr expr) { return expr->As() || expr->As() || expr->As() || expr->As
(); }; - return ir::CollectIRNodes(expr, teller).empty(); + return ir::ir_utils::CollectIRNodes(expr, teller).empty(); } // Partially divide a by b. e.g. (2x+y)/2 => x + y/2 diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc index f0f219ee105f7..4f000af1e8f0d 100644 --- a/paddle/cinn/common/ir_util.cc +++ b/paddle/cinn/common/ir_util.cc @@ -249,8 +249,8 @@ Expr or_all(const std::vector &conds) { } void CheckTensorUniqueInExpr(Expr expr) { - auto tensor_uniq = - ir::CollectIRNodes(expr, [](const Expr *x) { return x->as_tensor(); }); + auto tensor_uniq = ir::ir_utils::CollectIRNodes( + expr, [](const Expr *x) { return x->as_tensor(); }); absl::flat_hash_map tensor_names; for (auto &t : tensor_uniq) { auto *tp = t.as_tensor(); @@ -269,9 +269,9 @@ void CheckBufferUniqueInExpr(Expr expr) { // the buffers exists in tensor and lowered functions. CheckTensorUniqueInExpr(expr); - auto tensors = - ir::CollectIRNodes(expr, [](const Expr *x) { return x->as_tensor(); }); - auto funcs = ir::CollectIRNodes( + auto tensors = ir::ir_utils::CollectIRNodes( + expr, [](const Expr *x) { return x->as_tensor(); }); + auto funcs = ir::ir_utils::CollectIRNodes( expr, [](const Expr *x) { return x->as_lowered_func(); }); absl::flat_hash_map buffer_name; diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc index e7a4412202d87..1af9ef0576351 100644 --- a/paddle/cinn/hlir/framework/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/op_lowering_util.cc @@ -1046,7 +1046,7 @@ void LoopAssignReduce( auto first_reduce_loop = rloops.front(); // collect if auto if_checker = [](const Expr* x) { return x->As(); }; - auto if_set = ir::CollectIRNodesWithoutTensor( + auto if_set = ir::ir_utils::CollectIRNodesWithoutTensor( first_reduce_loop.As()->body, if_checker); std::string reduce_block_name = reducer_data->id(); for (auto if_expr : if_set) { @@ -1056,10 +1056,11 @@ void LoopAssignReduce( ->schedule_block.As() ->name == reduce_block_name; }; - auto blocks_in_if = ir::CollectIRNodesWithoutTensor(if_expr, checker); + auto blocks_in_if = + ir::ir_utils::CollectIRNodesWithoutTensor(if_expr, checker); if (!blocks_in_if.empty()) { ir::Expr condition = if_expr.As()->condition; - auto indices_in_if = ir::CollectIRNodesWithoutTensor( + auto indices_in_if = ir::ir_utils::CollectIRNodesWithoutTensor( condition, [](const Expr* x) { return x->As(); }); for (int i = 0; i < rloops.size(); ++i) { std::string var_name = rloops[i].As()->loop_var->name; diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc index 3677025aaedaa..6600905b083c1 100644 --- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc +++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc @@ -633,7 +633,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT // simplify reshape index auto hand_write_simplify = [](std::vector loops, ir::Expr block) { // check exist select. 
- auto find_select = ir::CollectIRNodesInOrder( + auto find_select = ir::ir_utils::CollectIRNodesInOrder( block, [&](const Expr *x) { return x->As(); }); if (find_select.size() > 0) { return; @@ -667,7 +667,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT index = index + ir::Expr(schedule_block->iter_vars[idx]) * stride; } - auto exprs = ir::CollectIRNodesInOrder( + auto exprs = ir::ir_utils::CollectIRNodesInOrder( block, [&](const Expr *x) { return x->As(); }); CHECK_EQ(exprs.size(), 1); auto load = exprs.front().As(); @@ -709,7 +709,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT break; } - auto exprs = ir::CollectIRNodesInOrder( + auto exprs = ir::ir_utils::CollectIRNodesInOrder( block, [&](const Expr *x) { return x->As(); }); for (auto expr : exprs) { auto load = expr.As(); diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 5427a14afa5ba..7911f3ea14bba 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -535,7 +535,7 @@ std::vector PolyFor::expr_fields() const { } Expr PolyFor::ExtractExtent() const { - auto nodes = CollectIRNodes(condition, [&](const Expr *e) { + auto nodes = ir::ir_utils::CollectIRNodes(condition, [&](const Expr *e) { return e->As() || // e->As() || // e->As() || // diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc index 5a897e7c334a5..ec5f4b2e64ce6 100644 --- a/paddle/cinn/ir/lowered_func.cc +++ b/paddle/cinn/ir/lowered_func.cc @@ -82,7 +82,7 @@ std::vector _LoweredFunc_::expr_fields() const { return {&body}; } void _LoweredFunc_::PrepareCudaAxisInfoFromBody() { std::set bound_for_exprs = - ir::CollectIRNodes(body, [](const Expr* expr) { + ir::ir_utils::CollectIRNodes(body, [](const Expr* expr) { const ir::For* for_expr = expr->As(); return for_expr != nullptr && for_expr->is_binded(); }); @@ -208,7 +208,7 @@ void _LoweredFunc_::AllocTempBuffer() {} void _LoweredFunc_::PrepareBufferCastExprs(bool with_expr_gen_tensor) { buffer_data_cast_exprs.clear(); // collect write. - auto write_teller = ir::CollectTensorNeedsWrite(&body); + auto write_teller = ir::ir_utils::CollectTensorNeedsWrite(&body); auto tensors = CollectAllTensorReference(with_expr_gen_tensor); std::sort(tensors.begin(), @@ -248,7 +248,7 @@ std::vector _LoweredFunc_::CudaAliasVarExprs() const { } // collect write. std::vector res; - auto write_teller = ir::CollectTensorNeedsWrite(&body); + auto write_teller = ir::ir_utils::CollectTensorNeedsWrite(&body); auto tensors = CollectAllTensorReference(); std::sort(tensors.begin(), @@ -403,11 +403,11 @@ std::vector _LoweredFunc_::CollectAllTensorReference( bool with_expr_gen_tensor) const { std::set tensor_exprs = with_expr_gen_tensor - ? ir::CollectIRNodes( + ? ir::ir_utils::CollectIRNodes( body, [](const Expr* expr) { return expr->As(); }) - : ir::CollectIRNodesWithoutTensor(body, [](const Expr* expr) { - return expr->As(); - }); + : ir::ir_utils::CollectIRNodesWithoutTensor( + body, + [](const Expr* expr) { return expr->As(); }); std::vector tensors; // remove the duplicate tensor by their name. 
diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index 78ce98564dbdc..fab8a53deb121 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -767,7 +767,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> { rewriter(&info->cache_block); rewriter.mutate_cache_block = false; rewriter(&new_root); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( new_root, [&](const Expr* x) { return x->As() && @@ -775,7 +775,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> { }, true); if (!find_tensor.empty()) { - auto find_store = ir::CollectIRNodesWithoutTensor( + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( (*find_tensor.begin()), [&](const Expr* x) { return x->As() && (x->As()->tensor == Expr(info->write_tensor)); @@ -864,7 +864,7 @@ struct ChangeBodyToBlock : public ir::IRMutator<> { DeviceAPI ScheduleImpl::GetDeviceAPI() const { auto exprs = this->GetModule().GetExprs(); - auto find_for_nodes = ir::CollectIRNodesWithoutTensor( + auto find_for_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( exprs.front(), [&](const Expr* x) { return x->As(); }, true); CHECK(!find_for_nodes.empty()); return (*find_for_nodes.begin()).As()->device_api; @@ -925,7 +925,7 @@ Expr ScheduleImpl::CacheWrite(const Expr& block, ->schedule_block.As() ->body); - auto find_cache_block = ir::CollectIRNodesWithoutTensor( + auto find_cache_block = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && @@ -937,9 +937,10 @@ Expr ScheduleImpl::CacheWrite(const Expr& block, CHECK(info.write_tensor->buffer.defined()); // Replace buffer - auto all_tensors = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->as_tensor() && x->as_tensor()->buffer.defined(); - }); + auto all_tensors = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); for (auto i : all_tensors) { if (i.as_tensor()->name != info.write_tensor->name && @@ -1119,7 +1120,7 @@ Expr ScheduleImpl::Reorder(const Expr& block, Expr ScheduleImpl::GetRootBlock(const Expr& expr) const { auto exprs = this->GetModule().GetExprs(); for (auto& it_expr : exprs) { - auto find_expr = ir::CollectIRNodesWithoutTensor( + auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor( it_expr, [&](const Expr* x) { return x->node_type() == expr.node_type() && *x == expr; @@ -1198,20 +1199,21 @@ struct LoopReconstructor : public ir::IRMutator<> { // Replace the copied Tensor object with the original Tensor object, // to ensure that the same Tensor in a AST is the same object. 
std::unordered_map tensors_map; - ir::CollectIRNodesWithoutTensor(loop_, [&tensors_map](const Expr* x) { - if (x->as_tensor()) { - tensors_map.insert({x->as_tensor()->name, *x}); - return true; - } - return false; - }); - auto find_store = ir::CollectIRNodesWithoutTensor( + ir::ir_utils::CollectIRNodesWithoutTensor( + loop_, [&tensors_map](const Expr* x) { + if (x->as_tensor()) { + tensors_map.insert({x->as_tensor()->name, *x}); + return true; + } + return false; + }); + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( new_loop_, [](const Expr* x) { return x->As(); }); for (auto store : find_store) { store.As()->tensor = tensors_map.at(store.As()->tensor.as_tensor()->name); } - auto find_load = ir::CollectIRNodesWithoutTensor( + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( new_loop_, [](const Expr* x) { return x->As(); }); for (auto load : find_load) { load.As()->tensor = @@ -1275,7 +1277,7 @@ void ScheduleImpl::SetBuffer(Expr& block, const std::string& memory_type, bool fixed) { CHECK(block.As()); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; @@ -1286,7 +1288,7 @@ void ScheduleImpl::SetBuffer(Expr& block, auto exprs = this->GetModule().GetExprs(); for (auto& it_expr : exprs) { auto find_tensor = - ir::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) { return x->as_tensor() && (x->as_tensor()->name == tensor.as_tensor_ref()->name || x->as_tensor()->name == @@ -1328,7 +1330,7 @@ void ScheduleImpl::MergeExprs() { ->body); VLOG(3) << "Before merging, exprs[0] is : " << exprs[0]; for (int i = 1; i < exprs.size(); ++i) { - auto root_block = ir::CollectIRNodesWithoutTensor( + auto root_block = ir::ir_utils::CollectIRNodesWithoutTensor( exprs[i], [&](const Expr* x) { return x->As() && @@ -1437,7 +1439,7 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { auto body = block_loops.at(loops.size() - 1).As()->body; // collect if auto if_checker = [](const Expr* x) { return x->As(); }; - auto if_set = ir::CollectIRNodesWithoutTensor(body, if_checker); + auto if_set = ir::ir_utils::CollectIRNodesWithoutTensor(body, if_checker); for (auto if_expr : if_set) { auto checker = [block_name](const Expr* x) { return x->As() && @@ -1445,7 +1447,8 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { ->schedule_block.As() ->name == block_name; }; - if (ir::CollectIRNodesWithoutTensor(if_expr, checker, true).size() > 0) { + if (ir::ir_utils::CollectIRNodesWithoutTensor(if_expr, checker, true) + .size() > 0) { result = IfThenElse::Make(if_expr.As()->condition, result); break; @@ -1582,7 +1585,7 @@ bool ComputeInliner::BodyPatternAllowInline() { return false; } CHECK(inlined_store_.As()); - auto find_vars = ir::CollectIRNodesWithoutTensor( + auto find_vars = ir::ir_utils::CollectIRNodesWithoutTensor( inlined_store_, [&](const Expr* x) { return x->as_var(); }); std::set vars_set; for (auto& i : find_vars) vars_set.insert(i.as_var_ref()); @@ -1650,7 +1653,7 @@ bool ReverseComputeInliner::BodyPatternAllowInline() { CHECK(inlined_store_.As()); CHECK(inlined_load_.As()); CHECK(target_store_.As()); - auto find_vars = ir::CollectIRNodesWithoutTensor( + auto find_vars = ir::ir_utils::CollectIRNodesWithoutTensor( inlined_store_, [&](const Expr* 
x) { return x->as_var(); }); std::set vars_set; for (auto& i : find_vars) vars_set.insert(i.as_var_ref()); @@ -2036,7 +2039,7 @@ void ScheduleImpl::FlattenLoops(const std::vector& loops, } } - auto exprs = ir::CollectIRNodesInOrder( + auto exprs = ir::ir_utils::CollectIRNodesInOrder( schedule_block->body, [&](const Expr* x) { return x->As() || x->As(); }); // reverse exprs from last to first. @@ -2185,7 +2188,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, std::set used_target_loop_vars; for (auto& iter_val : new_iter_values) { auto find_partial_loop = - ir::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) { if (x->as_var()) used_target_loop_vars.insert(x->as_var_ref()->name); return x->as_var(); }); @@ -2194,7 +2197,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, std::vector used_target_loops; auto expr_copy = optim::IRCopy(expr); for (auto& var : used_target_loop_vars) { - auto find_loop_var = ir::CollectIRNodesWithoutTensor( + auto find_loop_var = ir::ir_utils::CollectIRNodesWithoutTensor( expr_copy, [&](const Expr* x) { return x->As() && x->As()->loop_var->name == var && @@ -2222,7 +2225,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, } else { CHECK(old_iter_values[changed_loop_num].as_var()); auto old_var = old_iter_values[changed_loop_num].as_var_ref(); - auto find_partial_loop = ir::CollectIRNodesWithoutTensor( + auto find_partial_loop = ir::ir_utils::CollectIRNodesWithoutTensor( expr, [&](const Expr* x) { return x->As() && @@ -2232,7 +2235,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, true); CHECK_EQ(find_partial_loop.size(), 1U); new_loop = optim::IRCopy(*find_partial_loop.begin()); - auto find_schedule_block = ir::CollectIRNodesWithoutTensor( + auto find_schedule_block = ir::ir_utils::CollectIRNodesWithoutTensor( new_loop, [&](const Expr* x) { return x->As(); }, true); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index b4000ff212cad..45779788e9c54 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -40,7 +40,7 @@ namespace ir { Tensor GetTensor(const Expr& block) { CHECK(block.As()); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; @@ -52,13 +52,13 @@ Tensor GetTensor(const Expr& block) { Tensor GetReadTensor(const Expr& block, int index) { CHECK(block.As()); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; std::vector res; auto find_read_tensor = - ir::CollectIRNodesWithoutTensor(block, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(block, [&](const Expr* x) { if (x->As()) res.push_back(x->As()->tensor.as_tensor_ref()); return x->As(); @@ -86,41 +86,43 @@ void SetCudaAxisInfo(Expr* lowered_func) { auto func_body = lowered_func->as_lowered_func_ref()->body; CudaAxisInfo info; - auto block_nodes = ir::CollectIRNodes(func_body, [&](const Expr* x) { - if (x->As() && x->As()->bind_info().valid()) { - auto bind_info = x->As()->bind_info(); - 
info.set_valid(true); - if (bind_info.for_type == ForType::GPUThread) { - CHECK(common::is_zero(x->As()->min)); - CHECK(x->As()->extent.is_constant()); - int range = x->As()->extent.get_constant(); - range = range > info.block_dim(bind_info.offset) - ? range - : info.block_dim(bind_info.offset); - VLOG(3) << "Set block dim[" << bind_info.offset << "] with range " - << range; - info.set_block_dim(bind_info.offset, range); - } else if (bind_info.for_type == ForType::GPUBlock) { - CHECK(common::is_zero(x->As()->min)); - CHECK(x->As()->extent.is_constant()); - int range = x->As()->extent.get_constant(); - range = range > info.grid_dim(bind_info.offset) - ? range - : info.grid_dim(bind_info.offset); - info.set_grid_dim(bind_info.offset, range); - VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range " - << range; - } else { - LOG(FATAL) << "The for loop's bind info should be gpu block or thread!"; - } - } - return (x->As() && x->As()->bind_info().valid()); - }); + auto block_nodes = + ir::ir_utils::CollectIRNodes(func_body, [&](const Expr* x) { + if (x->As() && x->As()->bind_info().valid()) { + auto bind_info = x->As()->bind_info(); + info.set_valid(true); + if (bind_info.for_type == ForType::GPUThread) { + CHECK(common::is_zero(x->As()->min)); + CHECK(x->As()->extent.is_constant()); + int range = x->As()->extent.get_constant(); + range = range > info.block_dim(bind_info.offset) + ? range + : info.block_dim(bind_info.offset); + VLOG(3) << "Set block dim[" << bind_info.offset << "] with range " + << range; + info.set_block_dim(bind_info.offset, range); + } else if (bind_info.for_type == ForType::GPUBlock) { + CHECK(common::is_zero(x->As()->min)); + CHECK(x->As()->extent.is_constant()); + int range = x->As()->extent.get_constant(); + range = range > info.grid_dim(bind_info.offset) + ? range + : info.grid_dim(bind_info.offset); + info.set_grid_dim(bind_info.offset, range); + VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range " + << range; + } else { + LOG(FATAL) + << "The for loop's bind info should be gpu block or thread!"; + } + } + return (x->As() && x->As()->bind_info().valid()); + }); lowered_func->as_lowered_func_ref()->cuda_axis_info = info; } bool Contains(const Expr& container, const Expr& expr) { - auto find_expr = ir::CollectIRNodesWithoutTensor( + auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor( container, [&](const Expr* x) { return (x->node_type() == expr.node_type() && *x == expr); @@ -283,13 +285,13 @@ void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) { auto* rf_for = rf_loop.As(); CHECK(rf_for) << "Expr param of Rfactor must be For node! 
Please check."; // check the rf_loop only has one schedule block - auto block_nodes = ir::CollectIRNodesWithoutTensor( + auto block_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( rf_loop, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(block_nodes.size(), 1U) << "Rfactor Loop should only have one schedule block"; - auto find_store = ir::CollectIRNodesWithoutTensor( + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( rf_loop, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_store.size(), 1U); auto indice = find_store.begin()->As()->indices; @@ -322,9 +324,9 @@ void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) { } std::vector GetLoopsOfExpr(const Expr& expr, const Expr& root) { - auto loop_nodes = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && Contains(*x, expr); - }); + auto loop_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( + root, + [&](const Expr* x) { return x->As() && Contains(*x, expr); }); std::vector result(loop_nodes.begin(), loop_nodes.end()); if (result.empty()) LOG(FATAL) << "Didn't find expr's : \n" @@ -439,8 +441,8 @@ Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) { ->body; if (is_write) { std::vector find_store_vec; - auto find_store = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { if (x->As()) find_store_vec.push_back(*x); return x->As(); }); @@ -450,8 +452,8 @@ Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) { return store_index; } else { std::vector find_load_vec; - auto find_load = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { if (x->As()) find_load_vec.push_back(*x); return x->As(); }); @@ -526,7 +528,7 @@ void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write) { Expr find_tensor = is_write ? Expr(info->write_tensor) : Expr(info->read_tensor); auto find_produce_read = - ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && x->As()->tensor == find_tensor; }); @@ -675,9 +677,9 @@ Expr ConstructNewLoopChain(const std::vector& chain, // In each IfThenElse node, find the vars its condition depends on. 
for (auto& if_expr : if_nodes) { CHECK(if_expr.As()); - auto var_set = - ir::CollectIRNodes(if_expr.As()->condition, - [&](const Expr* x) { return x->as_var(); }); + auto var_set = ir::ir_utils::CollectIRNodes( + if_expr.As()->condition, + [&](const Expr* x) { return x->as_var(); }); std::set var_name_set; for (auto& i : var_set) var_name_set.insert(i.as_var()->name); condition_vars.push_back(var_name_set); @@ -863,7 +865,7 @@ std::vector GetProducers(const Expr& block, const Expr& root) { std::string block_name = block.As() ->schedule_block.As() ->name; - ir::CollectIRNodesWithoutTensor( + ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&producer_tensor_names, &block_name](const Expr* x) { auto* load = x->As(); if (load) { @@ -879,15 +881,15 @@ std::vector GetProducers(const Expr& block, const Expr& root) { // traverse each of other blocks and filter those ones which contain at least // one producer tensor; - auto find_blocks = - ir::CollectIRNodesWithoutTensor(root, [&block, &root](const Expr* x) { + auto find_blocks = ir::ir_utils::CollectIRNodesWithoutTensor( + root, [&block, &root](const Expr* x) { return x->As() && *x != block && *x != root; }); for (auto&& cur : find_blocks) { auto* cur_block = cur.As() ->schedule_block.As(); CHECK(cur_block) << "block result should be a ScheduleBlockRealize"; - auto find_stores = ir::CollectIRNodesWithoutTensor( + auto find_stores = ir::ir_utils::CollectIRNodesWithoutTensor( cur_block->body, [&producer_tensor_names](const Expr* x) { return x->As() && producer_tensor_names.count( @@ -905,27 +907,29 @@ std::vector GetConsumers(const Expr& block, const Expr& root) { std::string block_tensor = GetTensor(block)->name; if (IsReduceInitTensorName(block_tensor)) { std::string consumer_name = GetOriginalReduceTensorName(block_tensor); - auto consumer = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && - x->As() - ->schedule_block.As() - ->name == consumer_name; - }); + auto consumer = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && + x->As() + ->schedule_block.As() + ->name == consumer_name; + }); CHECK_EQ(consumer.size(), 1); return {*consumer.begin()}; } - auto find_block = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && *x != block && *x != root; - }); + auto find_block = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && *x != block && *x != root; + }); for (auto& i : find_block) { CHECK(i.As() ->schedule_block.As()); auto block_body = i.As() ->schedule_block.As() ->body; - auto find_load = - ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + block_body, [&](const Expr* x) { return x->As() && x->As()->tensor.as_tensor_ref()->name == block_tensor; @@ -938,7 +942,7 @@ std::vector GetConsumers(const Expr& block, const Expr& root) { void CheckComputeAtValidation(const Expr& block, const Expr& loop, const Expr& root) { - auto find_block = ir::CollectIRNodesWithoutTensor( + auto find_block = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && *x == block; @@ -946,13 +950,13 @@ void CheckComputeAtValidation(const Expr& block, true); CHECK(!find_block.empty()) << "Didn't find block in root!"; - auto find_loop = ir::CollectIRNodesWithoutTensor( + auto find_loop = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && *x == loop; }, true); CHECK(!find_loop.empty()) << 
"Didn't find loop in root!"; - auto find_block_in_loop = ir::CollectIRNodesWithoutTensor( + auto find_block_in_loop = ir::ir_utils::CollectIRNodesWithoutTensor( loop, [&](const Expr* x) { return x->As() && *x == block; @@ -1005,10 +1009,10 @@ std::vector CalculateRequiredRegions( std::set provided_nodes; if (is_store_provided) { - provided_nodes = ir::CollectIRNodesWithoutTensor( + provided_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }); } else { - provided_nodes = ir::CollectIRNodesWithoutTensor( + provided_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }); } @@ -1036,7 +1040,7 @@ std::vector CalculateRequiredRegions( // Notice that we look for For nodes in loop's body instead of loop // itself. - auto find_loops = ir::CollectIRNodesWithoutTensor( + auto find_loops = ir::ir_utils::CollectIRNodesWithoutTensor( loop.As()->body, [&](const Expr* x) { return x->As() && Contains(*x, req_block); }); @@ -1052,15 +1056,15 @@ std::vector CalculateRequiredRegions( std::set required_nodes; if (is_store_provided) { - required_nodes = - ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + required_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( + block_body, [&](const Expr* x) { return x->As() && x->As()->tensor.as_tensor_ref()->name == provided_tensor_name; }); } else { - required_nodes = - ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + required_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( + block_body, [&](const Expr* x) { return x->As() && x->As()->tensor.as_tensor_ref()->name == provided_tensor_name; @@ -1105,7 +1109,7 @@ std::vector CalculateRequiredRegions( block.As()->iter_values[i].is_constant()); if (block.As()->iter_values[i].as_var()) { auto find_for_loops = - ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && x->As()->loop_var->name == block.As() @@ -1134,13 +1138,13 @@ Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block, ->schedule_block.As() ->body; // 1. Check the schedule block to be inlined is not a reduce tensor. - auto find_store = ir::CollectIRNodesWithoutTensor( + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_store.size(), 1U); Expr tensor = (*find_store.begin()).As()->tensor; CHECK(!tensor.as_tensor_ref()->is_reduce_tensor()); // 2. Check this schedule block is the only writer of the tensor. - find_store = ir::CollectIRNodesWithoutTensor( + find_store = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && @@ -1151,8 +1155,8 @@ Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block, CHECK_EQ(find_store.size(), 1U); // 3. Check there is no overlap between the buffers the schedule block reads // and writes. - auto find_load = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); CHECK(find_load.empty()); @@ -1166,14 +1170,14 @@ std::tuple CheckReverseComputeInlineValidationAndGetExprs( ->schedule_block.As() ->body; // 1. Check the schedule block to be reverse inlined is not a reduce tensor. 
- auto find_inlined_load = ir::CollectIRNodesWithoutTensor( + auto find_inlined_load = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_inlined_load.size(), 1U); Expr tensor = (*find_inlined_load.begin()).As()->tensor; CHECK(!tensor.as_tensor_ref()->is_reduce_tensor()); auto inlined_load = *find_inlined_load.begin(); // 2. Check this schedule block is the only reader of the tensor. - auto find_load = ir::CollectIRNodesWithoutTensor( + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && @@ -1184,20 +1188,20 @@ std::tuple CheckReverseComputeInlineValidationAndGetExprs( CHECK_EQ(find_load.size(), 1U); // 3. Check there is no overlap between the buffers the schedule block reads // and writes. - auto find_store = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); CHECK(find_store.empty()); // 4. Get store that will be inlined. auto find_inlined_store = - ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); CHECK_EQ(find_inlined_store.size(), 1U); auto inlined_store = *find_inlined_store.begin(); // 5. Get target store. - auto find_target_store = ir::CollectIRNodesWithoutTensor( + auto find_target_store = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_target_store.size(), 1U); auto target_store = *find_target_store.begin(); @@ -1206,7 +1210,7 @@ std::tuple CheckReverseComputeInlineValidationAndGetExprs( bool ContainVar(const std::vector& exprs, const std::string& var_name) { for (auto& expr : exprs) { - auto find_expr = ir::CollectIRNodesWithoutTensor( + auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor( expr, [&](const Expr* x) { return x->As<_Var_>() && x->As<_Var_>()->name == var_name; diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index 3297b714630e1..8ad8b9878d4bc 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -60,7 +60,7 @@ std::set _Tensor_::GetDependTensorNames() const { std::set names; auto add_depend_tensors_from_expr = [&](Expr expr) { - auto tensors = CollectIRNodes(expr, [&](const Expr *x) { + auto tensors = ir::ir_utils::CollectIRNodes(expr, [&](const Expr *x) { return x->as_tensor() && x->as_tensor()->name != this->name; }); for (auto &e : tensors) { @@ -515,7 +515,7 @@ bool _Tensor_::IsDependOnStatement(absl::string_view statement) { std::set _Tensor_::DependingTensorNames() { std::set res; if (body().defined()) { - auto depend_tensors = ir::CollectIRNodes( + auto depend_tensors = ir::ir_utils::CollectIRNodes( body(), [](const Expr *x) -> bool { return x->as_tensor(); }); for (const auto &x : depend_tensors) { if (x.get() != this) { @@ -538,7 +538,7 @@ std::vector _Tensor_::axis_with_reduce() const { } bool _Tensor_::Uses(const Tensor &other) const { - auto loads = ir::CollectIRNodes(body(), [&](const Expr *x) { + auto loads = ir::ir_utils::CollectIRNodes(body(), [&](const Expr *x) { auto *loadn = x->As(); if (!loadn) return false; return loadn->tensor.as_tensor()->name == other->name; diff --git a/paddle/cinn/ir/test/collect_ir_nodes_test.cc b/paddle/cinn/ir/test/collect_ir_nodes_test.cc index 82441b4a005c7..d380b4475e37d 100644 --- 
a/paddle/cinn/ir/test/collect_ir_nodes_test.cc +++ b/paddle/cinn/ir/test/collect_ir_nodes_test.cc @@ -19,6 +19,7 @@ namespace cinn { namespace ir { +namespace ir_utils { TEST(CollectIRNodes, basic0) { Expr C = Expr(1) + 2; @@ -57,6 +58,6 @@ TEST(CollectIRNodes, basic) { CollectIRNodes(fn_body, [](const Expr* x) { return x->as_tensor(); }); auto exprs = CollectIRNodes(fn_body, [](const Expr* x) { return x; }); } - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index d44c3701b5ac2..7d7373a6b9ee8 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -21,8 +21,8 @@ namespace cinn { namespace ir { +namespace ir_utils { namespace { - struct IrNodesCollector : public IRVisitorRequireReImpl { using teller_t = std::function; using handler_t = std::function; @@ -317,6 +317,6 @@ std::set CollectTensorNeedsWrite(const Expr* e) { collector.Visit(e); return tensor_written; } - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.h b/paddle/cinn/ir/utils/ir_nodes_collector.h index 0f8a390e1ade7..7bfb1b3b4e6b3 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.h +++ b/paddle/cinn/ir/utils/ir_nodes_collector.h @@ -18,7 +18,7 @@ namespace cinn { namespace ir { - +namespace ir_utils { /** * Collect the IR Nodes(without duplication) in the expression. */ @@ -83,6 +83,6 @@ std::vector CollectUndefinedVars(const Expr* e); * Collect the Tensor Nodes which will be Writed by Store or Call Nodes */ std::set CollectTensorNeedsWrite(const Expr* e); - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc index 58ae00fe8771e..0b91b6d598ac7 100644 --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -40,7 +40,7 @@ std::vector GetArgs( std::vector res; std::map> name2loads; std::map> name2stores; - auto load_or_store_nodes = ir::CollectIRNodesWithoutTensor( + auto load_or_store_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( func_body, [&](const Expr* x) { return x->As() || x->As(); }); @@ -102,7 +102,7 @@ std::vector GetTempBuffers(const std::vector& tensor_args, name_to_buffer; // used to avoid duplication. auto all_temp_tensors = - ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && (!tensor_group.Contain(x->as_tensor()->name) && ((!buffer_arg_names.count(x->as_tensor()->buffer->name) && @@ -145,7 +145,7 @@ std::vector GetTempBuffers(const std::vector& tensor_args, name_to_buffer; // used to avoid duplication. 
auto all_temp_tensors = - ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && (!stage_map->Lookup(x->as_tensor()->name) || !stage_map[x->as_tensor()]->inlined()) && @@ -165,17 +165,18 @@ std::vector GetTempBuffers(const std::vector& tensor_args, } } // visit the ir body and update the map of name_to_buffer - auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { - if (x->as_tensor() && x->as_tensor()->buffer.defined()) { - auto buffer_name = x->as_tensor()->buffer->name; - if (name_to_buffer.count(buffer_name) && - x->as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { - name_to_buffer[buffer_name] = x->as_tensor()->buffer; - } - } - return x->as_tensor() && x->as_tensor()->buffer.defined(); - }); + auto update_map = + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + if (x->as_tensor() && x->as_tensor()->buffer.defined()) { + auto buffer_name = x->as_tensor()->buffer->name; + if (name_to_buffer.count(buffer_name) && + x->as_tensor()->buffer->numel() < + name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = x->as_tensor()->buffer; + } + } + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); std::vector temp_buffers; for (auto& i : name_to_buffer) temp_buffers.push_back(i.second); @@ -195,7 +196,7 @@ std::vector GetTempBuffers(const std::vector& args, name_to_buffer; // used to avoid duplication. auto all_temp_tensors = - ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && (!buffer_arg_names.count(x->as_tensor()->buffer->name) || utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer")); @@ -212,17 +213,18 @@ std::vector GetTempBuffers(const std::vector& args, } } // visit the ir body and update the map of name_to_buffer - auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { - if (x->as_tensor() && x->as_tensor()->buffer.defined()) { - auto buffer_name = x->as_tensor()->buffer->name; - if (name_to_buffer.count(buffer_name) && - x->as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { - name_to_buffer[buffer_name] = x->as_tensor()->buffer; - } - } - return x->as_tensor() && x->as_tensor()->buffer.defined(); - }); + auto update_map = + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + if (x->as_tensor() && x->as_tensor()->buffer.defined()) { + auto buffer_name = x->as_tensor()->buffer->name; + if (name_to_buffer.count(buffer_name) && + x->as_tensor()->buffer->numel() < + name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = x->as_tensor()->buffer; + } + } + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); std::vector temp_buffers; for (auto& i : name_to_buffer) temp_buffers.push_back(i.second); @@ -250,7 +252,7 @@ void InitReduceTensor(StageMap stages, tensor->InitReduction(stages, target); } auto uninited_reduce_tensors = - ir::CollectIRNodes(tensor->body(), [&](const Expr* x) { + ir::ir_utils::CollectIRNodes(tensor->body(), [&](const Expr* x) { return x && x->defined() && x->as_tensor() && x->as_tensor()->is_reduce_tensor() && !x->as_tensor()->IsReduceInited(stages); diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc index 629b405dcd2f0..24d5325bc1be9 100644 --- a/paddle/cinn/lang/lower_impl.cc +++ 
b/paddle/cinn/lang/lower_impl.cc @@ -35,7 +35,7 @@ namespace lang { namespace detail { void CheckNoIslCallRemains(Expr* expr) { - auto isl_calls = ir::CollectIRNodes(*expr, [](const Expr* expr) { + auto isl_calls = ir::ir_utils::CollectIRNodes(*expr, [](const Expr* expr) { return expr->As() && expr->As()->is_isl_call(); }); #ifdef CINN_DEBUG @@ -223,7 +223,7 @@ void CreateCompGraphWithInlineTensors(common::Graph* graph, // collect dependency tensors of t // here we just collect the tensors in Load nodes // NOTE there may be some other cases. - auto deps = ir::CollectLoadTensors( + auto deps = ir::ir_utils::CollectLoadTensors( t->body(), [](const Expr* x) { return x->as_tensor(); }); for (const auto& dep : deps) { auto e_tensor = dep.as_tensor_ref(); @@ -342,7 +342,7 @@ std::vector LowerImpl::GenerateFunctionArgumentList( CheckArgsUnique(); std::vector args; - auto teller = ir::CollectTensorNeedsWrite(&fn_body); + auto teller = ir::ir_utils::CollectTensorNeedsWrite(&fn_body); std::set arg_names; @@ -395,7 +395,7 @@ std::vector LowerImpl::GenFuncArgForSplitKernel( std::vector in_args; std::vector out_args; - auto teller = ir::CollectTensorNeedsWrite(&func_iterator); + auto teller = ir::ir_utils::CollectTensorNeedsWrite(&func_iterator); std::set arg_names; std::set all_tensor_names; @@ -408,11 +408,12 @@ std::vector LowerImpl::GenFuncArgForSplitKernel( in_args.emplace_back(scalar, ir::Argument::IO::kInput); } - auto all_tensors = ir::CollectIRNodes(func_iterator, [&](const Expr* x) { - return x->as_tensor() && !stages_[x->as_tensor()]->inlined(); - }); + auto all_tensors = + ir::ir_utils::CollectIRNodes(func_iterator, [&](const Expr* x) { + return x->as_tensor() && !stages_[x->as_tensor()]->inlined(); + }); - auto all_vars = ir::CollectIRNodes( + auto all_vars = ir::ir_utils::CollectIRNodes( func_iterator, [&](const Expr* x) { return x->as_var(); }); for (auto& i : all_tensors) { @@ -588,7 +589,7 @@ std::vector LowerImpl::operator()() { Reference(&arg)->buffer = tensor_map.at(arg->name)->buffer; } } - auto store_exprs = ir::CollectIRNodes( + auto store_exprs = ir::ir_utils::CollectIRNodes( func_iterator, [](const Expr* x) { return x->As(); }); std::vector new_temp_tensors; for (auto& expr : store_exprs) { diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc index 200b608387560..0a802c0f0566d 100644 --- a/paddle/cinn/lang/lower_tensor_group.cc +++ b/paddle/cinn/lang/lower_tensor_group.cc @@ -88,7 +88,7 @@ std::vector LowerTensorGroup::operator()() { } // Some store tensors are also temp tensors; - auto store_exprs = ir::CollectIRNodes( + auto store_exprs = ir::ir_utils::CollectIRNodes( func_body, [](const Expr* x) { return x->As(); }); for (auto& expr : store_exprs) { auto* store_node = expr.As(); @@ -146,7 +146,7 @@ std::vector LowerTensorGroup::operator()() { std::vector LowerTensorGroup::GenerateFunctionArgumentList( Expr fn_body) { std::vector args; - auto teller = ir::CollectTensorNeedsWrite(&fn_body); + auto teller = ir::ir_utils::CollectTensorNeedsWrite(&fn_body); std::set arg_names; diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc index eb059a30ea26d..175689defbe36 100644 --- a/paddle/cinn/optim/buffer_assign.cc +++ b/paddle/cinn/optim/buffer_assign.cc @@ -73,7 +73,7 @@ std::map InitialAssignBuffer( // unify all the tensor occurance with a global one, e.g. there are multiple // tensor B exists in the expression, replace them with a shared one. 
- ir::CollectIRNodes(*expr, [&](const Expr* x) -> bool { + ir::ir_utils::CollectIRNodes(*expr, [&](const Expr* x) -> bool { auto* t = x->as_tensor(); if (t && !stages[t]->inlined()) { Reference(x) = Expr(all_tensor_map.at(t->name)); diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc index 8dad52ab4d9bc..d4123729bc53f 100644 --- a/paddle/cinn/optim/compute_inline_expand.cc +++ b/paddle/cinn/optim/compute_inline_expand.cc @@ -225,7 +225,7 @@ void ComputeInlineExpand(Expr *expr, poly::StageMap stages, std::map *all_tensor_map) { // the inline tensors contained in the expression. - auto inline_tensors = ir::CollectIRNodes(*expr, [&](const Expr *x) { + auto inline_tensors = ir::ir_utils::CollectIRNodes(*expr, [&](const Expr *x) { return x->as_tensor() && stages[x->as_tensor()]->inlined(); }); @@ -240,9 +240,10 @@ void ComputeInlineExpand(Expr *expr, TensorInlineExpandMutator(tensor->name, all_tensor_map, stages)(expr); } - inline_tensors = ir::CollectLoadTensors(*expr, [&](const Expr *x) { - return x->as_tensor() && stages[x->as_tensor()]->inlined(); - }); + inline_tensors = + ir::ir_utils::CollectLoadTensors(*expr, [&](const Expr *x) { + return x->as_tensor() && stages[x->as_tensor()]->inlined(); + }); } } diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc index a4feec97626cb..bb546f694be9d 100644 --- a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc +++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc @@ -36,9 +36,9 @@ struct EliminateBroadcastInForloop : public ir::IRMutator { auto* node = expr->As(); - auto broadcasts = ir::CollectIRNodes(node->value, [&](const Expr* expr) { - return expr->As(); - }); + auto broadcasts = ir::ir_utils::CollectIRNodes( + node->value, + [&](const Expr* expr) { return expr->As(); }); std::vector let_exprs; Var tmp; @@ -79,7 +79,7 @@ struct EliminateBroadcastInForloop : public ir::IRMutator { } bool ContainsLoopVar(Expr expr, Var loop_var) { - return !ir::CollectIRNodes(expr, [&](const Expr* e) -> bool { + return !ir::ir_utils::CollectIRNodes(expr, [&](const Expr* e) -> bool { return e->As() && e->As()->name == loop_var->name; }).empty(); diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index d12a5c9f2dab8..7b30f75bf9652 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -586,7 +586,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> { int BufferSize(ir::Expr indice) { auto copy = IRCopy(indice); - auto vars = ir::CollectIRNodesInOrder( + auto vars = ir::ir_utils::CollectIRNodesInOrder( copy, [](const ir::Expr *expr) { return expr->As(); }); int max_range = 1; diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index 2f3a9b29a3567..357bafe79730a 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -129,7 +129,8 @@ class TensorVectorizeTeller : public ir::IRMutator { // the iter val must appear in the last index if (indices.empty() || - ir::CollectIRNodes(indices.back(), find_matched_var_fn).empty()) { + ir::ir_utils::CollectIRNodes(indices.back(), find_matched_var_fn) + .empty()) { VLOG(5) << "Loop var:" << iter_var_->name << " is not used in the last index"; return false; @@ -137,7 +138,8 @@ class TensorVectorizeTeller : public ir::IRMutator { // the iter val can't appear in mulitple indices for (int i = 0; i < indices.size() - 1; ++i) { - 
auto repeat_found = ir::CollectIRNodes(indices[i], find_matched_var_fn); + auto repeat_found = + ir::ir_utils::CollectIRNodes(indices[i], find_matched_var_fn); if (!repeat_found.empty()) { VLOG(5) << "Loop var:" << iter_var_->name << " is used at more than last index, current:" << i; @@ -214,7 +216,7 @@ class CudaVectorizer : public IRMutator { } void Visit(Expr *expr) { - write_teller_ = ir::CollectTensorNeedsWrite(expr); + write_teller_ = ir::ir_utils::CollectTensorNeedsWrite(expr); vectorized_teller_.Collect(expr); IRMutator::Visit(expr, expr); } diff --git a/paddle/cinn/poly/domain.cc b/paddle/cinn/poly/domain.cc index 309fa5aaa3db4..257de52fe7a5b 100644 --- a/paddle/cinn/poly/domain.cc +++ b/paddle/cinn/poly/domain.cc @@ -70,8 +70,8 @@ void Domain::ExtractParams() { std::unordered_set var_names; auto collect_param_fn = [&](Expr& e) { if (!e.is_constant()) { - auto vars = - ir::CollectIRNodes(e, [](const Expr* e) { return e->is_var(); }); + auto vars = ir::ir_utils::CollectIRNodes( + e, [](const Expr* e) { return e->is_var(); }); for (auto& var : vars) var_names.insert(var.As()->name); } }; diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index faa7a99c0cfde..e2e5dc531c0f7 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -805,7 +805,7 @@ void Stage::SimpleComputeAt(Stage *other, int level) { compute_ats_[other->id()] = relation; auto other_expr = other->expr(); auto find_tensors = - ir::CollectIRNodesWithoutTensor(other_expr, [&](const Expr *x) { + ir::ir_utils::CollectIRNodesWithoutTensor(other_expr, [&](const Expr *x) { return x->as_tensor() && x->as_tensor_ref()->name == tensor()->name; }); if (!find_tensors.empty()) { @@ -1025,7 +1025,7 @@ Iterator Stage::Fuse(const Iterator &level0, const Iterator &level1) { std::vector Stage::input_statements() const { if (!expr_.defined()) return {}; VLOG(3) << "stage " << id() << " expr: " << expr_; - auto load_exprs = ir::CollectIRNodes( + auto load_exprs = ir::ir_utils::CollectIRNodes( expr_, [](const Expr *x) { return x->As(); }); std::set statements; for (auto &expr : load_exprs) { @@ -1563,10 +1563,11 @@ void Stage::ShareBufferWith(Stage *other) { isl_map *__isl_give GatherAccesses(Stage *stage, const std::string &tensor_name) { CHECK(stage->tensor_); - auto loads = ir::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) { - return x->As() && - x->As()->tensor.as_tensor()->name == tensor_name; - }); + auto loads = + ir::ir_utils::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) { + return x->As() && + x->As()->tensor.as_tensor()->name == tensor_name; + }); auto vars = stage->tensor_->axis_with_reduce(); @@ -1888,7 +1889,7 @@ StageMap CreateStages(const std::vector &tensors) { std::set all_tensors(tensors.begin(), tensors.end()); for (auto &tensor : tensors) { - auto used_tensors = ir::CollectIRNodes( + auto used_tensors = ir::ir_utils::CollectIRNodes( tensor->body(), [](const Expr *x) { return x->as_tensor(); }); for (const Expr &x : used_tensors) { all_tensors.insert(x.as_tensor_ref()); From 27d0fed793ac229645371d4b34c1a6c3970a02c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 14:51:47 +0800 Subject: [PATCH 32/39] move ir_verify from namespace optim to ir_utils (#57532) --- paddle/cinn/backends/codegen_c.cc | 2 +- paddle/cinn/backends/codegen_cuda_dev.cc | 2 +- paddle/cinn/backends/llvm/codegen_llvm.cc | 2 +- paddle/cinn/ir/test/ir_verify_test.cc | 10 ++++++---- paddle/cinn/ir/utils/ir_verify.cc | 12 ++++++++---- 
paddle/cinn/ir/utils/ir_verify.h | 9 ++++++--- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc index 2345bf53d36cd..6440339947682 100644 --- a/paddle/cinn/backends/codegen_c.cc +++ b/paddle/cinn/backends/codegen_c.cc @@ -38,7 +38,7 @@ using cinn::common::float16; const char *kCKeywordRestrict = "__restrict__"; void CodeGenC::Compile(const ir::Module &module, const Outputs &outputs) { - ir::IrVerify(Expr(module)); + ir::ir_utils::IrVerify(Expr(module)); if (!outputs.c_header_name.empty()) { auto source = Compile(module, OutputKind::CHeader); diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 1f6f5bba154aa..5a1ddbc450a09 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -56,7 +56,7 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, bool for_nvrtc) { void CodeGenCUDA_Dev::Compile(const ir::Module &module, const Outputs &outputs) { - ir::IrVerify(Expr(module)); + ir::ir_utils::IrVerify(Expr(module)); CodeGenC::inline_builtin_codes_ = false; if (!outputs.c_header_name.empty()) { diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index 5ff8ce03c77b0..b91772bd688b8 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -790,7 +790,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Call *op) { llvm::Value *CodeGenLLVM::Visit(const ir::_Module_ *op) { { Expr body_to_verify(&Reference(op)); - ir::IrVerify(body_to_verify); + ir::ir_utils::IrVerify(body_to_verify); } for (auto &fn : op->functions) { diff --git a/paddle/cinn/ir/test/ir_verify_test.cc b/paddle/cinn/ir/test/ir_verify_test.cc index 06a842ef5ba81..183f20e491fbc 100644 --- a/paddle/cinn/ir/test/ir_verify_test.cc +++ b/paddle/cinn/ir/test/ir_verify_test.cc @@ -18,12 +18,14 @@ #include "paddle/cinn/ir/op/ir_operators.h" -namespace cinn::ir { - +namespace cinn { +namespace ir { +namespace ir_utils { TEST(IrVerify, basic) { Expr a(1); Expr b(1); IrVerify(a + b); } - -} // namespace cinn::ir +} // namespace ir_utils +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_verify.cc b/paddle/cinn/ir/utils/ir_verify.cc index d0f69802438bb..b961e25114249 100644 --- a/paddle/cinn/ir/utils/ir_verify.cc +++ b/paddle/cinn/ir/utils/ir_verify.cc @@ -17,7 +17,10 @@ #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -namespace cinn::ir { +namespace cinn { +namespace ir { +namespace ir_utils { +namespace { struct IrVerifyVisitor : public ir::IRMutator<> { using ir::IRMutator<>::Visit; @@ -30,10 +33,11 @@ struct IrVerifyVisitor : public ir::IRMutator<> { NODETY_FORALL(__) #undef __ }; - +} // namespace void IrVerify(Expr e) { IrVerifyVisitor visitor; visitor.Visit(&e, &e); } - -} // namespace cinn::ir +} // namespace ir_utils +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_verify.h b/paddle/cinn/ir/utils/ir_verify.h index deddb3178282d..d47c97e0197d4 100644 --- a/paddle/cinn/ir/utils/ir_verify.h +++ b/paddle/cinn/ir/utils/ir_verify.h @@ -15,8 +15,11 @@ #pragma once #include "paddle/cinn/ir/ir.h" -namespace cinn::ir { +namespace cinn { +namespace ir { +namespace ir_utils { void IrVerify(Expr e); - -} // namespace cinn::ir +} // namespace ir_utils +} // namespace ir +} // namespace cinn From 98be3d95e2041938fa7e783a07ec5cee56251f38 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 14:52:09 +0800 Subject: [PATCH 33/39] =?UTF-8?q?=E3=80=90CINN=E3=80=91move=20ir=5Freplace?= =?UTF-8?q?=20from=20cinn/optim=20to=20cinn/ir/utils=20(#57524)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move ir_replace from cinn/optim to cinn/ir/utils * delete extra modification --- paddle/cinn/ir/utils/CMakeLists.txt | 3 ++- paddle/cinn/{optim => ir/utils}/ir_replace.cc | 8 +++++--- paddle/cinn/{optim => ir/utils}/ir_replace.h | 7 ++++--- paddle/cinn/optim/CMakeLists.txt | 1 - paddle/cinn/optim/buffer_assign.cc | 2 +- .../cinn/optim/eliminate_broadcast_in_forloop.cc | 4 ++-- paddle/cinn/optim/unroll_loops.cc | 4 ++-- paddle/cinn/optim/vectorize_loops.cc | 16 +++++++++------- paddle/cinn/poly/stage.cc | 2 +- 9 files changed, 26 insertions(+), 21 deletions(-) rename paddle/cinn/{optim => ir/utils}/ir_replace.cc (93%) rename paddle/cinn/{optim => ir/utils}/ir_replace.h (91%) diff --git a/paddle/cinn/ir/utils/CMakeLists.txt b/paddle/cinn/ir/utils/CMakeLists.txt index 5613bf7260155..032bf537d2fce 100644 --- a/paddle/cinn/ir/utils/CMakeLists.txt +++ b/paddle/cinn/ir/utils/CMakeLists.txt @@ -9,4 +9,5 @@ gather_srcs( ir_verify.cc ir_compare.cc ir_nodes_collector.cc - ir_copy.cc) + ir_copy.cc + ir_replace.cc) diff --git a/paddle/cinn/optim/ir_replace.cc b/paddle/cinn/ir/utils/ir_replace.cc similarity index 93% rename from paddle/cinn/optim/ir_replace.cc rename to paddle/cinn/ir/utils/ir_replace.cc index 3dc39a08a3817..da2305359c5e9 100644 --- a/paddle/cinn/optim/ir_replace.cc +++ b/paddle/cinn/ir/utils/ir_replace.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/optim/ir_replace.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include @@ -22,7 +22,8 @@ #include "paddle/cinn/utils/string.h" namespace cinn { -namespace optim { +namespace ir { +namespace ir_utils { using utils::GetStreamCnt; namespace { @@ -65,5 +66,6 @@ void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to) { IrReplaceMutator(from, to)(expr); } -} // namespace optim +} // namespace ir_utils +} // namespace ir } // namespace cinn diff --git a/paddle/cinn/optim/ir_replace.h b/paddle/cinn/ir/utils/ir_replace.h similarity index 91% rename from paddle/cinn/optim/ir_replace.h rename to paddle/cinn/ir/utils/ir_replace.h index 7c95d1e6f6c38..312e4c61eff0a 100644 --- a/paddle/cinn/optim/ir_replace.h +++ b/paddle/cinn/ir/utils/ir_replace.h @@ -18,10 +18,11 @@ #include "paddle/cinn/ir/ir.h" namespace cinn { -namespace optim { +namespace ir { +namespace ir_utils { //! Replace the variable \p v to expression \p e in expression \p expr. 
void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to); - -} // namespace optim +} // namespace ir_utils +} // namespace ir } // namespace cinn diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 1b4a55479ef0b..03b8c95b74173 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -4,7 +4,6 @@ gather_srcs( cinnapi_src SRCS replace_call_with_expr.cc - ir_replace.cc replace_var_with_expr.cc ir_simplify.cc optimize.cc diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc index 175689defbe36..f749cac9ba502 100644 --- a/paddle/cinn/optim/buffer_assign.cc +++ b/paddle/cinn/optim/buffer_assign.cc @@ -17,8 +17,8 @@ #include "paddle/cinn/common/union_find.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/lang/lower_impl.h" -#include "paddle/cinn/optim/ir_replace.h" namespace cinn { namespace optim { diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc index bb546f694be9d..e836563a9feb0 100644 --- a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc +++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc @@ -19,8 +19,8 @@ #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/ir/utils/ir_visitor.h" -#include "paddle/cinn/optim/ir_replace.h" namespace cinn { namespace optim { @@ -54,7 +54,7 @@ struct EliminateBroadcastInForloop : public ir::IRMutator { std::tie(let_expr, tmp) = CreateTmpLet(broadcast); let_exprs.push_back(let_expr); - optim::IrReplace(expr, broadcast, tmp); + cinn::ir::ir_utils::IrReplace(expr, broadcast, tmp); } // insert the let expressions to the outer forloop. 
diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc index fc5fab85eca5f..32d4037b83e3e 100644 --- a/paddle/cinn/optim/unroll_loops.cc +++ b/paddle/cinn/optim/unroll_loops.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_replace.h" +#include "paddle/cinn/ir/utils/ir_replace.h" namespace cinn { namespace optim { @@ -95,7 +95,7 @@ struct UnrollMutator : public ir::IRMutator { for (int i = min->value; i < extent->value; i++) { Expr start = op->min + i; body.push_back(optim::IRCopy(op->body)); - optim::IrReplace(&body.back(), op->loop_var, start); + cinn::ir::ir_utils::IrReplace(&body.back(), op->loop_var, start); } *expr = ir::Block::Make(body); diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index 357bafe79730a..8ed13e9d5971b 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -29,7 +29,7 @@ #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_replace.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/unroll_loops.h" #include "paddle/cinn/utils/functional.h" @@ -149,11 +149,11 @@ class TensorVectorizeTeller : public ir::IRMutator { // check tensor accessed sequentially by comparing index one by one Expr first_idx = optim::IRCopy(indices.back()); - optim::IrReplace(&first_idx, Expr(iter_var_), Expr(0)); + cinn::ir::ir_utils::IrReplace(&first_idx, Expr(iter_var_), Expr(0)); const auto &interval = var_intervals_->at(iter_var_->name); for (int i = 1; i < interval.r; ++i) { Expr next_idx = optim::IRCopy(indices.back()); - optim::IrReplace(&next_idx, Expr(iter_var_), Expr(i)); + cinn::ir::ir_utils::IrReplace(&next_idx, Expr(iter_var_), Expr(i)); auto gap = common::AutoSimplify(Expr(next_idx - first_idx)); if (!gap.As() || gap.as_int32() != i) { VLOG(5) << "Tensor:" << tensor->name @@ -310,7 +310,8 @@ class CudaVectorizer : public IRMutator { // generate a get_addr expr to get the address of the tensor Expr converted_tensor = Load::Make(tensor, indices); - optim::IrReplace(&converted_tensor, iter_var_, Expr(int32_t(0))); + cinn::ir::ir_utils::IrReplace( + &converted_tensor, iter_var_, Expr(int32_t(0))); auto get_addr = ir::intrinsics::GetAddr::Make(converted_tensor); // generate a let expression to cast the tensor into the local vector @@ -888,7 +889,7 @@ struct VectorizeLoops_ : public IRMutator { ForType::Serial, DeviceAPI::UNK, IRCopy(inner_for->body))}); - optim::IrReplace( + cinn::ir::ir_utils::IrReplace( &inner_for_b, inner_for->loop_var, Expr(new_iterator_inner)); Expr out_for_b = For::Make(new_iterator_outer, @@ -898,7 +899,7 @@ struct VectorizeLoops_ : public IRMutator { outer_for->device_api, inner_for_b, outer_for->vectorize_info()); - optim::IrReplace( + cinn::ir::ir_utils::IrReplace( &out_for_b, outer_for->loop_var, Expr(new_iterator_outer)); *expr = Block::Make({out_for_a, out_for_b}); VLOG(2) << *expr; @@ -960,7 +961,8 @@ struct VectorizeLoops_ : public IRMutator { } else { new_index = Expr(forloop->loop_var) * factor + Expr(new_iterator); } - optim::IrReplace(&forloop->body, forloop->loop_var, new_index); + cinn::ir::ir_utils::IrReplace( + &forloop->body, forloop->loop_var, new_index); auto new_forloop = For::Make(new_iterator, forloop->min, 
make_const(factor), diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index e2e5dc531c0f7..d74bce1404e5b 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -28,9 +28,9 @@ #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/poly/compute_at_transform.h" From 55b7523779bbbed757c4e5b8294e12df64f79af5 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 21 Sep 2023 15:36:50 +0800 Subject: [PATCH 34/39] [clang-tidy] NO.23 bugprone-branch-clone (#57522) * clangtidyNo23 * fix * fix --- .clang-tidy | 2 +- .../collective/processgroup_comm_utils.cc | 2 +- paddle/fluid/framework/details/fetch_op_handle.cc | 2 +- paddle/fluid/framework/downpour_worker.cc | 5 ++--- paddle/fluid/framework/executor_cache.cc | 2 +- paddle/fluid/framework/io/fs.cc | 9 ++++----- .../fluid/framework/ir/constant_folding_pass.cc | 4 +--- .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 5 ++--- .../garbage_collector/event_garbage_collector.cc | 7 ++++--- .../garbage_collector/fast_garbage_collector.cc | 7 ++++--- .../new_executor/interpreter/static_build.cc | 6 ++---- .../framework/new_executor/new_ir_interpreter.cc | 7 ++++--- .../framework/new_executor/program_interpreter.cc | 7 ++++--- paddle/fluid/framework/operator.cc | 9 ++------- paddle/fluid/framework/parallel_executor.cc | 10 ++++------ paddle/fluid/framework/tensor_util.cc | 6 ++++-- paddle/fluid/framework/var_desc.cc | 7 ++----- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- paddle/fluid/memory/memcpy.cc | 2 +- paddle/fluid/operators/batch_norm_op.cc | 4 ---- paddle/fluid/operators/data_norm_op.cc | 2 -- .../operators/detection/multiclass_nms_op.cc | 15 ++------------- .../operators/fused/fused_bn_activation_op.cc | 2 -- .../operators/fused/fused_bn_add_activation_op.cc | 2 -- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 2 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 2 +- .../operators/fused/mkldnn/multi_gru_mkldnn_op.cc | 4 ++-- paddle/fluid/operators/inplace_abn_op.cc | 2 -- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/reshape_mkldnn_op.cc | 2 -- paddle/fluid/operators/reader/buffered_reader.cc | 6 ++---- paddle/fluid/operators/sum_op.cc | 2 +- .../pir/phi_kernel_adaptor/phi_kernel_util.cc | 5 ++--- paddle/fluid/platform/place.cc | 6 +----- .../fluid/prim/api/manual_prim/static_prim_api.cc | 2 -- paddle/fluid/pybind/eager_method.cc | 6 ++---- paddle/fluid/pybind/eager_properties.cc | 6 ++---- paddle/fluid/pybind/eager_utils.cc | 9 +++------ paddle/fluid/pybind/inference_api.cc | 2 +- paddle/fluid/pybind/op_function_common.cc | 4 +--- paddle/phi/core/compat/convert_utils.cc | 2 +- paddle/phi/core/kernel_factory.cc | 5 ++--- paddle/phi/infermeta/unary.cc | 2 +- paddle/phi/kernels/cpu/diagonal_grad_kernel.cc | 6 ++---- .../phi/kernels/cpu/generate_proposals_kernel.cc | 8 +------- .../phi/kernels/cpu/send_ue_recv_grad_kernel.cc | 4 ++-- paddle/phi/kernels/funcs/vol2col.cc | 4 ++-- 47 files changed, 79 insertions(+), 142 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 6a6700c192027..924095b4def28 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -4,7 +4,7 @@ bugprone-argument-comment, 
-bugprone-assert-side-effect, -bugprone-bad-signal-to-kill-thread, -bugprone-bool-pointer-implicit-conversion, --bugprone-branch-clone, +bugprone-branch-clone, bugprone-copy-constructor-init, -bugprone-dangling-handle, -bugprone-dynamic-static-initializers, diff --git a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc index 94723906fccb1..eec697f523945 100644 --- a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc +++ b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc @@ -51,7 +51,7 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { #else return nullptr; #endif - } else if (place.GetType() == phi::AllocationType::CUSTOM) { + } else if (place.GetType() == phi::AllocationType::CUSTOM) { // NOLINT #if defined(PADDLE_WITH_CUSTOM_DEVICE) return static_cast(pg)->XCCLComm( place); diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 2a504b2a0fc2b..b71c476a2c95e 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -120,7 +120,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { - if (platform::is_gpu_place(src_item.place())) { + if (platform::is_gpu_place(src_item.place())) { // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 8a0406864cde7..e69a25bb32781 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -362,9 +362,8 @@ void DownpourWorker::CopySparseTable() { if (src_table == dest_table) { continue; } else if (!copy_table_config_.sparse_copy_by_feasign()) { - if (feasign_set_.find(src_table) == feasign_set_.end()) { - continue; - } else if (feasign_set_[src_table].empty()) { + if (feasign_set_.find(src_table) == feasign_set_.end() || + feasign_set_[src_table].empty()) { continue; } feanum = fleet_ptr_->CopyTable(src_table, dest_table); diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 64d5ce24d20fe..5613a8dbf155e 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -47,7 +47,7 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { execution_strategy.num_threads_ = 2; break; } - case platform::DeviceType::CUDA: { + case platform::DeviceType::CUDA: { // NOLINT // NOTE: According experiments, one thread is faster in // most model training. 
execution_strategy.num_threads_ = 1; diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc index a39147a97cf7e..4a689409d412b 100644 --- a/paddle/fluid/framework/io/fs.cc +++ b/paddle/fluid/framework/io/fs.cc @@ -399,13 +399,12 @@ void hdfs_mv(const std::string& src, const std::string& dest) { } int fs_select_internal(const std::string& path) { - if (fs_begin_with_internal(path, "hdfs:")) { - return 1; - } else if (fs_begin_with_internal(path, "afs:")) { + if (fs_begin_with_internal(path, "hdfs:") || + fs_begin_with_internal(path, "afs:")) { return 1; + } else { + return 0; } - - return 0; } std::shared_ptr fs_open_read(const std::string& path, diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index 3b3f23933fb6d..f8e0ac9475b5d 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -81,9 +81,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { std::unordered_map map; for (auto in_node : op_node->inputs) { map[in_node->Name()] = 0; - if (!in_node->Var()->Persistable()) { - input_persis = false; - } else if (!in_node->inputs.empty()) { + if (!in_node->Var()->Persistable() || !in_node->inputs.empty()) { input_persis = false; } } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 8f19225dc53b4..655183dc712c0 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -400,9 +400,8 @@ void QuantDequantMkldnnPass::RemoveFakeOps( if (fake_quantize_types.count(op_node->Name())) { CollectFakeQuantizeOps(graph, op_node, &nodes2rm); - } else if (fake_dequantize_types.count(op_node->Name())) { - CollectFakeDequantizeOps(graph, op_node, &nodes2rm); - } else if (fake_quantize_dequantize_types.count(op_node->Name())) { + } else if (fake_dequantize_types.count(op_node->Name()) || + fake_quantize_dequantize_types.count(op_node->Name())) { CollectFakeDequantizeOps(graph, op_node, &nodes2rm); } else if (onnx_format_quantize_dequantize_types.count(op_node->Name())) { CollectQuantizeDequantizeOpsFromONNXFormat(graph, op_node, &nodes2rm); diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index e826c94712568..e63164c020c36 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -88,9 +88,10 @@ void InterpreterCoreEventGarbageCollector::Add( if (var->IsType()) { Add(var->GetMutable()->MoveMemoryHolder(), event, ctx); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? 
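The .clang-tidy change above enables the bugprone-branch-clone check, and the hunks in this patch apply its two standard remedies: merge if / else-if branches whose bodies are identical into a single condition, or keep the duplication and silence the warning with // NOLINT where the bodies only coincide because of preprocessor guards. A minimal sketch of the merge pattern, mirroring the fs_select_internal hunk above but using plain standard C++ rather than Paddle code:

    #include <cstdio>
    #include <string>

    // Returns 1 for both remote filesystem schemes, 0 otherwise.
    int select_fs(const std::string& path) {
      // Before (flagged by bugprone-branch-clone): an if and an else-if
      // whose bodies were both "return 1;".
      // After: the duplicate branches are merged into one condition.
      if (path.rfind("hdfs:", 0) == 0 || path.rfind("afs:", 0) == 0) {
        return 1;
      }
      return 0;
    }

    int main() {
      std::printf("%d %d\n", select_fs("hdfs://a/b"), select_fs("/local/c"));
      return 0;
    }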
} else if (var->IsType()) { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc index 4bc8b298012ab..e7efc1f10c324 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc @@ -34,9 +34,10 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) { if (var->IsType()) { Add(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? } else if (var->IsType()) { diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 0f9bd3f387a92..67b75bb523711 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -267,10 +267,8 @@ phi::TensorBase* GetTensorFormVar(framework::Variable* var) { return var->template GetMutable(); } else if (var->template IsType()) { return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (!var->IsInitialized()) { - // The following is for RAW type of var + } else if (var->template IsType() || + !var->IsInitialized()) { return var->template GetMutable(); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 47823eb82b428..2dc6181180c9d 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -758,9 +758,10 @@ void NewIRInterpreter::RecordStreamForGC(InstructionBase* instr) { if (var->IsType()) { TensorRecordStream(*(var->GetMutable())); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // do nothing } else if (var->IsType()) { TensorRecordStream( diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 1384a9fb487de..2e466962c4d31 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -1292,9 +1292,10 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { if (var->IsType()) { TensorRecordStream(*(var->GetMutable())); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // do nothing } else if (var->IsType()) { TensorRecordStream( diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9b9979bc70f4c..7a3271a48debc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2777,8 +2777,6 @@ void OperatorWithKernel::ParseInputDataType( const phi::DenseTensor* t = nullptr; if 
(var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); } else if (var->IsType()) { @@ -3221,11 +3219,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (var->template IsType()) { tensor_out = var->template GetMutable(); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); - } else if (!var->IsInitialized()) { - // The following is for RAW type of var + } else if (var->template IsType() || + !var->IsInitialized()) { tensor_out = var->template GetMutable(); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8b6363d93d134..e6c11df275b56 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -693,7 +693,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { - if (member_->build_strategy_.num_trainers_ > 1) { + if (member_->build_strategy_.num_trainers_ > 1) { // NOLINT // 1. num_tariners would be grater than 1 for nccl distributed training. return true; } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { @@ -936,11 +936,9 @@ void ParallelExecutor::BCastParamsToDevices( auto share_memory = [&] { t->ShareDataWith(main_tensor); }; // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->build_strategy_.async_mode_) { - share_memory(); - } else if (member_->use_all_reduce_ || - member_->IsUseCUDA(member_->use_device_) || - var == "@LR_DECAY_COUNTER@") { + if (member_->use_all_reduce_ || + member_->IsUseCUDA(member_->use_device_) || + var == "@LR_DECAY_COUNTER@") { copy_memory(); } else { share_memory(); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 6fe75d1a90dab..90612e5692595 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -78,7 +78,8 @@ void TensorCopyImpl(const TENSOR& src, auto size = src.numel() * phi::SizeOf(src.dtype()); #endif - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (platform::is_cpu_place(src_place) && + platform::is_cpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -327,7 +328,8 @@ void TensorCopySync(const phi::DenseTensor& src, return; } auto size = src.numel() * phi::SizeOf(src.dtype()); - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (platform::is_cpu_place(src_place) && + platform::is_cpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index b0130e055c075..836ba0fb762b3 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -386,11 +386,8 @@ struct SetVarAttrDescVisitor { template void operator()(T &&v) { using U = std::decay_t; - if (std::is_same::value) { - set_attr_value(v); - } else if (std::is_same::value) { - set_attr_value(v); - } else if (std::is_same>::value) { + if (std::is_same::value || std::is_same::value || + 
std::is_same>::value) { set_attr_value(v); } else { PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6b57f1fabf4bd..70da22a3240e9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2006,7 +2006,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( static_cast(scope), this->GetDeviceContexts())); res->input_or_output_ = true; res->SetName(name); - if (platform::is_cpu_place(place_)) { + if (platform::is_cpu_place(place_)) { // NOLINT res->SetPlace(PaddlePlace::kCPU); } else if (platform::is_ipu_place(place_)) { // Currently, IPUPlace's tensor copy between cpu and ipu has been set in @@ -2057,7 +2057,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( static_cast(scope), this->GetDeviceContexts())); res->input_or_output_ = false; res->SetName(name); - if (platform::is_cpu_place(place_)) { + if (platform::is_cpu_place(place_)) { // NOLINT res->SetPlace(PaddlePlace::kCPU); } else if (platform::is_ipu_place(place_)) { // Currently, IPUPlace's tensor copy between cpu and ipu has been set in diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 656d6273afb3f..cf253d6c4ebdc 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -743,7 +743,7 @@ void Copy(phi::Place dst_place, VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { + dst_place.GetType() == phi::AllocationType::CPU) { // NOLINT std::memcpy(dst, src, num); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 4f1c7ab3857d7..1d45cee715409 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -386,8 +386,6 @@ phi::KernelKey BatchNormGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( @@ -530,8 +528,6 @@ phi::KernelKey BatchNormDoubleGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 493351654d5eb..2e70168876162 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -495,8 +495,6 @@ class DataNormGradOp : public framework::OperatorWithKernel { const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 432713c60d969..8519752bc1049 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -101,11 +101,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { } // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. 
- if (score_size == 3) { - ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); - } else { - ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); - } + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); if (!ctx->IsRuntime()) { ctx->SetLoDLevel("Out", std::max(ctx->GetLoDLevel("BBoxes"), 1)); } @@ -584,14 +580,7 @@ class MultiClassNMS2Op : public MultiClassNMSOp { void InferShape(framework::InferShapeContext* ctx) const override { MultiClassNMSOp::InferShape(ctx); - - auto score_dims = ctx->GetInputDim("Scores"); - auto score_size = score_dims.size(); - if (score_size == 3) { - ctx->SetOutputDim("Index", {-1, 1}); - } else { - ctx->SetOutputDim("Index", {-1, 1}); - } + ctx->SetOutputDim("Index", {-1, 1}); if (!ctx->IsRuntime()) { ctx->SetLoDLevel("Index", std::max(ctx->GetLoDLevel("BBoxes"), 1)); } diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 88b11f1ef39c5..ca59a466a5c2b 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -303,8 +303,6 @@ phi::KernelKey FusedBatchNormActGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index a33a91b082e5c..ed416d4ad13d1 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -267,8 +267,6 @@ phi::KernelKey FusedBatchNormAddActGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 05d1e64f92ae7..5ec5e8081bb6f 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -248,7 +248,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { const bool force_fp32_output = ctx.Attr("force_fp32_output"); // BF16 does not support force output - if (!is_bf16 && force_fp32_output) { + if (!is_bf16 && force_fp32_output) { // NOLINT RunKernel(ctx); } else { RunKernel(ctx); diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index d973c5e89a626..4972db5804322 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -329,7 +329,7 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { const bool force_fp32_output = ctx.Attr("force_fp32_output"); // BF16 does not support force output - if (!is_bf16 && force_fp32_output) { + if (!is_bf16 && force_fp32_output) { // NOLINT RunKernel(ctx); } else { RunKernel(ctx); diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 90ecbe4506d98..1c8e0a1b56a97 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -688,7 +688,7 @@ class MultiGRUMKLDNNKernel : public framework::OpKernel { const bool force_fp32_output = 
ctx.HasAttr("force_fp32_output") && ctx.Attr("force_fp32_output"); - if (force_fp32_output) { + if (force_fp32_output) { // NOLINT RunKernel(ctx); } else { RunKernel(ctx); @@ -706,7 +706,7 @@ class MultiGRUMKLDNNKernel : public framework::OpKernel { auto gru_out_L2R = handler.executeSingleGru(input_mem, layer, L2R); handler.reorderInputL2RtoR2L(input_mem, layer); auto gru_out_R2L = handler.executeSingleGru(input_mem, layer, R2L); - if (layer < layers - 1) + if (layer < layers - 1) // NOLINT handler.template reorderOutputR2LtoL2R(gru_out_R2L, layer); else handler.template reorderOutputR2LtoL2R(gru_out_R2L, layer); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index eee0f1f304bc3..a53a9867b9903 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -284,8 +284,6 @@ class InplaceABNGradOp : public framework::OperatorWithKernel { const phi::DenseTensor* t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index a7f6bc512ffce..692b7f0721ceb 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -359,7 +359,7 @@ class FCMKLDNNKernel : public framework::OpKernel { bool fuse_relu = ctx.Attr("activation_type") == "relu"; IF_CHANGE_FC_TW_TYPENAME((std::is_same::value), ([&] { - if (force_fp32_output) { + if (force_fp32_output) { // NOLINT this->RunKernel(ctx); } else if (phi::funcs::is_int8()) { if (fuse_relu) { diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index b7a33edb82a00..3c53b05152b7e 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -105,8 +105,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { InferShapeSqueezeOp(ctx, x_dims, out_dims); break; case ReshapeKernelOpName::flatten: - InferShapeFlattenOp(ctx, x_dims, out_dims); - break; case ReshapeKernelOpName::flatten2: InferShapeFlattenOp(ctx, x_dims, out_dims); break; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 2e24caa91c6bb..b73ffe4319be7 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -213,10 +213,8 @@ void BufferedReader::ReadAsync(size_t i) { auto cpu_ptr = cpu[i].data(); auto gpu_ptr = gpu_ptrs[i]; auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); - if (platform::is_cuda_pinned_place(cpu_place)) { - memory::Copy( - place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get()); - } else if ((platform::is_gpu_place(cpu_place))) { + if (platform::is_cuda_pinned_place(cpu_place) || + platform::is_gpu_place(cpu_place)) { memory::Copy( place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get()); } else { diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 5cf9fba9f2681..ebb4cd7cf132d 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -76,7 +76,7 @@ class SumOp : public framework::OperatorWithKernel { // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL if (!((data_type == framework::proto::VarType::FP32 || data_type == framework::proto::VarType::BF16) && - ctx.OutputVar("Out")->IsType())) { + 
ctx.OutputVar("Out")->IsType())) { // NOLINT this->SetDnnFallback(true); } else if (!std::all_of(x_vars.begin(), x_vars.end(), diff --git a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc index a3997ee97db6a..437523e41bf3e 100644 --- a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc @@ -196,9 +196,8 @@ void BuildValue(pir::Value value, variable_list); } // Only support DenseTensor or Vector - if (!value.type()) { - var->GetMutable(); - } else if (value.type().isa()) { + if (!value.type() || + value.type().isa()) { var->GetMutable(); } else if (value.type().isa()) { var->GetMutable(); diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index b8452a594e358..d38d0418e4639 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -62,11 +62,7 @@ bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { return true; - } else if (is_xpu_place(p1)) { - return p1 == p2; - } else if (is_ipu_place(p1)) { - return p1 == p2; - } else if (is_custom_place(p1)) { + } else if (is_xpu_place(p1) || is_ipu_place(p1) || is_custom_place(p1)) { return p1 == p2; } else { return p1 == p2; diff --git a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc index c907be2d10256..c45a473b4a8d3 100644 --- a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc +++ b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc @@ -50,8 +50,6 @@ Tensor full(const IntArray& shape, op->SetAttr("shape", shape.GetData()); switch (dtype) { case phi::DataType::FLOAT16: - op->SetAttr("str_value", std::to_string(value.to())); - break; case phi::DataType::BFLOAT16: op->SetAttr("str_value", std::to_string(value.to())); break; diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 59ef86423788a..e72f5dc77f99c 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1617,7 +1617,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, py::isinstance(value_obj_tmp) || py::isinstance(value_obj_tmp) || PyComplex_Check(value_obj)) { - if (self->tensor.dtype() == phi::DataType::FLOAT32) { + if (self->tensor.dtype() == phi::DataType::FLOAT32 || + self->tensor.dtype() == phi::DataType::FLOAT16) { attrs["values"] = std::vector{ value_obj_tmp.cast()}; } else if (self->tensor.dtype() == phi::DataType::FLOAT64) { @@ -1632,9 +1633,6 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, } else if (self->tensor.dtype() == phi::DataType::BOOL) { attrs["values"] = std::vector{ value_obj_tmp.cast()}; - } else if (self->tensor.dtype() == phi::DataType::FLOAT16) { - attrs["values"] = std::vector{ - value_obj_tmp.cast()}; } else if (self->tensor.dtype() == phi::DataType::COMPLEX64) { attrs["values"] = std::vector{ value_obj_tmp.cast>()}; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 59ecee2c5d668..517c210830022 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -92,13 +92,11 @@ Tensor's type. 
PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { EAGER_TRY - if (!self->tensor.defined()) { + if (!self->tensor.defined() || self->tensor.is_dense_tensor()) { // be same to old dygraph return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); } - if (self->tensor.is_dense_tensor()) { - return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); - } else if (self->tensor.is_selected_rows()) { + if (self->tensor.is_selected_rows()) { return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS); } else if (egr::IsVariableCompatTensor(self->tensor)) { return ToPyObject(static_cast( diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 0432ca88d6ada..87660d9fd88ca 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -173,13 +173,11 @@ bool PyObject_CheckIRVectorOfOpResult(PyObject* obj) { } } bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { - if (obj == Py_None) { + if (obj == Py_None || obj == Py_False) { return false; // To be compatible with QA integration testing. Some // test cases pass in None. } else if (obj == Py_True) { return true; - } else if (obj == Py_False) { - return false; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -1125,9 +1123,8 @@ static paddle::Tensor& GetTensorFromPyObject(const std::string& op_type, return emptytensor; } - if (PyObject_TypeCheck(obj, p_tensor_type)) { - return reinterpret_cast(obj)->tensor; - } else if (PyObject_TypeCheck(obj, p_string_tensor_type)) { + if (PyObject_TypeCheck(obj, p_tensor_type) || + PyObject_TypeCheck(obj, p_string_tensor_type)) { return reinterpret_cast(obj)->tensor; } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b1fbf43aac8b6..bd569f328b115 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -239,7 +239,7 @@ void PaddleInferTensorCreate(paddle_infer::Tensor &tensor, // NOLINT paddle_infer::PlaceType ToPaddleInferPlace( phi::AllocationType allocation_type) { - if (allocation_type == phi::AllocationType::CPU) { + if (allocation_type == phi::AllocationType::CPU) { // NOLINT return paddle_infer::PlaceType::kCPU; } else if (allocation_type == phi::AllocationType::GPU) { return paddle_infer::PlaceType::kGPU; diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 366465e6b2984..9d8074628fb13 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -121,13 +121,11 @@ bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } bool CastPyArg2Boolean(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { - if (obj == Py_None) { + if (obj == Py_None || obj == Py_False) { return false; // To be compatible with QA integration testing. Some // test case pass in None. } else if (obj == Py_True) { return true; - } else if (obj == Py_False) { - return false; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index d82b37328850f..d4c5de0dbe6dc 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -67,7 +67,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); #endif #ifdef PADDLE_WITH_DNNL - case phi::Backend::ONEDNN: + case phi::Backend::ONEDNN: // NOLINT return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index d58decadfadca..f9c1dca46b2fb 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -63,9 +63,8 @@ KernelFactory& KernelFactory::Instance() { bool KernelFactory::HasCompatiblePhiKernel(const std::string& op_type) const { if (deprecated_op_names.find(op_type) == deprecated_op_names.end()) { - if (phi::OpUtilsMap::Instance().Contains(op_type)) { - return true; - } else if (kernels_.find(op_type) != kernels_.end()) { + if (phi::OpUtilsMap::Instance().Contains(op_type) || + (kernels_.find(op_type) != kernels_.end())) { return true; } } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index aa1b6526cd5f8..e0df80157013e 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1130,7 +1130,7 @@ void ExpandInferMeta(const MetaTensor& x, std::max(static_cast(x_dims.size()), expand_shape.size()); std::vector out_shape(out_rank); for (int i = 0; i < static_cast(expand_shape.size()); ++i) { - if (x_dims[i] == -1) { + if (x_dims[i] == -1) { // NOLINT out_shape[i] = -1; } else if (expand_shape[i] == -1) { if (static_cast(x_dims.size()) > i) { diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 5ccb5ad8c43b4..d8383b45beb79 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -63,10 +63,8 @@ void DiagonalGradKernel(const Context& dev_ctx, idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_)); bool flag = false; - if (offset_ == 0 && axis1_dim == axis2_dim) { - idx_dim.push_back(axis1_dim); - flag = true; - } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) { + if ((offset_ == 0 && axis1_dim == axis2_dim) || + (offset_ > 0 && (axis1_dim + offset_) == axis2_dim)) { idx_dim.push_back(axis1_dim); flag = true; } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) { diff --git a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc index 2e468ef2d07ff..e9764035613ed 100644 --- a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc +++ b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc @@ -52,13 +52,7 @@ void ClipTiledBoxes(const phi::CPUContext& ctx, T im_h = is_scale ? round(im_info_data[0] / im_info_data[2]) : im_info_data[0]; for (int64_t i = 0; i < input_boxes.numel(); ++i) { - if (i % 4 == 0) { - out_data[i] = - std::max(std::min(input_boxes_data[i], im_w - offset), zero); - } else if (i % 4 == 1) { - out_data[i] = - std::max(std::min(input_boxes_data[i], im_h - offset), zero); - } else if (i % 4 == 2) { + if ((i % 4 == 0) || (i % 4 == 2)) { out_data[i] = std::max(std::min(input_boxes_data[i], im_w - offset), zero); } else { diff --git a/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc index 0ca3be62a3971..fac19f142dffc 100644 --- a/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc @@ -256,7 +256,7 @@ void CalculateEGrad(const T* out_grad_data, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; - if (message_op == "ADD") { + if (message_op == "ADD") { // NOLINT #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif @@ -283,7 +283,7 @@ void CalculateEGrad(const T* out_grad_data, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - if (message_op == "ADD") { + if (message_op == "ADD") { // NOLINT #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc index 0f411b8894ce9..e505fcb3de337 100644 --- a/paddle/phi/kernels/funcs/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -66,7 +66,7 @@ class Vol2ColFunctor { // changed bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_forth = paddings[0]; int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; @@ -191,7 +191,7 @@ class Col2VolFunctor { input_channels * filter_depth * filter_height * filter_width; bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_forth = paddings[0]; int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; From 4c856f9d714999ade2cea66728c8e498067c5c1d Mon Sep 17 00:00:00 2001 From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:33:11 +0800 Subject: [PATCH 35/39] [PRIM][PIR]Migrate prim rules (#57554) * fix bugs of generating Op::Build when Op has optional tensor * add default constructor for IrMetaTensor * fix bugs * polish guard * pir support prim gelu and rsqrt * support prim bwd ops * migrate vjp rules of cast,add,multiply,elementwise_pow * add cast as primitive op * fix bugs in elementwise_pow_grad * add test for cast_grad * add test for elementwise_add_grad * add test for elementwise_mul_grad * add test for elementwise_pow_grad * fix bugs * fix bugs * support pir prim backward ops * refien * fix bug * migrate layer_norm custom vjp rules to pir * fix bugs in ir_backward * fix backward , scope, and concat_grad prim * add layer_norm fwd decompose logic * fix pow * change _use_new_ir_api to in_pir_mode * add _static_guard * fix * fix executor cuda700 error caused by full and full_like * refine * add vjp rules * fix bugs * add scope * add test * add add op prim rules --------- Co-authored-by: YuanRisheng Co-authored-by: cyber-pioneer Co-authored-by: Charles-hit Co-authored-by: zhangbo9674 --- paddle/fluid/primitive/codegen/gen.py | 18 +- .../rule/vjp/generated/generated_vjp.cc.j2 | 2 +- paddle/fluid/primitive/primitive.yaml | 1 + paddle/fluid/primitive/rule/vjp/details.h | 389 ++++++++++++++++-- paddle/fluid/pybind/ir.cc | 4 + paddle/phi/api/yaml/legacy_backward.yaml | 2 +- python/paddle/autograd/ir_backward.py | 5 +- python/paddle/decomposition/rules.py | 80 ++++ python/paddle/tensor/creation.py | 9 +- test/legacy_test/prim_op_test.py | 9 +- test/legacy_test/test_activation_op.py | 79 +++- test/legacy_test/test_cast_op.py | 10 +- test/legacy_test/test_concat_op.py | 162 +++++++- test/legacy_test/test_elementwise_add_op.py | 10 + test/legacy_test/test_elementwise_mul_op.py | 19 +- test/legacy_test/test_elementwise_pow_op.py | 19 +- 
test/legacy_test/test_layer_norm_op.py | 56 ++- test/legacy_test/test_reshape_op.py | 19 +- test/legacy_test/test_split_op.py | 25 +- test/legacy_test/test_sum_op.py | 31 +- test/legacy_test/test_transpose_op.py | 40 +- 21 files changed, 882 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index f9a920730967d..e0eeeb10a3a4d 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -72,8 +72,20 @@ ] -PRIM_VJP = ['divide_grad', 'sum_grad'] # vjp list of primitive op -CUSTOM_VJP = ['gelu_grad'] # custom vjp list of composite op +PRIM_VJP = [ + 'divide_grad', + 'sum_grad', + 'cast_grad', + 'add_grad', + 'multiply_grad', + 'elementwise_pow_grad', + 'reshape_grad', + 'split_grad', + 'tanh_grad', + 'transpose_grad', + 'concat_grad', +] # vjp list of primitive op +CUSTOM_VJP = ['gelu_grad', 'layer_norm_grad'] # custom vjp list of composite op VJP_COMPS = PRIM_VJP + CUSTOM_VJP BACKENDS = [ @@ -149,6 +161,8 @@ 'embedding_grad', 'sqrt', 'uniform', + 'split', + 'transpose', ] diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 index 1ab275ceaecbf..6737a73d69eb5 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 @@ -106,7 +106,7 @@ paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{ {% else %} std::vector {{api.outputs[i].name}}(stop_gradients[{{i}}].size(), nullptr); for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) { - {{api.outputs[i].name}} = !stop_gradients[{{i}}][i] ? &vjp_res[{{i}}][i] : nullptr; + {{api.outputs[i].name}}[i] = !stop_gradients[{{i}}][i] ? &vjp_res[{{i}}][i] : nullptr; } {% endif %} {% endfor %} diff --git a/paddle/fluid/primitive/primitive.yaml b/paddle/fluid/primitive/primitive.yaml index a42e2503e31ba..ccf9673bafba0 100644 --- a/paddle/fluid/primitive/primitive.yaml +++ b/paddle/fluid/primitive/primitive.yaml @@ -49,3 +49,4 @@ - erf - tanh - full +- cast diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index eb640a4643ed3..96b4d051b7cde 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -134,32 +134,371 @@ void gelu_grad(const Tensor& x, // Promote to fp32 when the input type is fp16 for keeping consistent with // phi kernel - // Scale only support fp32 attr in static graph mode, use elementwise_xx - // when precision is over fp32. 
- if (approximate) { - auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; - auto kKappa = 0.044715; - auto x_sq = x * x; - auto x_cube = x_sq * x; - auto inner = kBeta * (x + kKappa * x_cube); - auto tanh_inner = tanh(inner); - - auto left = scale(x, 0.5); - auto right = scale(tanh_inner, 1., 1.); - - auto left_derivative = scale(right, 0.5); - - auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); - auto inner_derivative = kBeta * (scale(3 * kKappa * x_sq, 1., 1.)); - auto right_derivative = left * tanh_derivative * inner_derivative; - - set_output(out_grad * (left_derivative + right_derivative), x_grad); + if (x.dtype() == phi::DataType::FLOAT16 || + x.dtype() == phi::DataType::BFLOAT16) { + auto promoted_x = cast(x, phi::DataType::FLOAT32); + auto promoted_out_grad = cast(out_grad, phi::DataType::FLOAT32); + if (approximate) { + float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5; + float kkappa = 0.044715; + auto x_sq = promoted_x * promoted_x; + auto x_cube = x_sq * promoted_x; + auto inner = kbeta * (promoted_x + kkappa * x_cube); + auto tanh_inner = tanh(inner); + + auto left = scale(promoted_x, 0.5); + auto right = scale(tanh_inner, 1., 1.); + + auto left_derivative = scale(right, 0.5); + + auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); + auto inner_derivative = kbeta * (scale(3 * kkappa * x_sq, 1., 1.)); + auto right_derivative = left * tanh_derivative * inner_derivative; + + set_output( + cast(promoted_out_grad * (left_derivative + right_derivative), + x.type()), + x_grad); + } else { + float kalpha = M_SQRT1_2; + float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto cdf = scale(scale(erf(kalpha * promoted_x), 1., 1.), 0.5); + auto pdf = kbeta * exp(scale(promoted_x * promoted_x, -0.5)); + set_output( + cast(promoted_out_grad * (cdf + promoted_x * pdf), x.type()), + x_grad); + } } else { - auto kAlpha = M_SQRT1_2; - auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; - auto cdf = scale(scale(erf(kAlpha * x), 1., 1.), 0.5); - auto pdf = kBeta * exp(scale(x * x, -0.5)); - set_output(out_grad * (cdf + x * pdf), x_grad); + // Scale only support fp32 attr in static graph mode, use elementwise_xx + // when precision is over fp32. 
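The gelu_grad rule being reworked here encodes the analytic GELU derivative. In the exact (non-approximate) branch, gelu(x) = x * Phi(x), so d gelu / dx = Phi(x) + x * phi(x), with Phi and phi the standard normal CDF and PDF; the constants kAlpha = 1/sqrt(2) and kBeta = 1/sqrt(2*pi) in the code are exactly those. A small standalone check of that closed form against a central finite difference, written in plain C++ and independent of Paddle:

    #include <cmath>
    #include <cstdio>

    double gelu(double x) {
      return x * 0.5 * (1.0 + std::erf(x / std::sqrt(2.0)));
    }

    double gelu_grad(double x) {
      const double kPi = 3.14159265358979323846;
      double cdf = 0.5 * (1.0 + std::erf(x / std::sqrt(2.0)));      // Phi(x)
      double pdf = std::exp(-0.5 * x * x) / std::sqrt(2.0 * kPi);   // phi(x)
      return cdf + x * pdf;
    }

    int main() {
      const double xs[] = {-2.0, -0.5, 0.0, 1.0, 3.0};
      const double h = 1e-6;
      for (double x : xs) {
        double numeric = (gelu(x + h) - gelu(x - h)) / (2.0 * h);
        std::printf("x=%5.1f analytic=%.6f numeric=%.6f\n",
                    x, gelu_grad(x), numeric);
      }
      return 0;
    }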
+ if (approximate) { + auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + auto kKappa = 0.044715; + auto x_sq = x * x; + auto x_cube = x_sq * x; + auto inner = kBeta * (x + kKappa * x_cube); + auto tanh_inner = tanh(inner); + + auto left = scale(x, 0.5); + auto right = scale(tanh_inner, 1., 1.); + + auto left_derivative = scale(right, 0.5); + + auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); + auto inner_derivative = kBeta * (scale(3 * kKappa * x_sq, 1., 1.)); + auto right_derivative = left * tanh_derivative * inner_derivative; + + set_output(out_grad * (left_derivative + right_derivative), x_grad); + } else { + auto kAlpha = M_SQRT1_2; + auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto cdf = scale(scale(erf(kAlpha * x), 1., 1.), 0.5); + auto pdf = kBeta * exp(scale(x * x, -0.5)); + set_output(out_grad * (cdf + x * pdf), x_grad); + } + } +} + +template +void reshape_grad(const Tensor& x, const Tensor& grad_out, Tensor* grad_x) { + if (grad_x) { + auto grad_x_tmp = reshape(grad_out, phi::vectorize(x.dims())); + set_output(grad_x_tmp, grad_x); + } +} + +template +void transpose_grad(const Tensor& grad_out, + const std::vector& perm, + Tensor* grad_x) { + if (grad_x) { + std::vector reverse_perm(perm); + // make origin ranks + for (int i = 0; i < static_cast(perm.size()); ++i) { + if (perm[i] >= 0) { + reverse_perm[perm[i]] = i; + } else { + reverse_perm[perm[i] + perm.size()] = i; + } + } + auto grad_x_tmp = transpose(grad_out, reverse_perm); + set_output(grad_x_tmp, grad_x); + } +} + +template +void tanh_grad(const Tensor& out, const Tensor& grad_out, Tensor* grad_x) { + if (!grad_x) return; + auto grad_x_tmp = grad_out * (1 - out * out); + set_output(grad_x_tmp, grad_x); +} + +template +void concat_grad(const std::vector& x, + const Tensor& out_grad, + const Scalar& axis, + std::vector x_grad) { + int axis_value = axis.to(); + int rank = x[0].dims().size(); + if (axis_value < 0) { + axis_value = axis_value + rank; + } + axis_value = axis_value > 0 ? 
axis_value : 0; + std::vector sections; + int x_num = x.size(); + for (int i = 0; i < x_num; ++i) { + sections.push_back(x[i].dims()[axis_value]); + } + std::vector x_grad_tmp = + split(out_grad, IntArray(sections), axis_value); + for (int i = 0; i < x_num; ++i) { + if (x_grad[i]) { + set_output(x_grad_tmp.at(i), x_grad.at(i)); + } + } +} + +template +void split_grad(const std::vector& out_grad, + const Scalar& axis, + Tensor* x_grad) { + if (x_grad) { + auto grad = concat(out_grad, axis); + set_output(grad, x_grad); + } +} + +template +void cast_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { + if (x_grad) { + auto res = cast(out_grad, x.dtype()); + set_output(res, x_grad); + } +} + +template +void add_grad(const Tensor& x, + const Tensor& y, + const Tensor& out_grad, + int axis, + Tensor* dx, + Tensor* dy) { + if (dy) { + if (x.dims() != y.dims()) { + // Maybe need reduce here + phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); + if (!reduce_dim.size()) { + set_output(out_grad, dy); + } else { + auto dy_reduce_res = + out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + set_output(dy_tmp, dy); + } + + } else { + set_output(out_grad, dy); + } + } + if (dx) { + if (y.dims() != x.dims()) { + // Maybe need reduce here + auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); + if (!reduce_dim.size()) { + set_output(out_grad, dx); + } else { + auto dx_reduce_res = + out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + set_output(dx_tmp, dx); + } + } else { + set_output(out_grad, dx); + } + } +} + +template +void multiply_grad(const Tensor& x, + const Tensor& y, + const Tensor& out_grad, + int axis, + Tensor* x_grad, + Tensor* y_grad) { + if (x_grad) { + auto x_grad_unreduce = out_grad * y; + if (x_grad_unreduce.dims() != x.dims()) { + auto axes = get_reduce_dims_from_out(x_grad_unreduce.dims(), x.dims()); + if (!axes.size()) { + set_output(x_grad_unreduce, x_grad); + } else { + auto x_grad_reduced = x_grad_unreduce.sum( + phi::vectorize(axes), x_grad_unreduce.dtype(), false); + if (x_grad_reduced.dims().size() != x.dims().size()) { + x_grad_reduced = reshape(x_grad_reduced, x.shape()); + } + set_output(x_grad_reduced, x_grad); + } + } else { + set_output(x_grad_unreduce, x_grad); + } + } + if (y_grad) { + auto y_grad_unreduce = out_grad * x; + if (y_grad_unreduce.dims() != y.dims()) { + auto axes = get_reduce_dims_from_out(y_grad_unreduce.dims(), y.dims()); + if (!axes.size()) { + set_output(y_grad_unreduce, y_grad); + } else { + auto y_grad_reduced = y_grad_unreduce.sum( + phi::vectorize(axes), y_grad_unreduce.dtype(), false); + if (y_grad_reduced.dims().size() != y.dims().size()) { + y_grad_reduced = reshape(y_grad_reduced, y.shape()); + } + set_output(y_grad_reduced, y_grad); + } + } else { + set_output(y_grad_unreduce, y_grad); + } + } +} + +template +void elementwise_pow_grad(const Tensor& x, + const Tensor& y, + const Tensor& out_grad, + Tensor* dx, + Tensor* dy) { + if (dy) { + // dy = lnx * x^y + auto lnx = log(x); + auto x_pow_y = elementwise_pow(x, y); + auto dy_res = lnx * x_pow_y * out_grad; + if (x.dims() != y.dims()) { + // Maybe need reduce here + phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); + if (!reduce_dim.size()) { + set_output(dy_res, dy); + } else { + auto dy_reduce_res = + dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, 
phi::vectorize(y.dims())); + set_output(dy_tmp, dy); + } + } else { + set_output(dy_res, dy); + } + } // indicate we will compute dy + if (dx) { + // dx = y * x^(y-1) + auto tmp_z = y - 1.0; + auto x_pow_z = elementwise_pow(x, tmp_z); + auto dx_res = y * x_pow_z * out_grad; + if (y.dims() != x.dims()) { + // Maybe need reduce here + auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); + if (!reduce_dim.size()) { + set_output(dx_res, dx); + } else { + auto dx_reduce_res = + dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + set_output(dx_tmp, dx); + } + + } else { + set_output(dx_res, dx); + } + } // indicate we will compute dx +} + +template +void layer_norm_grad(const Tensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const Tensor& mean, + const Tensor& variance, + const Tensor& out_grad, + float epsilon, + int begin_norm_axis, + Tensor* x_grad, + Tensor* scale_grad, + Tensor* bias_grad) { + auto x_dims = x.dims(); + auto shape_1 = 1; // front part + auto shape_2 = 1; // back part + for (int i = 0; i < begin_norm_axis; ++i) { + shape_1 *= x_dims[i]; + } + for (int i = begin_norm_axis; i < x.dims().size(); ++i) { + shape_2 *= x_dims[i]; + } + auto scale_ptr = scale.get_ptr(); + auto bias_ptr = bias.get_ptr(); + + auto x_cast = reshape(x, std::vector({shape_1, shape_2})); + auto out_grad_cast = + reshape(out_grad, std::vector({shape_1, shape_2})); + auto mean_ = reshape(mean, std::vector({shape_1, 1})); + auto variance_ = reshape(variance, std::vector({shape_1, 1})); + + Tensor scale_cast; + if (scale_ptr) { + scale_cast = reshape(*scale_ptr, std::vector({1, shape_2})); + } + + // cast dtype to float32 if dtype =float16 or bfloat16 + + auto x_sub_mean = x_cast - mean_; // M,N + auto tmp = (1.0 / (variance_ + epsilon)); // M,1 + // auto sqrt_var_1 = sqrt(tmp); // M,1 + auto sqrt_var_1 = elementwise_pow( + tmp, full(phi::vectorize(tmp.dims()), 0.5, tmp.dtype())); + auto x_sub_mean_mul_sqrt_var_1 = x_sub_mean * sqrt_var_1; + + if (x_grad) { + auto out_grad_scale = out_grad_cast; // M,N + if (scale_ptr) { + out_grad_scale = out_grad_cast * scale_cast; // M,N * 1,N = M,N + } + + auto dx_end = sqrt_var_1 * out_grad_scale; + auto d_mean = + dx_end.sum(std::vector({1}), x_cast.dtype(), true); // M,1 + + auto d_std_1 = + (tmp * x_sub_mean * out_grad_scale) + .sum(std::vector({1}), x_cast.dtype(), true); // M,1 + auto d_std = d_std_1 * x_sub_mean_mul_sqrt_var_1; // M,1 * M,N = M,N + + auto d_mean_d_std = (1.0 / shape_2) * (d_mean + d_std); + auto x_grad_tmp = dx_end - d_mean_d_std; + x_grad_tmp = reshape(x_grad_tmp, phi::vectorize(x.dims())); + + set_output(x_grad_tmp, x_grad); + } + + if (scale_grad) { + if (scale_ptr) { + auto scale_grad_tmp = + (x_sub_mean_mul_sqrt_var_1 * out_grad_cast) + .sum(std::vector({0}), x_cast.dtype(), true); + scale_grad_tmp = reshape(scale_grad_tmp, scale_ptr->shape()); + set_output(scale_grad_tmp, scale_grad); + } else { + scale_grad = nullptr; + } + } + + if (bias_grad) { + if (bias_ptr) { + auto bias_grad_tmp = + out_grad_cast.sum(std::vector({0}), x_cast.dtype(), true); + bias_grad_tmp = reshape(bias_grad_tmp, bias_ptr->shape()); + set_output(bias_grad_tmp, bias_grad); + } else { + bias_grad = nullptr; + } } } diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 22fd0f40a36b5..80ecad93997db 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -464,6 +464,10 @@ void BindOpResult(py::module *m) { [](OpResult &self, OpResult 
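The add_grad, multiply_grad and elementwise_pow_grad rules above share one idiom: when an input was broadcast up to the output shape, its gradient must be summed back over the broadcast axes and then reshaped to the input shape. A minimal standalone sketch of how those reduce axes follow from the two shapes; it only illustrates the broadcasting rule and is independent of Paddle's get_reduce_dims helper:

    #include <cstdio>
    #include <vector>

    // Axes of the output over which an input's gradient must be summed,
    // assuming numpy-style broadcasting of in_shape against out_shape.
    std::vector<int> ReduceAxesForBroadcast(const std::vector<int>& out_shape,
                                            const std::vector<int>& in_shape) {
      std::vector<int> axes;
      int offset = static_cast<int>(out_shape.size() - in_shape.size());
      for (int i = 0; i < static_cast<int>(out_shape.size()); ++i) {
        // Leading axes the input lacks, and axes where the input had extent 1
        // but the output does not, were broadcast.
        if (i < offset || (in_shape[i - offset] == 1 && out_shape[i] != 1)) {
          axes.push_back(i);
        }
      }
      return axes;
    }

    int main() {
      // x: [2, 3, 4], y: [3, 1]  ->  out: [2, 3, 4]
      // dy is the output gradient summed over axes 0 and 2, reshaped to [3, 1].
      for (int axis : ReduceAxesForBroadcast({2, 3, 4}, {3, 1})) {
        std::printf("reduce over axis %d\n", axis);
      }
      return 0;
    }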
&other) { return paddle::dialect::add(self, other); }) + .def("__add__", + [](OpResult &self, float &bias) { + return paddle::dialect::scale(self, 1.0, bias, false); + }) .def("__sub__", [](OpResult &self, OpResult &other) { return paddle::dialect::subtract(self, other); diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index a8260bb816865..9b5db92c54700 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -224,7 +224,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param: [x, y] - composite : elementwise_pow_grad(x, y, out_grad, axis, x_grad, y_grad) + composite : elementwise_pow_grad(x, y, out_grad, x_grad, y_grad) kernel : func : elementwise_pow_grad diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index e33c3a38bff74..f8a2aae71b0cd 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -94,7 +94,6 @@ def prepare_grad_outputs(grad_outputs, outputs, state): dtype=output.dtype, ) fillop = output_grad.get_defining_op() - update_bwdop_structure( backward_ops, state.op_to_opgrad[output.get_defining_op()], @@ -138,14 +137,14 @@ def prepare_grad_outputs(grad_outputs, outputs, state): 0.0, opresult.dtype, ) - fillop = grad.get_defining_op() + fillop = grad_value.get_defining_op() update_bwdop_structure( backward_ops, state.op_to_opgrad[opresult.get_defining_op()], fillop, ) - state.value_to_valuegrad[opresult] = [grad_value] + state.value_to_valuegrad[opresult] = [[grad_value]] visited_output.add(opresult) diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py index e9d04ede061ce..26a4ae73debd0 100644 --- a/python/paddle/decomposition/rules.py +++ b/python/paddle/decomposition/rules.py @@ -63,3 +63,83 @@ def gelu_composite(x, approximate): cdf = half * (one + _ir_ops.erf(x * full(x.shape, M_SQRT1_2, x.dtype))) out = x * cdf return out + + +@register_decomp('pd_op.rsqrt') +def rsqrt_composite(x): + """define composite rule of op rsqrt.""" + # rsqrt(x) = x^(-0.5) + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + y = full(x.shape if len(x.shape) == 0 else [1], -0.5, x.dtype) + res = pow(x, y) + return res if not is_amp else cast(res, dtype) + + +@register_decomp('pd_op.pow') +def pow_composite(x, y): + """ + define composite rule of op pow + res = x^y + """ + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + + if isinstance(y, (int, float)): + y = full(x.shape if len(x.shape) == 0 else [1], y, x.dtype) + res = pow(x, y) + if is_amp: + res = cast(res, dtype) + return res + + +@register_decomp('pd_op.layer_norm') +def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis): + """ + define composite rule of op layer_norm + out = (x - mean(x)) / sqrt(var + epsilon)) + var = mean((x-mean(x))^2) + """ + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + scale = cast(scale, "float32") if scale else scale + bias = cast(bias, "float32") if bias else bias + + axis = tuple(range(begin_norm_axis, len(x.shape))) + mean_ = mean(x, axis=axis, keepdim=True) + difference = x - mean_ + var_tmp1 = 
difference * difference + variance = mean(var_tmp1, axis=axis, keepdim=True) + var_tmp3 = variance + epsilon + rsqrt_var = rsqrt(var_tmp3) + out = difference * rsqrt_var + + if scale is not None: + if x.shape[begin_norm_axis:] != scale.shape: + scale = reshape(scale, x.shape[begin_norm_axis:]) + out = out * scale + if bias is not None: + if x.shape[begin_norm_axis:] != bias.shape: + bias = reshape(bias, x.shape[begin_norm_axis:]) + out = out + bias + + mean_ = reshape(mean_, [-1]) + variance = reshape(variance, [-1]) + if is_amp: + out = cast(out, dtype) + return out, mean_, variance diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c3e814cc906d4..f764fbb45996d 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -833,8 +833,7 @@ def full_like(x, fill_value, dtype=None, name=None): if in_dynamic_mode(): return _C_ops.full_like(x, fill_value, dtype, x.place) elif in_pir_mode(): - place = _current_expected_place() - return _C_ops.full_like(x, fill_value, dtype, place) + return _C_ops.full_like(x, fill_value, dtype, core.Place()) else: helper = LayerHelper("full_like", **locals()) check_variable_and_dtype( @@ -881,7 +880,11 @@ def full_like(x, fill_value, dtype=None, name=None): def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): if in_dynamic_or_pir_mode(): - place = _current_expected_place() + place = ( + _current_expected_place() + if not in_pir_mode() + else paddle.base.core.Place() + ) if force_cpu: place = core.CPUPlace() if isinstance(shape, (list, tuple)): diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py index e472c70813c73..f28957cdc89be 100644 --- a/test/legacy_test/prim_op_test.py +++ b/test/legacy_test/prim_op_test.py @@ -22,7 +22,8 @@ import paddle from paddle.autograd.ir_backward import grad as ir_grad -from paddle.base import core +from paddle.base import Scope, core +from paddle.base.executor import scope_guard from paddle.base.framework import ( OpProtoHolder, _dygraph_tracer, @@ -409,7 +410,8 @@ def check(self): self.check_jit_comp_with_cinn() else: if self.enable_check_static_comp: - self.check_static_comp() + with scope_guard(Scope()): + self.check_static_comp() def get_kernel_sig(self): with dygraph_guard(): @@ -870,7 +872,8 @@ def check(self): self.check_jit_comp_with_cinn() else: if self.enable_check_static_comp: - self.check_static_comp() + with scope_guard(Scope()): + self.check_static_comp() def get_output_dict(self, np_outputs, api_outputs, outputs_sig): assert len(api_outputs) <= len(outputs_sig), ( diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 8b16ee5750eac..8d1ee1ac5091a 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -693,9 +693,21 @@ def test_check_grad(self): return # TODO(ScottWong98): set `check_prim=False` when `fill_any_like` supports `complex` dtype if self.dtype == np.complex64 or self.dtype == np.complex128: - self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=False) + self.check_grad( + ['X'], + 'Out', + check_prim=False, + check_prim_pir=False, + check_new_ir=False, + ) else: - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def init_dtype(self): # TODO If dtype is float64, the output (Out) has diff at CPUPlace @@ -1615,7 +1627,9 @@ def if_enable_cinn(self): pass def test_check_output(self): 
- self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_new_ir=True, check_prim_pir=True + ) def test_check_grad(self): if self.dtype == np.float16: @@ -1626,6 +1640,7 @@ def test_check_grad(self): max_relative_error=0.0005, check_prim=True, check_new_ir=True, + check_prim_pir=True, ) @@ -2480,12 +2495,22 @@ def setUp(self): self.cinn_atol = 1e-8 def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, + check_new_ir=True, + check_prim_pir=False, + ) def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) class TestGelu(TestActivation): @@ -2518,12 +2543,20 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_new_ir=True, check_prim_pir=False + ) def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) class TestGelu_ZeroDim(TestGelu): @@ -3575,12 +3608,20 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_prim_pir=True, check_new_ir=True + ) def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestPow_ZeroDim(TestPow): @@ -4397,6 +4438,7 @@ def create_test_act_fp16_class( grad_check=True, check_dygraph=True, check_prim=False, + check_prim_pir=False, enable_cinn=False, grad_atol=1e-2, **kwargs @@ -4425,6 +4467,7 @@ def test_check_output(self): atol=atol, check_dygraph=check_dygraph, check_prim=check_prim, + check_prim_pir=check_prim_pir, ) def test_check_grad(self): @@ -4437,6 +4480,7 @@ def test_check_grad(self): 'Out', check_dygraph=check_dygraph, check_prim=check_prim, + check_prim_pir=check_prim_pir, max_relative_error=grad_atol, ) @@ -4451,7 +4495,9 @@ def test_check_grad(self): create_test_act_fp16_class(TestSigmoid, check_prim=True, enable_cinn=True) create_test_act_fp16_class(TestSilu, check_prim=True, enable_cinn=True) create_test_act_fp16_class(TestLogSigmoid) -create_test_act_fp16_class(TestTanh, check_prim=True, enable_cinn=True) +create_test_act_fp16_class( + TestTanh, check_prim=True, check_prim_pir=True, enable_cinn=True +) create_test_act_fp16_class(TestTanhshrink) create_test_act_fp16_class(TestHardShrink) create_test_act_fp16_class(TestSoftshrink) @@ -4478,6 +4524,7 @@ def test_check_grad(self): create_test_act_fp16_class( TestGelu, check_prim=True, + check_prim_pir=True, check_new_ir=True, enable_cinn=True, rev_comp_rtol=1e-3, @@ -4499,7 +4546,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestLog10) create_test_act_fp16_class(TestLog1p) create_test_act_fp16_class(TestSquare) -create_test_act_fp16_class(TestPow, check_prim=True) +create_test_act_fp16_class(TestPow, check_prim=True, check_prim_pir=True) create_test_act_fp16_class(TestPow_API) create_test_act_fp16_class(TestSTanh) create_test_act_fp16_class(TestSoftplus) @@ -4521,7 +4568,11 @@ def test_check_grad(self): ) 
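# The edit repeated throughout these activation tests follows a single idiom:
# wherever a test already passes check_prim=True, a check_prim_pir=True switch is
# added next to it (and check_new_ir where present) so the prim decomposition is
# also exercised under PIR. A minimal sketch of the idiom is below; the class name
# TestExampleActivation is illustrative only and not part of this patch.
class TestExampleActivation(TestActivation):  # illustrative class, not in the original file
    def test_check_output(self):
        # Check the forward result under the legacy prim, PIR prim and new IR paths.
        self.check_output(check_prim=True, check_prim_pir=True, check_new_ir=True)

    def test_check_grad(self):
        if self.dtype == np.float16:
            return
        # Check the gradient with the same three switches enabled.
        self.check_grad(
            ['X'], 'Out', check_prim=True, check_prim_pir=True, check_new_ir=True
        )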
create_test_act_fp16_class(TestLeakyRelu_ZeroDim, check_prim=True) create_test_act_fp16_class( - TestRsqrt, check_prim=True, enable_cinn=True, check_new_ir=True + TestRsqrt, + check_prim=True, + enable_cinn=True, + check_new_ir=True, + check_prim_pir=True, ) @@ -4645,7 +4696,9 @@ def test_check_grad(self): create_test_act_bf16_class(TestLeakyReluAlpha2, check_prim=True) create_test_act_bf16_class(TestLeakyReluAlpha3, check_prim=True) create_test_act_bf16_class(TestLeakyRelu_ZeroDim, check_prim=True) -create_test_act_bf16_class(TestRsqrt, check_prim=True, check_new_ir=True) +create_test_act_bf16_class( + TestRsqrt, check_prim=True, check_new_ir=True, check_prim_pir=True +) if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py index 47bc23d76f601..448629431d0b1 100644 --- a/test/legacy_test/test_cast_op.py +++ b/test/legacy_test/test_cast_op.py @@ -52,10 +52,16 @@ def init_shapes(self): self.input_shape = [10, 10] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestCastOpFp32ToFp64_ZeroDim(TestCastOpFp32ToFp64): diff --git a/test/legacy_test/test_concat_op.py b/test/legacy_test/test_concat_op.py index dc9702beeb014..153e1cc06d308 100644 --- a/test/legacy_test/test_concat_op.py +++ b/test/legacy_test/test_concat_op.py @@ -61,18 +61,51 @@ def test_check_grad(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['x0'], 'Out', check_prim=True, check_new_ir=True + place, + ['x0'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x1'], 'Out', check_prim=True, check_new_ir=True + place, + ['x1'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x2'], 'Out', check_prim=True, check_new_ir=True + place, + ['x2'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, ) else: - self.check_grad(['x0'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x1'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x2'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def init_test_data(self): if self.dtype == np.uint16: @@ -213,9 +246,27 @@ def test_check_output(self): self.check_output(check_new_ir=True) def test_check_grad(self): - self.check_grad(['x0'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x1'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x2'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def init_test_data(self): if self.dtype == np.uint16: @@ -301,8 +352,10 @@ def create_test_fp16(parent): 
class TestConcatFp16(parent): def setUp(self): self.op_type = "concat" + self.prim_op_type = "prim" self.python_api = paddle.concat self.public_python_api = paddle.concat + self.enable_cinn = False self.dtype = self.get_dtype() self.init_test_data() self.inputs = { @@ -332,18 +385,51 @@ def test_check_grad(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['x0'], 'Out', check_new_ir=True + place, + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x1'], 'Out', check_new_ir=True + place, + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x2'], 'Out', check_new_ir=True + place, + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) else: - self.check_grad(['x0'], 'Out', check_new_ir=True) - self.check_grad(['x1'], 'Out', check_new_ir=True) - self.check_grad(['x2'], 'Out', check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) def get_dtype(self): return np.float16 @@ -371,6 +457,7 @@ def create_test_bf16(parent): class TestConcatBf16(parent): def setUp(self): self.op_type = "concat" + self.prim_op_type = "prim" self.python_api = paddle.concat self.public_python_api = paddle.concat self.enable_cinn = False @@ -403,18 +490,51 @@ def test_check_grad(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['x0'], 'Out', check_new_ir=True + place, + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x1'], 'Out', check_new_ir=True + place, + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x2'], 'Out', check_new_ir=True + place, + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) else: - self.check_grad(['x0'], 'Out', check_new_ir=True) - self.check_grad(['x1'], 'Out', check_new_ir=True) - self.check_grad(['x2'], 'Out', check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) def get_dtype(self): return np.uint16 diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index 8bacfc9a45cfd..546e9d2555421 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -57,6 +57,7 @@ def test_check_output(self): self.check_output( check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -69,6 +70,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -82,6 +84,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + 
check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -95,6 +98,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -152,6 +156,7 @@ def test_check_output(self): atol=1e-3, check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -167,6 +172,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -178,6 +184,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -221,6 +228,7 @@ def test_check_grad_normal(self): ['X', 'Y'], 'Out', check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -232,6 +240,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -243,6 +252,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index 8013eb0baaf15..fde11e09fbe14 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -49,6 +49,7 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output( check_dygraph=(not self.use_mkldnn), + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -59,6 +60,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -70,6 +72,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -81,6 +84,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -102,6 +106,7 @@ def if_enable_cinn(self): class TestComplexElementwiseMulOpWithCheckGrad(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.prim_op_type = "prim" self.python_api = paddle.multiply self.public_python_api = paddle.multiply self.dtype = np.complex128 @@ -188,7 +193,13 @@ def test_check_output(self): self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X', 'Y'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) def test_check_grad_ingore_x(self): self.check_grad( @@ -196,6 +207,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -205,6 +217,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -420,6 +433,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -431,6 +445,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=(not self.use_mkldnn), 
check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -442,6 +457,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -496,6 +512,7 @@ def setUp(self): class TestComplexElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.prim_op_type = "prim" self.python_api = paddle.multiply self.init_base_dtype() self.init_input_output() diff --git a/test/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py index e406845960abc..c718ce16292b9 100644 --- a/test/legacy_test/test_elementwise_pow_op.py +++ b/test/legacy_test/test_elementwise_pow_op.py @@ -44,7 +44,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_check_grad_normal(self): if hasattr(self, 'attrs'): @@ -53,7 +53,11 @@ def test_check_grad_normal(self): ) else: self.check_grad( - ['X', 'Y'], 'Out', check_prim=True, check_new_ir=True + ['X', 'Y'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -190,6 +194,8 @@ class TestElementwisePowOpInt(OpTest): def setUp(self): self.op_type = "elementwise_pow" self.python_api = paddle.pow + self.public_python_api = paddle.pow + self.prim_op_type = "prim" self.inputs = {'X': np.asarray([1, 3, 6]), 'Y': np.asarray([1, 1, 1])} self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} @@ -198,7 +204,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) class TestElementwisePowGradOpInt(unittest.TestCase): @@ -254,7 +260,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_check_grad(self): self.check_grad( @@ -264,6 +270,7 @@ def test_check_grad(self): self.inputs['X'], self.inputs['Y'], 1 / self.inputs['X'].size ), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -290,7 +297,7 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(out)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') @@ -301,7 +308,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_prim_pir=True, ) diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py index b023ff6488e48..3fb01bb3d0b62 100644 --- a/test/legacy_test/test_layer_norm_op.py +++ b/test/legacy_test/test_layer_norm_op.py @@ -141,8 +141,9 @@ def test_check_output(self): no_check_set=["Mean", "Variance"], atol=self.ori_atol, rtol=self.ori_rtol, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, ) def test_check_grad(self): @@ -150,8 +151,9 @@ def test_check_grad(self): self.check_grad_input_list, ['Y'], max_relative_error=self.max_relative_error, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, 
) def initConfig(self): @@ -173,6 +175,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = True + self.check_prim = True + self.check_prim_pir = True + self.check_new_ir = True def initTestCase(self): np.random.seed(123) @@ -240,8 +245,9 @@ def test_check_output(self): no_check_set=["Mean", "Variance"], atol=self.ori_atol, rtol=self.ori_rtol, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, ) def test_check_grad(self): @@ -250,8 +256,9 @@ def test_check_grad(self): self.check_grad_input_list, ['Y'], max_relative_error=self.max_relative_error, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, ) def initConfig(self): @@ -266,6 +273,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = True + self.check_prim = True + self.check_prim_pir = True + self.check_new_ir = True def initTestCase(self): np.random.seed(123) @@ -335,6 +345,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -356,6 +369,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -382,6 +398,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -403,6 +422,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -429,6 +451,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = True + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -450,6 +475,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = True + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOpByOpTestFP32(TestLayerNormOpByOpTest): @@ -467,6 +495,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = True + self.check_prim = True + self.check_prim_pir = True + self.check_new_ir = True class TestLayerNormOpByOpTestFP32_case2(TestLayerNormOpByOpTest): @@ -484,6 +515,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOpByOpTestFP32_case3(TestLayerNormOpByOpTest): @@ -501,6 +535,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOpByOpTestFP32_case4(TestLayerNormOpByOpTest): @@ -518,6 +555,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = True + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOp(unittest.TestCase): diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py index c9ab6baf41ef6..0a9132ca55b49 100755 --- 
a/test/legacy_test/test_reshape_op.py +++ b/test/legacy_test/test_reshape_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle from paddle import base @@ -43,11 +43,17 @@ def init_data(self): self.new_shape = (12, 10) self.infered_shape = (12, 10) - def test_check_output(self): + def _test_check_output(self): self.check_output(no_check_set=['XShape'], check_new_ir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True, check_new_ir=True) + self.check_grad( + ["X"], + "Out", + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) class TestReshapeOp_ZeroDim1(TestReshapeOp): @@ -120,7 +126,7 @@ def test_check_output(self): self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True) + self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True) class TestReshapeFP16Op(OpTest): @@ -148,7 +154,7 @@ def test_check_output(self): self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True) + self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True) class TestReshapeOpDimInfer1(TestReshapeOp): @@ -340,6 +346,9 @@ def init_dtype(self): self.dtype = np.uint8 +@skip_check_grad_ci( + "we don't need to check grad for the bool type of reshape op" +) class TestReshapeOpBool(TestReshapeOp): def setUp(self): self.init_data() diff --git a/test/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py index 964e127aafb81..92dfe72f8443e 100644 --- a/test/legacy_test/test_split_op.py +++ b/test/legacy_test/test_split_op.py @@ -61,7 +61,11 @@ def test_check_output(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True + ['X'], + ['out0', 'out1', 'out2'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -117,7 +121,11 @@ def test_check_output(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True + ['X'], + ['out0', 'out1', 'out2'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -243,7 +251,11 @@ def test_check_output(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True + ['X'], + ['out0', 'out1', 'out2'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -291,7 +303,12 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'out2', check_prim=True, check_new_ir=True + place, + ['X'], + 'out2', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) cls_name = "{}_{}".format(parent.__name__, "BF16Op") diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 63a68442936ab..c154625fb51f4 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -58,11 +58,20 @@ def init_kernel_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_prim=True, check_cinn=True, check_new_ir=True) + self.check_output( + check_prim=True, + check_cinn=True, + check_new_ir=True, + ) def test_check_grad(self): self.check_grad( - ['x0'], 'Out', check_prim=True, check_cinn=True, check_new_ir=True + ['x0'], + 'Out', + check_prim=True, + check_cinn=True, + check_prim_pir=True, + check_new_ir=True, ) @@ 
-304,7 +313,13 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_grad(['x0'], 'Out', check_cinn=True, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_cinn=True, + check_prim_pir=True, + check_new_ir=True, + ) def create_test_sum_fp16_class(parent): @@ -330,7 +345,9 @@ def test_w_is_selected_rows(self): class TestSumBF16Op(OpTest): def setUp(self): self.op_type = "sum" + self.prim_op_type = "prim" self.python_api = paddle.add_n + self.public_python_api = paddle.add_n self.init_kernel_type() x0 = np.random.random((3, 40)).astype(np.float32) x1 = np.random.random((3, 40)).astype(np.float32) @@ -354,7 +371,13 @@ def test_check_output(self): def test_check_grad(self): # new dynamic graph mode does not support unit16 type - self.check_grad(['x0'], 'Out', check_dygraph=False, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_dygraph=False, + check_prim_pir=True, + check_new_ir=True, + ) class API_Test_Add_n(unittest.TestCase): diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py index c8d91f59f8c49..52f85ef1e0a70 100644 --- a/test/legacy_test/test_transpose_op.py +++ b/test/legacy_test/test_transpose_op.py @@ -52,7 +52,13 @@ def test_check_output(self): self.check_output(no_check_set=['XShape'], check_new_ir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def if_enable_cinn(self): pass @@ -209,7 +215,13 @@ def test_check_output(self): base.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestAutoTuneTransposeFP16Op(OpTest): @@ -246,7 +258,13 @@ def test_check_output(self): base.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestAutoTuneTransposeBF16Op(OpTest): @@ -290,7 +308,13 @@ def test_check_output(self): base.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestTransposeFP16Op(OpTest): @@ -325,7 +349,13 @@ def test_check_output(self): self.check_output(no_check_set=['XShape'], check_new_ir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) def initTestCase(self): self.shape = (3, 40) From 6e5c978878e401b9d383de91078f82520fa40cf1 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 18:35:15 +0800 Subject: [PATCH 36/39] =?UTF-8?q?=E3=80=90pir=E3=80=91Modify=20comment=20o?= =?UTF-8?q?f=20pr57478=20and=20pr56873=20(#57520)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tmp * reply comment * code style --- .../fluid/pir/dialect/op_generator/api_gen.py | 2 +- .../pir/dialect/op_generator/python_c_gen.py | 2 +- .../pir/dialect/operator/ir/manual_api.cc | 23 ++++++++++--------- 
.../pir/dialect/operator/ir/manual_api.h | 21 +++++++++-------- .../pir/dialect/operator/ir/manual_op_vjp.cc | 4 +++- .../primitive/backend/manual/manual_backend.h | 1 - 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py index d7e74f72b652f..851f318e9bc47 100644 --- a/paddle/fluid/pir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -150,7 +150,7 @@ def _gen_api_inputs(self, op_info): assert len(name_list) == len(type_list) ret = [] for name, type in zip(name_list, type_list): - ret.append(f'{self._type_map[type]} {name}') + ret.append(f'const {self._type_map[type]}& {name}') return ', '.join(ret) def _gen_api_attrs( diff --git a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py index 440f656b99964..adb5270e975e6 100644 --- a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py @@ -174,7 +174,7 @@ """ BUILTIN_STACK_OP_TEMPLATE = """ - {name} = paddle::dialect::stack({name}_tmp, 0); + {name} = paddle::dialect::stack({name}_tmp, /*axis*/0); """ TYPE_TO_FUNC_MAP = { "bool": "CastPyArg2Boolean", diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc index 24e7a94b66650..eb5acbf2388ea 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -28,8 +28,8 @@ pir::OpResult builtin_combine(const std::vector& x) { return combine_op.out(); } -std::vector add_n_grad(std::vector inputs, - pir::Value out_grad) { +std::vector add_n_grad(const std::vector& inputs, + const pir::Value& out_grad) { std::vector inputs_grad; for (size_t i = 0; i < inputs.size(); i++) { paddle::dialect::ScaleOp scale_op = @@ -40,8 +40,8 @@ std::vector add_n_grad(std::vector inputs, return inputs_grad; } -pir::OpResult zeros_like(pir::Value x, - phi::DataType dtype, +pir::OpResult zeros_like(const pir::Value& x, + const phi::DataType dtype, const Place& place) { return paddle::dialect::full_like(x, 0, dtype, place); } @@ -54,7 +54,7 @@ pir::OpResult get_parameter(const std::string& name) { return get_parameter_op.result(0); } -void set_parameter(pir::Value parameter, const std::string& name) { +void set_parameter(const pir::Value& parameter, const std::string& name) { std::unique_ptr param( new pir::Parameter(nullptr, 0, parameter.type())); APIBuilder::Instance().SetParameter(name, std::move(param)); @@ -62,9 +62,9 @@ void set_parameter(pir::Value parameter, const std::string& name) { name); } -pir::OpResult embedding_grad(pir::Value x, - pir::Value weight, - pir::Value out_grad, +pir::OpResult embedding_grad(const pir::Value& x, + const pir::Value& weight, + const pir::Value& out_grad, int64_t padding_idx, bool sparse) { if (weight.type().isa()) { @@ -81,7 +81,8 @@ pir::OpResult embedding_grad(pir::Value x, } } -pir::OpResult split_with_num_grad(std::vector out_grad, int axis) { +pir::OpResult split_with_num_grad(const std::vector& out_grad, + int axis) { auto out_grad_combine_op = APIBuilder::Instance().GetBuilder()->Build(out_grad); paddle::dialect::SplitGradOp split_grad_op = @@ -90,8 +91,8 @@ pir::OpResult split_with_num_grad(std::vector out_grad, int axis) { return split_grad_op.result(0); } -pir::OpResult split_with_num_grad(std::vector out_grad, - pir::Value axis) { +pir::OpResult split_with_num_grad(const std::vector& 
out_grad, + const pir::Value& axis) { auto out_grad_combine_op = APIBuilder::Instance().GetBuilder()->Build(out_grad); paddle::dialect::SplitGradOp split_grad_op = diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index c919448f1ddb0..fe579295ad5a0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -25,26 +25,27 @@ namespace dialect { pir::OpResult builtin_combine(const std::vector& x); -std::vector add_n_grad(std::vector inputs, - pir::Value out_grad); +std::vector add_n_grad(const std::vector& inputs, + const pir::Value& out_grad); -pir::OpResult zeros_like(pir::Value x, +pir::OpResult zeros_like(const pir::Value& x, phi::DataType dtype = phi::DataType::UNDEFINED, const Place& place = {}); pir::OpResult get_parameter(const std::string& name); -void set_parameter(pir::Value parameter, const std::string& name); +void set_parameter(const pir::Value& parameter, const std::string& name); -pir::OpResult embedding_grad(pir::Value x, - pir::Value weight, - pir::Value out_grad, +pir::OpResult embedding_grad(const pir::Value& x, + const pir::Value& weight, + const pir::Value& out_grad, int64_t padding_idx = -1, bool sparse = false); -pir::OpResult split_with_num_grad(std::vector out_grad, int axis); +pir::OpResult split_with_num_grad(const std::vector& out_grad, + int axis); -pir::OpResult split_with_num_grad(std::vector out_grad, - pir::Value axis); +pir::OpResult split_with_num_grad(const std::vector& out_grad, + const pir::Value& axis); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc index b6d131e5411fb..80c13ac89def1 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc @@ -34,7 +34,9 @@ std::vector> AddNOp::Vjp( AddNOp op_obj = op->dyn_cast(); VLOG(6) << "Prepare inputs of add_n_grad"; - + PADDLE_ENFORCE( + op_obj.inputs() != nullptr, + paddle::platform::errors::Fatal("addn op's inputs can't be null")); pir::CombineOp combine_op_obj = op_obj.inputs() .dyn_cast() .owner() diff --git a/paddle/fluid/primitive/backend/manual/manual_backend.h b/paddle/fluid/primitive/backend/manual/manual_backend.h index 16c1facbd5354..3c9340164ac01 100644 --- a/paddle/fluid/primitive/backend/manual/manual_backend.h +++ b/paddle/fluid/primitive/backend/manual/manual_backend.h @@ -18,7 +18,6 @@ #include #include "paddle/phi/api/include/tensor.h" -#include "paddle/utils/optional.h" namespace paddle { namespace primitive { From 69ad1735436555288b1adb88f731cd67ef8240d9 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 21 Sep 2023 18:39:12 +0800 Subject: [PATCH 37/39] [SOT][3.11] fix eval frame for python 3.11 (#57490) * [SOT] fix eval frame for python 3.11 * fix missing `()` * fix no Paddle_PyInterpreterFrameProxyType in < 3.11 * `Paddle_PyInterpreterFrameProxy` -> `PyInterpreterFrameProxy` * compat for eval_custom_code * clean callback result is None logic * refine internal API name * refine comments --- paddle/fluid/pybind/jit.cc | 364 ++++++++++++++++++++++++++++--------- 1 file changed, 275 insertions(+), 89 deletions(-) diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc index 69b32fca9cd75..688fe7c670370 100644 --- a/paddle/fluid/pybind/jit.cc +++ b/paddle/fluid/pybind/jit.cc @@ -21,7 +21,14 @@ limitations under the License. 
*/ #include #endif #if PY_VERSION_HEX >= 0x030b0000 +#include #include +#define Py_BUILD_CORE // internal/pycore_opcode.h need this macro +#define NEED_OPCODE_TABLES // To get _PyOpcode_Caches and _PyOpcode_Deopt +#include +#undef NEED_OPCODE_TABLES +#undef Py_BUILD_CORE +#include #endif #include @@ -49,64 +56,181 @@ namespace pybind { // that we don't need any modification in eval_frame functions. typedef _PyInterpreterFrame FrameObject; #define CALL_STAT_INC(name) ((void)0) -PyFrameObject *Paddle_PyFrame_New_NoTrack(PyCodeObject *code) { - CALL_STAT_INC(frame_objects_created); - int slots = code->co_nlocalsplus + code->co_stacksize; - PyFrameObject *f = PyObject_GC_NewVar(PyFrameObject, &PyFrame_Type, slots); - if (f == NULL) { - return NULL; + +// clang-format off +// Define a proxy PyObject to access _PyInterpreterFrame's properties. +// It will be passed as an argument to the eval frame's callback. +typedef struct PyInterpreterFrameProxy { + PyObject_HEAD + _PyInterpreterFrame *frame; +} PyInterpreterFrameProxy; +// clang-format on + +#define DECLARE_PROXY_PROPERTY(name) \ + static PyObject *PyInterpreterFrameProxy_property_##name( \ + PyInterpreterFrameProxy *self, void *closure) { \ + Py_XINCREF(self->frame->name); \ + return reinterpret_cast(self->frame->name); \ + } + +// clang-format off +#define REGISTER_PROXY_PROPERTY(name) \ + { \ + #name, (getter)PyInterpreterFrameProxy_property_##name, nullptr, nullptr, \ + nullptr \ + } +// clang-format on + +DECLARE_PROXY_PROPERTY(f_code) +DECLARE_PROXY_PROPERTY(f_locals) +DECLARE_PROXY_PROPERTY(f_globals) +DECLARE_PROXY_PROPERTY(f_builtins) + +static PyGetSetDef PyInterpreterFrameProxy_properties[] = { + REGISTER_PROXY_PROPERTY(f_code), + REGISTER_PROXY_PROPERTY(f_locals), + REGISTER_PROXY_PROPERTY(f_globals), + REGISTER_PROXY_PROPERTY(f_builtins), + {nullptr} /* Sentinel */ +}; + +// clang-format off +static PyTypeObject PyInterpreterFrameProxyType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "paddle.framework.core.PyInterpreterFrameProxy", + .tp_doc = PyDoc_STR("A proxy object for _PyInterpreterFrame, " + "it's only define all properties we need."), + .tp_basicsize = sizeof(PyInterpreterFrameProxy), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_getset = PyInterpreterFrameProxy_properties, +}; +// clang-format on + +PyInterpreterFrameProxy *PyInterpreterFrameProxy_New( + _PyInterpreterFrame *frame) { + PyTypeObject *type = &PyInterpreterFrameProxyType; + PyInterpreterFrameProxy *self = + reinterpret_cast(type->tp_alloc(type, 0)); + if (!self) { + VLOG(7) << "Failed to allocate PyInterpreterFrameProxy"; + return nullptr; } - f->f_back = NULL; - f->f_trace = NULL; - f->f_trace_lines = 1; - f->f_trace_opcodes = 0; - f->f_fast_as_locals = 0; - f->f_lineno = 0; - return f; + self->frame = frame; + return self; } -static inline bool Paddle_PyFrame_IsIncomplete(_PyInterpreterFrame *frame) { - return frame->owner != FRAME_OWNED_BY_GENERATOR && - frame->prev_instr < - _PyCode_CODE(frame->f_code) + frame->f_code->_co_firsttraceable; +// We copy some cpython internal API from cpython project. +// To avoid name conflict, we use "Internal_" prefix to mark them. 
+static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame, + int opcode, + int oparg) { + // This only works when opcode is a non-quickened form: + assert(_PyOpcode_Deopt[opcode] == opcode); + int check_oparg = 0; + for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code); + instruction < frame->prev_instr; + instruction++) { + int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)]; + check_oparg |= _Py_OPARG(*instruction); + if (check_opcode == opcode && check_oparg == oparg) { + return 1; + } + if (check_opcode == EXTENDED_ARG) { + check_oparg <<= 8; + } else { + check_oparg = 0; + } + instruction += _PyOpcode_Caches[check_opcode]; + } + return 0; } -PyFrameObject *Paddle_PyFrame_MakeAndSetFrameObject( - _PyInterpreterFrame *frame) { - assert(frame->frame_obj == NULL); - PyObject *error_type, *error_value, *error_traceback; - PyErr_Fetch(&error_type, &error_value, &error_traceback); - - PyFrameObject *f = Paddle_PyFrame_New_NoTrack(frame->f_code); - if (f == NULL) { - Py_XDECREF(error_type); - Py_XDECREF(error_value); - Py_XDECREF(error_traceback); - return NULL; // NOLINT +int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame) { + /* Merge fast locals into f->f_locals */ + PyObject *locals; + PyObject **fast; + PyCodeObject *co; + locals = frame->f_locals; + if (locals == NULL) { + locals = frame->f_locals = PyDict_New(); + if (locals == NULL) return -1; } - PyErr_Restore(error_type, error_value, error_traceback); - if (frame->frame_obj) { - f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data; // NOLINT - f->f_frame->owner = FRAME_CLEARED; - f->f_frame->frame_obj = f; - Py_DECREF(f); - return frame->frame_obj; + co = frame->f_code; + fast = _PyFrame_GetLocalsArray(frame); + // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt + // here: + int lasti = _PyInterpreterFrame_LASTI(frame); + if (lasti < 0 && _Py_OPCODE(_PyCode_CODE(co)[0]) == COPY_FREE_VARS) { + /* Free vars have not been initialized -- Do that */ + PyCodeObject *co = frame->f_code; + PyObject *closure = frame->f_func->func_closure; + int offset = co->co_nlocals + co->co_nplaincellvars; + for (int i = 0; i < co->co_nfreevars; ++i) { + PyObject *o = PyTuple_GET_ITEM(closure, i); + Py_INCREF(o); + frame->localsplus[offset + i] = o; + } + // COPY_FREE_VARS doesn't have inline CACHEs, either: + frame->prev_instr = _PyCode_CODE(frame->f_code); } - assert(frame->owner != FRAME_OWNED_BY_FRAME_OBJECT); - assert(frame->owner != FRAME_CLEARED); - f->f_frame = frame; - frame->frame_obj = f; - return f; -} + for (int i = 0; i < co->co_nlocalsplus; i++) { + _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); + + /* If the namespace is unoptimized, then one of the + following cases applies: + 1. It does not contain free variables, because it + uses import * or is a top-level namespace. + 2. It is a class namespace. + We don't want to accidentally copy free variables + into the locals dict used by the class. + */ + if (kind & CO_FAST_FREE && !(co->co_flags & CO_OPTIMIZED)) { + continue; + } -static inline PyFrameObject *Paddle_PyFrame_GetFrameObject( - _PyInterpreterFrame *frame) { - assert(!Paddle_PyFrame_IsIncomplete(frame)); - PyFrameObject *res = frame->frame_obj; - if (res != NULL) { - return res; + PyObject *name = PyTuple_GET_ITEM(co->co_localsplusnames, i); + PyObject *value = fast[i]; + if (frame->stacktop) { + if (kind & CO_FAST_FREE) { + // The cell was set by COPY_FREE_VARS. 
+ assert(value != NULL && PyCell_Check(value)); + value = PyCell_GET(value); + } else if (kind & CO_FAST_CELL) { + // Note that no *_DEREF ops can happen before MAKE_CELL + // executes. So there's no need to duplicate the work + // that MAKE_CELL would otherwise do later, if it hasn't + // run yet. + if (value != NULL) { + if (PyCell_Check(value) && + Internal_PyFrame_OpAlreadyRan(frame, MAKE_CELL, i)) { + // (likely) MAKE_CELL must have executed already. + value = PyCell_GET(value); + } + // (likely) Otherwise it it is an arg (kind & CO_FAST_LOCAL), + // with the initial value set when the frame was created... + // (unlikely) ...or it was set to some initial value by + // an earlier call to PyFrame_LocalsToFast(). + } + } + } else { + assert(value == NULL); + } + if (value == NULL) { + if (PyObject_DelItem(locals, name) != 0) { + if (PyErr_ExceptionMatches(PyExc_KeyError)) { + PyErr_Clear(); + } else { + return -1; + } + } + } else { + if (PyObject_SetItem(locals, name, value) != 0) { + return -1; + } + } } - return Paddle_PyFrame_MakeAndSetFrameObject(frame); + return 0; } #else @@ -145,37 +269,84 @@ inline static PyObject *eval_frame_default(PyThreadState *tstate, #endif } -// Start a new frame and run code in this frame. -// Execute a piece of code by default frame-hook. -inline static PyObject *eval_custom_code(PyThreadState *tstate, - FrameObject *frame, - PyCodeObject *code, - int throw_flag) { +#if PY_VERSION_HEX >= 0x030b0000 + +inline static PyObject *eval_custom_code_py311_plus(PyThreadState *tstate, + FrameObject *frame, + PyCodeObject *code, + int throw_flag) { + // Create a new PyInterpreterFrame. Refer to CALL. + // PyInterpreterFrame has a head section calls "specials". It follows + // a contiguous section containing localplus and interpreter stack space. + size_t size = code->co_nlocalsplus + code->co_stacksize + FRAME_SPECIALS_SIZE; + CALL_STAT_INC(frames_pushed); + _PyInterpreterFrame *shadow = reinterpret_cast<_PyInterpreterFrame *>( + malloc(sizeof(PyObject *) * size)); + if (shadow == nullptr) { + VLOG(7) << "Failed to allocate memory for shadow frame."; + return nullptr; + } + // Create a new function object from code object. Refer to MAKE_FUNCTION. + PyFunctionObject *func = reinterpret_cast( + PyFunction_New(reinterpret_cast(code), frame->f_globals)); + _PyFrame_InitializeSpecials(shadow, func, nullptr, code->co_nlocalsplus); + + PyObject **fastlocals_old = frame->localsplus; + PyObject **fastlocals_new = shadow->localsplus; + + for (size_t i = 0; i < code->co_nlocalsplus; ++i) { + fastlocals_new[i] = nullptr; + } + + // The namemap to map the name to index in new frame localsplus. 
+ PyObject *namemap = PyDict_New(); + if (namemap == nullptr) { + VLOG(7) << "Failed to create namemap."; + free(shadow); + return nullptr; + } + for (size_t i = 0; i < code->co_nlocalsplus; ++i) { + PyObject *name = PyTuple_GET_ITEM(code->co_localsplusnames, i); + PyObject *index = PyLong_FromSize_t(i); + PyDict_SetItem(namemap, name, index); + } + for (size_t i = 0; i < frame->f_code->co_nlocalsplus; ++i) { + PyObject *name = PyTuple_GET_ITEM(frame->f_code->co_localsplusnames, i); + PyObject *index = PyDict_GetItem(namemap, name); + if (index == nullptr) { + continue; + } + Py_XINCREF(fastlocals_old[i]); + fastlocals_new[PyLong_AsSize_t(index)] = fastlocals_old[i]; + } + + PyObject *result = eval_frame_default(tstate, shadow, throw_flag); + free(shadow); + Py_DECREF(namemap); + return result; +} + +#else + +inline static PyObject *eval_custom_code_py310_minus(PyThreadState *tstate, + FrameObject *frame, + PyCodeObject *code, + int throw_flag) { Py_ssize_t ncells = 0; Py_ssize_t nfrees = 0; Py_ssize_t nlocals_new = code->co_nlocals; Py_ssize_t nlocals_old = frame->f_code->co_nlocals; -#if PY_VERSION_HEX >= 0x030b0000 - ncells = code->co_ncellvars; - nfrees = code->co_nfreevars; -#else ncells = PyTuple_GET_SIZE(code->co_cellvars); nfrees = PyTuple_GET_SIZE(code->co_freevars); -#endif PyFrameObject *shadow = PyFrame_New(tstate, code, frame->f_globals, nullptr); if (shadow == nullptr) { return nullptr; } -#if PY_VERSION_HEX >= 0x030b0000 - PyObject **fastlocals_old = frame->localsplus; - PyObject **fastlocals_new = shadow->f_frame->localsplus; -#else PyObject **fastlocals_old = frame->f_localsplus; PyObject **fastlocals_new = shadow->f_localsplus; -#endif for (Py_ssize_t i = 0; i < nlocals_old; i++) { Py_XINCREF(fastlocals_old[i]); @@ -187,15 +358,26 @@ inline static PyObject *eval_custom_code(PyThreadState *tstate, fastlocals_new[nlocals_new + i] = fastlocals_old[nlocals_old + i]; } -#if PY_VERSION_HEX >= 0x030b0000 - PyObject *result = eval_frame_default(tstate, shadow->f_frame, throw_flag); -#else PyObject *result = eval_frame_default(tstate, shadow, throw_flag); -#endif Py_DECREF(shadow); return result; } +#endif + +// Start a new frame and run code in this frame. +// Execute a piece of code by default frame-hook. +inline static PyObject *eval_custom_code(PyThreadState *tstate, + FrameObject *frame, + PyCodeObject *code, + int throw_flag) { +#if PY_VERSION_HEX >= 0x030b0000 + return eval_custom_code_py311_plus(tstate, frame, code, throw_flag); +#else + return eval_custom_code_py310_minus(tstate, frame, code, throw_flag); +#endif +} + static PyObject *_custom_eval_frame(PyThreadState *tstate, FrameObject *frame, int throw_flag, @@ -203,13 +385,16 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, // https://peps.python.org/pep-0558/#fast-locals-proxy-implementation-details // https://devguide.python.org/internals/interpreter/#all-sorts-of-variables #if PY_VERSION_HEX >= 0x030b0000 - // _PyFrame_GetFrameObject(frame) # this function should be the right answer, - // but nm libpython.so | grep _PyFrame_MakeAndSetFrameObject is a `t' symbol, - // which means it's local to library. we will get a link error if we use it. 
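  // Sketch of the control flow implemented below, for reference only: frames owned
  // by a generator fall through to the default evaluator; otherwise the fast locals
  // are merged into f_locals via the copied Internal_PyFrame_FastToLocalsWithError
  // helper, the callback is temporarily unset, and it is then invoked with a
  // PyInterpreterFrameProxy wrapping the raw _PyInterpreterFrame. The returned
  // object is expected to expose a `code` attribute (a code object or None) and a
  // `disable_eval_frame` flag; depending on them, either the transformed code runs
  // through eval_custom_code or the original frame runs through eval_frame_default,
  // and the callback is re-enabled afterwards.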
if (frame->owner == FRAME_OWNED_BY_GENERATOR) { return eval_frame_default(tstate, frame, throw_flag); } - if (PyFrame_FastToLocalsWithError(Paddle_PyFrame_GetFrameObject(frame)) < 0) { + // PyFrame_FastToLocalsWithError receives a PyFrameObject, but if we created a + // PyFrameObject from a PyInterpreterFrame, it will changes the original + // PyInterpreterFrame and causes a Segmentation Fault when Fallback to run + // original frame. So we pass a PyInterpreterFrame to + // _PyFrame_FastToLocalsWithError directly. But this is an internal API, so we + // copy many code from CPython project into our project. + if (Internal_PyFrame_FastToLocalsWithError(frame) < 0) { #else if (PyFrame_FastToLocalsWithError(frame) < 0) { #endif @@ -236,39 +421,38 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, eval_frame_callback_set(Py_None); #if PY_VERSION_HEX >= 0x030b0000 - PyObject *args = Py_BuildValue("(O)", Paddle_PyFrame_GetFrameObject(frame)); + PyObject *args = Py_BuildValue("(O)", PyInterpreterFrameProxy_New(frame)); #else PyObject *args = Py_BuildValue("(O)", frame); #endif PyObject *result = PyObject_CallObject(callback, args); Py_DECREF(args); VLOG(7) << "After call eval_frame_function and decrease frame."; - // result: GuardedCode + // class CustomCode(Protocal): + // code: CodeType | None + // disable_eval_frame: bool + // result: CustomCode if (result == nullptr) { // internal exception VLOG(7) << "Error happened."; return nullptr; - } else if (result != Py_None) { + } else { // NOTE: Cache is not supported now PyCodeObject *code = reinterpret_cast( PyObject_GetAttrString(result, "code")); PyObject *disable_eval_frame = PyObject_GetAttrString(result, "disable_eval_frame"); + PyObject *out; + VLOG(7) << "Start eval new frame and code."; if (disable_eval_frame != Py_True) { // Re-enable custom behavior eval_frame_callback_set(callback); - VLOG(7) << "Start eval new frame and code."; - PyObject *out; if (reinterpret_cast(code) != Py_None) { out = eval_custom_code(tstate, frame, code, throw_flag); } else { out = eval_frame_default(tstate, frame, throw_flag); } - Py_DECREF(result); - Py_DECREF(code); - return out; } else { - PyObject *out; if (reinterpret_cast(code) != Py_None) { out = eval_custom_code(tstate, frame, code, throw_flag); } else { @@ -276,14 +460,10 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, } // Re-enable custom behavior eval_frame_callback_set(callback); - Py_DECREF(result); - Py_DECREF(code); - return out; } - } else { - // Re-enable custom behavior - eval_frame_callback_set(callback); - return eval_frame_default(tstate, frame, throw_flag); + Py_DECREF(result); + Py_DECREF(code); + return out; } } @@ -414,6 +594,12 @@ void BindEvalFrame(pybind11::module *m) { return obj; }, py::arg("callback")); +#if PY_VERSION_HEX >= 0x030b0000 + if (PyType_Ready(&PyInterpreterFrameProxyType) < 0) { + VLOG(7) << "PyInterpreterFrameProxyType has not been ready!"; + } + Py_INCREF(&PyInterpreterFrameProxyType); +#endif } } // namespace pybind From 177c1397ec774a286fa6a203dd0fa249b685d963 Mon Sep 17 00:00:00 2001 From: kangguangli Date: Thu, 21 Sep 2023 20:20:07 +0800 Subject: [PATCH 38/39] [PIR] register fused_attention in pir (#57557) * register fused_attention in pir * fix * fix --- .../pir/dialect/op_generator/ops_api_gen.py | 1 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 9 + paddle/phi/api/yaml/op_compat.yaml | 35 +++ paddle/phi/infermeta/multiary.cc | 247 ++++++++++++++++++ paddle/phi/infermeta/multiary.h | 49 ++++ test/white_list/new_ir_op_test_white_list 
| 1 + 6 files changed, 342 insertions(+) diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 9f04a9b2fd4b2..e11b2ad1c1bf1 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -95,6 +95,7 @@ 'c_allreduce_max', 'c_allgather', 'seed', + "fused_attention", ] diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 8babc4635b8fb..d3cbc31c2e490 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -170,3 +170,12 @@ args : (Tensor i, Tensor x) output : Tensor[](out) backward: write_to_array_grad + +- op: fused_attention + args: (Tensor x, Tensor ln_scale, Tensor ln_bias, Tensor qkv_weight, Tensor qkv_bias, Tensor cache_kv, Tensor src_mask, Tensor out_linear_weight, Tensor out_linear_bias, Tensor ln_scale_2, Tensor ln_bias_2, int num_heads, bool transpose_qkv_wb, bool pre_layer_norm, float epsilon, float attn_dropout_rate, bool is_test, bool attn_dropout_fix_seed, int attn_dropout_seed, str attn_dropout_implementation, float dropout_rate, bool dropout_fix_seed, int dropout_seed, str dropout_implementation, float ln_epsilon, bool add_residual, int ring_id) + output: Tensor(ln_mean), Tensor(ln_var), Tensor(ln_out), Tensor(qkv_out), Tensor(qkv_bias_out), Tensor(transpose_out_2), Tensor(qk_out), Tensor(qktv_out), Tensor(softmax_out), Tensor(attn_dropout_mask_out), Tensor(attn_dropout_out), Tensor(src_mask_out), Tensor(fmha_out), Tensor(out_linear_out), Tensor(dropout_mask_out), Tensor(ln_mean_2), Tensor(ln_var_2), Tensor(bias_dropout_residual_out), Tensor(cache_kv_out), Tensor(out) + kernel: + func: fused_attention + infer_meta: + func: FusedAttentionInferMeta + optional: cache_kv, ln_scale, ln_bias, qkv_bias, src_mask, out_linear_bias, ln_scale_2, ln_bias_2 diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 8a85147a66da0..63093631e4347 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1181,6 +1181,41 @@ data_type : float support_tensor : true +- op : fused_attention + inputs: + x: X + ln_scale: LnScale + ln_bias: LnBias + qkv_weight: QKVW + qkv_bias: QKVBias + cache_kv: CacheKV + src_mask: SrcMask + out_linear_weight: OutLinearW + out_linear_bias: OutLinearBias + ln_scale_2: Ln2Scale + ln_bias_2: Ln2Bias + outputs: + ln_mean: LnMean + ln_var: LnVariance + ln_out: LnOut + qkv_out: QKVOut + qkv_bias_out: QKVBiasOut + transpose_out_2: TransposeOut2 + qk_out: QKOut + qktv_out: QKTVOut + softmax_out: SoftmaxOut + attn_dropout_mask_out: AttnDropoutMaskOut + attn_dropout_out: AttnDropoutOut + src_mask_out: SrcMaskOut + fmha_out: FMHAOut + out_linear_out: OutLinearOut + dropout_mask_out: DropoutMaskOut + ln_mean_2: Ln2Mean + ln_var_2: Ln2Variance + bias_dropout_residual_out: BiasDropoutResidualOut + cache_kv_out: CacheKVOut + out: Y + - op : fused_batch_norm_act backward : fused_batch_norm_act_grad inputs: diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 8de465867273c..6b09dd22db263 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1604,6 +1604,253 @@ void FusedBiasActInferMeta(const MetaTensor& x, out->set_layout(x.layout()); } +void FusedAttentionInferMeta(const MetaTensor& x, + const MetaTensor& ln_scale, + const MetaTensor& ln_bias, + const MetaTensor& qkv_weight, + const MetaTensor& 
qkv_bias, + const MetaTensor& cache_kv, + const MetaTensor& src_mask, + const MetaTensor& out_linear_weight, + const MetaTensor& out_linear_bias, + const MetaTensor& ln_scale_2, + const MetaTensor& ln_bias_2, + int num_heads, + bool transpose_qkv_wb, + bool pre_layer_norm, + float epsilon, + float attn_dropout_rate, + bool is_test, + bool attn_dropout_fix_seed, + int attn_dropout_seed, + const std::string& attn_dropout_implementation, + float dropout_rate, + bool dropout_fix_seed, + int dropout_seed, + const std::string& dropout_implementation, + float ln_epsilon, + bool add_residual, + int ring_id, + MetaTensor* ln_mean, + MetaTensor* ln_var, + MetaTensor* ln_out, + MetaTensor* qkv_out, + MetaTensor* qkv_bias_out, + MetaTensor* transpose_out_2, + MetaTensor* qk_out, + MetaTensor* qktv_out, + MetaTensor* softmax_out, + MetaTensor* attn_dropout_mask_out, + MetaTensor* attn_dropout_out, + MetaTensor* src_mask_out, + MetaTensor* fmha_out, + MetaTensor* out_linear_out, + MetaTensor* dropout_mask_out, + MetaTensor* ln_mean_2, + MetaTensor* ln_var_2, + MetaTensor* bias_dropout_residual_out, + MetaTensor* cache_kv_out, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + auto y_dim = qkv_weight.dims(); + + int dim_head = 0; + int hidden_size = 0; + int nranks = 1; + if (transpose_qkv_wb) { + PADDLE_ENFORCE_EQ(y_dim.size(), + 2, + phi::errors::InvalidArgument( + "The dimensions of qkv_weight must be 2 if enable" + "transpose_qkv_wb: (dim_embed, 3 * dim_embed)," + "but received dimensions of" + "Input is [%d]", + y_dim.size())); + PADDLE_ENFORCE_GT(num_heads, + 0, + phi::errors::InvalidArgument( + "The num_heads must be provided and greater than 0 " + "if enable transpose_qkv_wb, but we got %d.", + num_heads)); + PADDLE_ENFORCE_EQ(y_dim[0] % num_heads, + 0, + phi::errors::InvalidArgument( + "First dim of qkv_w must be divisible by num heads " + "if enable transpose_qkv_wb, but receive first " + "dim of qkv_w is %d and num_heads is %d.", + y_dim[0], + num_heads)); + if (ring_id == -1) { + PADDLE_ENFORCE_EQ( + y_dim[0] * 3, + y_dim[1], + phi::errors::InvalidArgument("The dimensions of qkv_weight must be 2" + "(dim_embed, 3 * dim_embed).")); + } else { + // compute the mp nranks + nranks = (y_dim[0] * 3) / y_dim[1]; + } + dim_head = y_dim[0] / (num_heads * nranks); + hidden_size = y_dim[0]; + } else { + PADDLE_ENFORCE_EQ(y_dim.size(), + 4, + phi::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4 if not" + "enable transpose_qkv_wb: (3, num_head, dim_head, " + "dim_embed), but received [%d]", + y_dim.size())); + PADDLE_ENFORCE_EQ( + y_dim[0], + 3, + phi::errors::InvalidArgument("First dim of qkv_w must be 3 if disable " + "transpose_qkv_wb, but we got %d.", + y_dim[0])); + if (ring_id == -1) { + PADDLE_ENFORCE_EQ( + y_dim[1] * y_dim[2], + y_dim[3], + phi::errors::InvalidArgument("The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } + num_heads = y_dim[1]; + dim_head = y_dim[2]; + hidden_size = y_dim[3]; + } + + PADDLE_ENFORCE_EQ( + x_dim.size(), + 3, + phi::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + + PADDLE_ENFORCE_EQ(x_dim[2], + hidden_size, + phi::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3] " + "(y_dim[1] if enable transpose_qkv_w) " + "must be equal. 
But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, + y_dim)); + + if (pre_layer_norm) { + ln_mean->set_dims({x_dim[0] * x_dim[1]}); + ln_var->set_dims({x_dim[0] * x_dim[1]}); + ln_out->set_dims(x.dims()); + } else { + ln_mean_2->set_dims({x_dim[0] * x_dim[1]}); + ln_var_2->set_dims({x_dim[0] * x_dim[1]}); + bias_dropout_residual_out->set_dims(x.dims()); + } + + if (transpose_qkv_wb) { + // [batch_size, seq_len, 3 * num_heads * dim_head] + qkv_out->set_dims({x_dim[0], x_dim[1], 3 * num_heads * dim_head}); + + if (qkv_bias) { + qkv_bias_out->set_dims({x_dim[0], x_dim[1], 3 * num_heads * dim_head}); + } + } else { + // [batch_size, seq_len, 3, num_head, head_size] + qkv_out->set_dims({x_dim[0], x_dim[1], 3, num_heads, dim_head}); + + if (qkv_bias) { + qkv_bias_out->set_dims({x_dim[0], x_dim[1], 3, num_heads, dim_head}); + } + } + + // [3, batch_size, num_head, seq_len, head_size] + transpose_out_2->set_dims({3, x_dim[0], num_heads, x_dim[1], dim_head}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (cache_kv) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = cache_kv.dims(); + + PADDLE_ENFORCE_EQ( + c_dim.size(), + 5, + phi::errors::InvalidArgument("The CacheKV must be 5 dims, but got %d", + c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], + 2, + phi::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], + x_dim[0], + phi::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], + c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], + num_heads, + phi::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + num_heads, + c_dim[2])); // num_head + // In compile stage, input seq_len can be -1, in that case + // c_dim[3] may < 0 in while + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + c_dim[3], + 0, + phi::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + } + + PADDLE_ENFORCE_EQ(c_dim[4], + dim_head, + phi::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + dim_head, + c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + cache_kv_out->set_dims( + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + // [batch, num_head, seq_len, out_seq_len] + qk_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + + if (src_mask) { + src_mask_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + } + // the same as QKOut's shape. 
+ attn_dropout_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + if (is_test) { + attn_dropout_mask_out->set_dims( + {x_dim[0], num_heads, x_dim[1], out_seq_len}); + } + softmax_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + // [batch_size, num_heads, seq_len, head_dim] + qktv_out->set_dims({x_dim[0], num_heads, x_dim[1], dim_head}); + // [batch_size, seq_len, number of heads*head size] + fmha_out->set_dims({x_dim[0], x_dim[1], num_heads, dim_head}); + + out_linear_out->set_dims(x.dims()); + + if (is_test == false) { + dropout_mask_out->set_dims(x.dims()); + } + + out->set_dims(x.dims()); +} + void FusedLayerNormInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& residual, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index ee62d6d51d655..aaa4787968538 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -334,6 +334,55 @@ void FusedBiasActInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void FusedAttentionInferMeta(const MetaTensor& x, + const MetaTensor& ln_scale, + const MetaTensor& ln_bias, + const MetaTensor& qkv_weight, + const MetaTensor& qkv_bias, + const MetaTensor& cache_kv, + const MetaTensor& src_mask, + const MetaTensor& out_linear_weight, + const MetaTensor& out_linear_bias, + const MetaTensor& ln_scale_2, + const MetaTensor& ln_bias_2, + int num_heads, + bool transpose_qkv_wb, + bool pre_layer_norm, + float epsilon, + float attn_dropout_rate, + bool is_test, + bool attn_dropout_fix_seed, + int attn_dropout_seed, + const std::string& attn_dropout_implementation, + float dropout_rate, + bool dropout_fix_seed, + int dropout_seed, + const std::string& dropout_implementation, + float ln_epsilon, + bool add_residual, + int ring_id, + MetaTensor* ln_mean, + MetaTensor* ln_var, + MetaTensor* ln_out, + MetaTensor* qkv_out, + MetaTensor* qkv_bias_out, + MetaTensor* transpose_out_2, + MetaTensor* qk_out, + MetaTensor* qktv_out, + MetaTensor* softmax_out, + MetaTensor* attn_dropout_mask_out, + MetaTensor* attn_dropout_out, + MetaTensor* src_mask_out, + MetaTensor* fmha_out, + MetaTensor* out_linear_out, + MetaTensor* dropout_mask_out, + MetaTensor* ln_mean_2, + MetaTensor* ln_var_2, + MetaTensor* bias_dropout_residual_out, + MetaTensor* cache_kv_out, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void FusedLayerNormInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& residual, diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list index b85c88fa6bb18..3dc336a187718 100644 --- a/test/white_list/new_ir_op_test_white_list +++ b/test/white_list/new_ir_op_test_white_list @@ -88,6 +88,7 @@ test_fmax_op test_fmin_op test_fold_op test_frame_op +test_fused_attention_op_api test_gather_tree_op test_gaussian_random_op test_generate_proposals_v2_op From 85622b9d6c543c758f6405454ccd403e2ba9aa7d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 22 Sep 2023 02:50:35 +0000 Subject: [PATCH 39/39] fix merge conflict --- .../dialect/op_generator/op_creator_drr_gen.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py b/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py index 5f75063668ee0..01512a7d5b38d 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py @@ -79,9 +79,14 @@ def parse_yaml(self, 
op_yaml_files, op_compat_yaml_file): op_yaml_items = op_yaml_items + ops op_info_items = [] for op in op_yaml_items: - op_info_items.append( - OpInfoParser(op, op_compat_parser.get_compat(op['name'])) - ) + op_compat_item = op_compat_parser.get_compat(op['name']) + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items.append(OpInfoParser(op, op_compat_item)) return op_info_items def gen_cpp_file_code(self, cpp_file_path): @@ -117,9 +122,7 @@ def gen_cpp_file_code(self, cpp_file_path): if len(op_info_item.attribute_name_list) > len( op_info_item.mutable_attribute_name_list ): - # TODO(zyfncg): Currently Op::Build Interface doesn't support this case. - continue - # params_with_mutable_attr.append("attrs") + params_with_mutable_attr.append("attrs") body_code += MUTABLE_ATTR_FUNCTION_TEMPLATE.format( op_name=ir_op_name,
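
A note on the parse_yaml change in the hunk above: instead of passing the raw result of op_compat_parser.get_compat() straight to OpInfoParser, the generator now special-cases the 'pow' compat entry and replaces the whole compat item with its nested 'scalar' sub-mapping. The sketch below restates that guard as a standalone helper so the control flow is easy to test in isolation; the example dict is hypothetical (the real 'pow' entry lives in paddle/phi/api/yaml/op_compat.yaml and is not part of this patch), and normalize_compat_item is not a name that exists in the generator.

# Minimal sketch of the compat-item normalization added to parse_yaml.
# The example dict is hypothetical; the actual 'pow' entry in
# op_compat.yaml is not shown in this patch series.
def normalize_compat_item(op_compat_item):
    # For 'pow', keep only the nested 'scalar' sub-mapping; the generator
    # hands that sub-dict (not the full compat item) to OpInfoParser.
    if (
        op_compat_item is not None
        and op_compat_item.get('op') == "pow"
        and 'scalar' in op_compat_item
    ):
        return op_compat_item.pop('scalar')
    return op_compat_item


# Hypothetical usage:
pow_compat = {'op': 'pow', 'scalar': {'y': {'data_type': 'float'}}}
print(normalize_compat_item(pow_compat))  # {'y': {'data_type': 'float'}}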
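
Returning to the FusedAttentionInferMeta hunk added earlier in this series: every set_dims call there follows directly from the x and qkv_weight dims. Below is a minimal standalone sketch of that shape arithmetic for the transpose_qkv_wb == false, ring_id == -1 case. It is illustrative only — the helper name, the plain-tuple interface, and the example sizes are made up here, not phi APIs — and it covers the main attention outputs, not the layer-norm or dropout bookkeeping tensors.

# Illustrative shape arithmetic for the non-transposed qkv_weight layout
# (3, num_heads, dim_head, dim_embed) with num_heads * dim_head == dim_embed.
# This mirrors the set_dims calls in FusedAttentionInferMeta but is not the
# phi implementation; the function name and return format are assumptions.

def fused_attention_shapes(x_shape, qkv_w_shape, cache_kv_shape=None):
    batch, seq_len, dim_embed = x_shape
    three, num_heads, dim_head, hidden = qkv_w_shape
    assert three == 3 and num_heads * dim_head == hidden == dim_embed

    # Attention scores attend over cached keys plus the new sequence.
    cache_len = cache_kv_shape[3] if cache_kv_shape is not None else 0
    out_seq_len = seq_len + cache_len

    shapes = {
        "qkv_out": (batch, seq_len, 3, num_heads, dim_head),
        "transpose_out_2": (3, batch, num_heads, seq_len, dim_head),
        "qk_out": (batch, num_heads, seq_len, out_seq_len),
        "softmax_out": (batch, num_heads, seq_len, out_seq_len),
        "qktv_out": (batch, num_heads, seq_len, dim_head),
        "fmha_out": (batch, seq_len, num_heads, dim_head),
        "out": (batch, seq_len, dim_embed),
    }
    if cache_kv_shape is not None:
        # New keys/values are appended along the sequence axis of the cache.
        shapes["cache_kv_out"] = (2, batch, num_heads, out_seq_len, dim_head)
    return shapes


# Example: batch=2, seq_len=8, 4 heads of size 16 (dim_embed=64), no cache.
print(fused_attention_shapes((2, 8, 64), (3, 4, 16, 64)))

With a cache_kv of shape (2, 2, 4, 5, 16) the same call would give out_seq_len = 13 and cache_kv_out = (2, 2, 4, 13, 16), matching the cache_kv_out->set_dims call in the hunk.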