From 7d2e856ec0d1af7fb5b0cbe561574cab7ebb6db9 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Thu, 14 Sep 2023 02:52:59 +0800
Subject: [PATCH 01/16] add copysign rfc

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 257 ++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 rfcs/APIs/20230914_api_design_for_copysign.md

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
new file mode 100644
index 000000000..d320f5efc
--- /dev/null
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -0,0 +1,257 @@
+# paddle.copysign 设计文档
+
+| API 名称     | paddle.copysign                  |
+| ------------ | -------------------------------- |
+| 提交作者     | coco                             |
+| 提交时间     | 2023-09-14                       |
+| 版本号       | V1.0                             |
+| 依赖飞桨版本 | develop                          |
+| 文件名       | 20230914_api_design_for_copysign |
+
+# 一、概述
+
+## 1、相关背景
+
+为了提升飞桨API丰富度,Paddle需要扩充API,调用路径为:
+
+- paddle.copysign 作为独立的函数调用,非 inplace;
+- paddle.copysign_,作为独立的函数,inplace 地修改输入;
+- Tensor.copysign 作为 Tensor 的方法使用,非 inplace;
+- Tensor.copysign_ 作为 Tensor 的方法使用,inplace 修改输入。
+
+## 2、功能目标
+
+根据两个输入逐元素地计算结果张量,其结果由第一个输入的绝对值大小及第二个输入的符号组成。
+
+## 3、意义
+
+飞桨支持直接通过张量进行批量的正负符号复制。
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## PyTorch
+
+PyTorch中有API `torch.copysign(input, other, *, out=None) → Tensor`,以及对应的 `torch.Tensor.copysign`。
+
+在PyTorch中介绍为:
+
+```
+Create a new floating-point tensor with the magnitude of input and the sign of other, elementwise.
+
+Supports broadcasting to a common shape, and integer and float inputs.
+```
+
+## 实现方法
+
+从实现方法上,PyTorch是通过 C++ 实现的,[CPU kernel代码位置](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L1148-L1158)
+
+```cpp
+void copysign_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
+    cpu_kernel_vec(iter,
+      [](scalar_t a, scalar_t b) -> scalar_t {
+        return c10::copysign(a, b);
+      },
+      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
+        return a.copysign(b);
+      });
+  });
+}
+```
+
+在 c10 namespace 中,[代码位置](https://github.com/pytorch/pytorch/blob/main/c10/util/copysign.h#L12-L15):
+
+```cpp
+namespace c10 {
+
+// Note: Explicit implementation of copysign for Half and BFloat16
+// is needed to workaround g++-7/8 crash on aarch64, but also makes
+// copysign faster for the half-precision types
+template <typename T, typename U>
+inline auto copysign(const T& a, const U& b) {
+  return std::copysign(a, b);
+}
+...
+} // namespace c10
+```
+
+[cuda kernel代码位置](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/CopysignKernel.cu#L23-L29)
+
+```cpp
+namespace at::native {
+
+void copysign_kernel_cuda(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
+      return c10::cuda::compat::copysign(a, b);
+    });
+  });
+}
+
+REGISTER_DISPATCH(copysign_stub, &copysign_kernel_cuda);
+
+} // namespace at::native
+```
+
+namespace 中的 `copysign` 调用,[代码位置](https://github.com/pytorch/pytorch/blob/main/c10/cuda/CUDAMathCompat.h#L46-L65)
+
+```cpp
+__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+  return ::copysignf(x, y);
+#else
+  // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
+  // (e.g. Jetson), see PyTorch PR #51834
+  // This host function needs to be here for the compiler but is never used
+  TORCH_INTERNAL_ASSERT(
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+  return ::copysign(x, y);
+#else
+  // see above
+  TORCH_INTERNAL_ASSERT(
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+```
+
+以上各实现最终都由底层 C++ 调用 copysign 函数完成。
+
+**反向backward:**
+
+算子配置[代码位置](https://github.com/pytorch/pytorch/blob/main/tools/autograd/derivatives.yaml#L474-L481C28)
+
+```yaml
+- name: copysign.Tensor(Tensor self, Tensor other) -> Tensor
+  self: copysign_tensor_self_backward(grad, self, result)
+  other: zeros_like(other)
+  result: copysign_tensor_self_backward(self_t, self_p, result)
+
+- name: copysign.Scalar(Tensor self, Scalar other) -> Tensor
+  self: copysign_tensor_self_backward(grad, self, result)
+  result: auto_element_wise
+```
+
+backward 反向[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/csrc/autograd/FunctionsManual.cpp#L94-L101)
+
+```cpp
+Tensor copysign_tensor_self_backward(
+    const Tensor& grad,
+    const Tensor& self,
+    const Tensor& result) {
+  auto ratio = result / self;
+  ratio.masked_fill_(self == 0, 0);
+  return grad * ratio;
+}
+```
+
+## TensorFlow
+
+无`copysign`实现
+
+## Numpy
+
+numpy.**copysign**(*x1*, *x2*, */*, *out=None*, ***, *where=True*, *casting='same_kind'*, *order='K'*, *dtype=None*, *subok=True*[, *signature*, *extobj*]) *= <ufunc 'copysign'>*
+
+Change the sign of x1 to that of x2, element-wise. If *x2* is a scalar, its sign will be copied to all elements of *x1*.
+
+### 实现方法
+
+先通过模板生成各类型的函数,底层由 C++ 调用实现,[代码位置](https://github.com/numpy/numpy/blob/main/numpy/core/src/umath/loops.c.src#L1213-L1221)
+
+```
+NPY_NO_EXPORT void
+@TYPE@_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *((@type@ *)op1)= npy_copysign@c@(in1, in2);
+    }
+}
+```
+
+实际调用 C++ 的 math 库,[代码位置](https://github.com/numpy/numpy/blob/main/numpy/core/include/numpy/npy_math.h#L199)
+
+```cpp
+#include <math.h>
+
+...
+#define npy_copysign copysign
+...
+```
+
+# 四、对比分析
+
+PyTorch和Numpy实现方式基本一致,都是在底层调用 C++ math 库实现 `copysign`;不同的是,PyTorch 还支持 backward 反向计算。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为:
+
+- paddle.copysign(x, y) 作为独立的函数调用,非 inplace;
+- paddle.copysign_(x, y),作为独立的函数,inplace 地修改输入;
+- Tensor.copysign(y) 作为 Tensor 的方法使用,非 inplace;
+- Tensor.copysign_(y) 作为 Tensor 的方法使用,inplace 修改输入。
+
+其中
+
++ x(Tensor) - 需要取用绝对值作为输出数值部分的Tensor
++ y(Tensor, int, float 等 number)
+
+## 底层OP设计
+
+参考PyTorch与Numpy中的设计,调用底层cpp实现OP
+
+## API实现方案
+
+1. 配置算子的yaml,注意配置inplace
+2. 实现`CopySignInferMeta`,在调用kernel之前计算好`out`的`shape`和`dtype`
+3. 实现`CopySignKernel`的CPU和GPU代码以及forward、backward
+4. 封装Python的API,支持动态图和静态图,编写文档
+5. 编写单测
+
+# 六、测试和验收的考量
+
+测试考虑的case如下:
+
++ **编程范式场景**:常规覆盖动态图和静态图的测试场景
+
++ **硬件场景**:常规需覆盖 CPU、GPU 两种测试场景
++ **参数组合场景**:常规覆盖 API 的全部入参,需要对全部入参进行参数有效性和边界值测试,同时可选参数也需有相应的测试覆盖
++ **计算精度**:需要保证前向计算、反向计算的精度正确性
+  + 前向计算:与通过 numpy 实现的函数的结果进行对比
+  + 反向计算:通过 numpy 推导,验证反向计算结果的正确性
++ **维度测试**:Paddle API 支持的最低维度为 0 维,单测中应编写相应的 0 维尺寸测试 case
++ **边界测试**:y为0、+0、-0时,测试与numpy结果的一致性
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API,对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.copysign.html?highlight=copysign#torch.copysign)
+
+[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.copysign.html#numpy-copysign)
\ No newline at end of file

From 4c0022db6fef9cd3c0029f630a265bc9ac64caed Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 12:02:14 +0800
Subject: [PATCH 02/16] fix input args, add backward kernel, fix python api

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 50 ++++++++++++++++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
index d320f5efc..68a47cb3e 100644
--- a/rfcs/APIs/20230914_api_design_for_copysign.md
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -202,19 +202,55 @@ API的设计为:
 
-- paddle.copysign(x, y) 作为独立的函数调用,非 inplace;
-- paddle.copysign_(x, y),作为独立的函数,inplace 地修改输入;
-- Tensor.copysign(y) 作为 Tensor 的方法使用,非 inplace;
-- Tensor.copysign_(y) 作为 Tensor 的方法使用,inplace 修改输入。
+- paddle.copysign(x, y, name=None) 作为独立的函数调用,非 inplace;
+- paddle.copysign_(x, y, name=None),作为独立的函数,inplace 地修改输入;
+- Tensor.copysign(y, name=None) 作为 Tensor 的方法使用,非 inplace;
+- Tensor.copysign_(y, name=None) 作为 Tensor 的方法使用,inplace 修改输入。
 
 其中
 
-+ x(Tensor) - 需要取用绝对值作为输出数值部分的Tensor
-+ y(Tensor, int, float 等 number)
++ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `int32`、`int64`、`float32`、`float64`
++ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `int32`、`int64`、`float32`、`float64`
 
 ## 底层OP设计
 
-参考PyTorch与Numpy中的设计,调用底层cpp实现OP
+参考PyTorch与Numpy中的设计,调用底层cpp实现OP,反向 kernel impl 大致如下:
+
+```cpp
+template <typename T>
+struct CopySignGradFunctor {
+  CopySignGradFunctor(const T* x_data, const T* y_data, const T* dout, T* dx, int64_t numel)
+      : x_data_(x_data), y_data_(y_data), dout_(dout), dx_(dx), numel_(numel) {}
+
+  // backward 逻辑如下
+  HOSTDEVICE void operator()(int64_t idx) const {
+    if (x_data_[idx] == T(0)) dx_[idx] = T(0);
+    else dx_[idx] = T(dout_[idx]) * (T(std::copysign(x_data_[idx], y_data_[idx]) / x_data_[idx]));
+  }
+
+  const T* x_data_;
+  const T* y_data_;
+  const T* dout_;
+  T* dx_;
+  int64_t numel_;
+};
+
+template <typename T, typename Context>
+void CopySignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        const DenseTensor& out_grad,
+                        DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  auto x_data = x.data<T>(), y_data = y.data<T>(), out_grad_data = out_grad.data<T>();
+  auto x_grad_data = x_grad->data<T>();
+  phi::funcs::ForRange<Context> for_range(dev_ctx, x.numel());
+  phi::CopySignGradFunctor<T> functor(x_data, y_data, out_grad_data, x_grad_data, x.numel());
+  for_range(functor);
+}
+```
+
 
 ## API实现方案
 

From 5fff675576d71b8f17f07f1cad7bbd28a9c0412e Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 14:37:32 +0800
Subject: [PATCH 03/16] fix types

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md 
b/rfcs/APIs/20230914_api_design_for_copysign.md index 68a47cb3e..742299e43 100644 --- a/rfcs/APIs/20230914_api_design_for_copysign.md +++ b/rfcs/APIs/20230914_api_design_for_copysign.md @@ -209,8 +209,8 @@ API的设计为: 其中 -+ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `int32`、`int64`、`float32`、`float64` -+ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `int32`、`int64`、`float32`、`float64` ++ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` ++ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` ## 底层OP设计 From bfa46fec6518b3784294cfd2ad6faaeb3c814f1e Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Fri, 15 Sep 2023 14:56:00 +0800 Subject: [PATCH 04/16] fix Number types --- rfcs/APIs/20230914_api_design_for_copysign.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md index 742299e43..f2a816d4d 100644 --- a/rfcs/APIs/20230914_api_design_for_copysign.md +++ b/rfcs/APIs/20230914_api_design_for_copysign.md @@ -210,7 +210,7 @@ API的设计为: 其中 + x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` -+ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` ++ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape,支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`;为 Number 时,支持 `bool`、`int`、`float` ## 底层OP设计 From 72fbcfc9ede2814b78780372175eaf35402187b9 Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Wed, 27 Sep 2023 06:40:36 +0800 Subject: [PATCH 05/16] add pdist api design --- rfcs/20230926_api_design_for_pdist.md | 409 ++++++++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 rfcs/20230926_api_design_for_pdist.md diff --git a/rfcs/20230926_api_design_for_pdist.md b/rfcs/20230926_api_design_for_pdist.md new file mode 100644 index 000000000..4ffe30875 --- /dev/null +++ b/rfcs/20230926_api_design_for_pdist.md @@ -0,0 +1,409 @@ +# paddle.pdist设计文档 + +| API 名称 | paddle.pdist | +| ------------ | ----------------------------- | +| 提交作者 | coco | +| 提交时间 | 2023-09-26 | +| 版本号 | V1.0 | +| 依赖飞桨版本 | develop | +| 文件名 | 20230926_api_defign_for_pdist | + +# 一、概述 + +## 1、相关背景 + +为paddle新增该API,为计算N个向量两两之间的p-norm距离。 + +## 2、功能目标 + +一个矩阵`A`的大小为`MxN`,那么`B=pdist(A)`得到的矩阵B的大小为1行`M*(M-1)/2`列,表示的意义是M行数据,每两行计算一下p-norm距离,默认欧式距离。例如a = [[0.0, 1.0],[2.0,3.0],[4.0,5.0],[6.0,7.0]],输出为[2.8284, 5.6569, 8.4853, 2.8284, 5.6569, 2.8284]。输出顺序为distance(第一行,第二行), distance(第一行,第三行), ... distance(第二行,第三行)... + +## 3、意义 + +飞桨支持直接两两计算向量间的距离。 + +# 二、飞桨现状 + +目前paddle缺少相关功能实现。 + +# 三、业内方案调研 + +## Scipy + +Scipy中有API`scipy.spatial.distance.pdist` + +在Scipy中介绍为: + +``` +Pairwise distances between observations in n-dimensional space. 
+``` + +## 实现方法 + +从实现方法上,Scipy是通过py实现的,[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L2195-L2233) + +```python + X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True, + check_finite=False) + + s = X.shape + if len(s) != 2: + raise ValueError('A 2-dimensional array must be passed.') + + m, n = s + + if callable(metric): + mstr = getattr(metric, '__name__', 'UnknownCustomMetric') + metric_info = _METRIC_ALIAS.get(mstr, None) + + if metric_info is not None: + X, typ, kwargs = _validate_pdist_input( + X, m, n, metric_info, **kwargs) + + return _pdist_callable(X, metric=metric, out=out, **kwargs) + elif isinstance(metric, str): + mstr = metric.lower() + metric_info = _METRIC_ALIAS.get(mstr, None) + + if metric_info is not None: + pdist_fn = metric_info.pdist_func + _extra_windows_error_checks(X, out, (m * (m - 1) / 2,), **kwargs) + return pdist_fn(X, out=out, **kwargs) + elif mstr.startswith("test_"): + metric_info = _TEST_METRICS.get(mstr, None) + if metric_info is None: + raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}') + X, typ, kwargs = _validate_pdist_input( + X, m, n, metric_info, **kwargs) + return _pdist_callable( + X, metric=metric_info.dist_func, out=out, **kwargs) + else: + raise ValueError('Unknown Distance Metric: %s' % mstr) + else: + raise TypeError('2nd argument metric must be a string identifier ' + 'or a function.') +``` + +先找到`mertric`对应的函数,然后call调用,例如`metric`为`euclidean`时,调用`euclidean`的函数。[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L1781C1-L1787C7) + + + +```python + MetricInfo( + canonical_name='euclidean', + aka={'euclidean', 'euclid', 'eu', 'e'}, + dist_func=euclidean, + cdist_func=_distance_pybind.cdist_euclidean, + pdist_func=_distance_pybind.pdist_euclidean, + ), +``` + +[euclidean调用minkowski](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L500-L536)和[minkowski实现](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L429-L497) + +```python +def euclidean(u, v, w=None): + return minkowski(u, v, p=2, w=w) + + +def minkowski(u, v, p=2, w=None): + u = _validate_vector(u) + v = _validate_vector(v) + if p <= 0: + raise ValueError("p must be greater than 0") + u_v = u - v + if w is not None: + w = _validate_weights(w) + if p == 1: + root_w = w + elif p == 2: + # better precision and speed + root_w = np.sqrt(w) + elif p == np.inf: + root_w = (w != 0) + else: + root_w = np.power(w, 1/p) + u_v = root_w * u_v + dist = norm(u_v, ord=p) + return dist +``` + +主要是调用`norm`实现计算 + +```python +def norm(x, ord=None, axis=None): + if not issparse(x): + raise TypeError("input is not sparse. use numpy.linalg.norm") + + # Check the default case first and handle it immediately. + if axis is None and ord in (None, 'fro', 'f'): + return _sparse_frobenius_norm(x) + + # Some norms require functions that are not implemented for all types. 
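+    # (先统一转换成 CSR,使下面各分支用到的 abs/sum/power 等操作都有可用实现)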
+ x = x.tocsr() + + if axis is None: + axis = (0, 1) + elif not isinstance(axis, tuple): + msg = "'axis' must be None, an integer or a tuple of integers" + try: + int_axis = int(axis) + except TypeError as e: + raise TypeError(msg) from e + if axis != int_axis: + raise TypeError(msg) + axis = (int_axis,) + + nd = 2 + if len(axis) == 2: + row_axis, col_axis = axis + if not (-nd <= row_axis < nd and -nd <= col_axis < nd): + raise ValueError('Invalid axis %r for an array with shape %r' % + (axis, x.shape)) + if row_axis % nd == col_axis % nd: + raise ValueError('Duplicate axes given.') + if ord == 2: + # Only solver="lobpcg" supports all numpy dtypes + _, s, _ = svds(x, k=1, solver="lobpcg") + return s[0] + elif ord == -2: + raise NotImplementedError + #return _multi_svd_norm(x, row_axis, col_axis, amin) + elif ord == 1: + return abs(x).sum(axis=row_axis).max(axis=col_axis)[0,0] + elif ord == np.inf: + return abs(x).sum(axis=col_axis).max(axis=row_axis)[0,0] + elif ord == -1: + return abs(x).sum(axis=row_axis).min(axis=col_axis)[0,0] + elif ord == -np.inf: + return abs(x).sum(axis=col_axis).min(axis=row_axis)[0,0] + elif ord in (None, 'f', 'fro'): + # The axis order does not matter for this norm. + return _sparse_frobenius_norm(x) + else: + raise ValueError("Invalid norm order for matrices.") + elif len(axis) == 1: + a, = axis + if not (-nd <= a < nd): + raise ValueError('Invalid axis %r for an array with shape %r' % + (axis, x.shape)) + if ord == np.inf: + M = abs(x).max(axis=a) + elif ord == -np.inf: + M = abs(x).min(axis=a) + elif ord == 0: + # Zero norm + M = (x != 0).sum(axis=a) + elif ord == 1: + # special case for speedup + M = abs(x).sum(axis=a) + elif ord in (2, None): + M = sqrt(abs(x).power(2).sum(axis=a)) + else: + try: + ord + 1 + except TypeError as e: + raise ValueError('Invalid norm order for vectors.') from e + M = np.power(abs(x).power(ord).sum(axis=a), 1 / ord) + if hasattr(M, 'toarray'): + return M.toarray().ravel() + elif hasattr(M, 'A'): + return M.A.ravel() + else: + return M.ravel() + else: + raise ValueError("Improper number of dimensions to norm.") +``` + + + + + + + + + +## PyTorch + +Parameters: + +- **input** – input tensor of shape N×M. +- **p** – p value for the p-norm distance to calculate between each vector pair ∈[0,∞]∈[0,∞]. + +并且有相关描述: + +This function is equivalent to `scipy.spatial.distance.pdist(input, 'minkowski', p=p)` if p∈(0,∞). When p=0 it is equivalent to `scipy.spatial.distance.pdist(input, 'hamming') * M`. When p=∞, the closest scipy function is `scipy.spatial.distance.pdist(xn, lambda x, y: np.abs(x - y).max())`. 
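+
+基于上述等价关系,下面给出一个纯 NumPy 的参考实现草图(仅为示意;函数名 `pdist_ref` 是本文假设的名称,并非 Paddle 或 SciPy 的现有接口),可作为单测中与 `paddle.pdist` 对比数值的基准:
+
+```python
+import numpy as np
+
+def pdist_ref(x, p=2.0):
+    # x: [M, N];返回长度为 M*(M-1)/2 的一维数组,
+    # 顺序为 (0,1), (0,2), ..., (0,M-1), (1,2), ...
+    i, j = np.triu_indices(x.shape[0], k=1)  # 上三角(不含对角线)的配对下标
+    diff = np.abs(x[i] - x[j])               # 每个配对的逐元素差的绝对值
+    if p == 0:                               # p=0:非零元素个数,即 hamming * N
+        return (diff != 0).sum(axis=1).astype(x.dtype)
+    if np.isinf(p):                          # p=inf:切比雪夫距离
+        return diff.max(axis=1)
+    return (diff ** p).sum(axis=1) ** (1.0 / p)
+
+a = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0]])
+print(pdist_ref(a))  # [2.8284 5.6569 8.4853 2.8284 5.6569 2.8284]
+```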
+ + + +相关[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L58-L64) + +```cpp +Tensor pdist(const Tensor& self, const double p) { + TORCH_CHECK(self.dim() == 2, + "pdist only supports 2D tensors, got: ", self.dim(), "D"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); + TORCH_CHECK(p >= 0, "pdist only supports non-negative p values"); + return at::_pdist_forward(self.contiguous(), p); +} +``` + +调用`_pdist_forward`,[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L244-L262) + +```cpp +Tensor _pdist_forward(const Tensor& self, const double p) { + TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); + auto device = self.device().type(); + TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); + Tensor result = at::empty({0}, self.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (self.size(0) <= 1) { + result.resize_({0}); + } else { + int64_t n = self.size(0); + int64_t c = n * (n - 1) / 2; + result.resize_({c}); + if (self.size(1) == 0) { + result.fill_(0); + } else { + pdist_forward_stub(device, result, self, p); + } + } + return result; +} +``` + +主要调用`pdist_forward_stub`,绑定了具体的`pdist_forward_kernel_impl` + +```cpp +REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl); +``` + +([CPU](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L446)和[CUDA](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cuda/DistanceKernel.cu#L360)实现绑定了同一个`pdist_forward_kernel_impl`) + +而后`pdist_forward_kernel_impl`的[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L419C1-L423C2) + +```cpp +void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) { + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist", [&] { + Dist::apply_pdist(result, self, p); + }); +} +``` + +调用`apply_pdist`,[代码位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L190-L202) + +```cpp + // Assumes self is nonempty, contiguous, and 2D + static void apply_pdist(Tensor& result, const Tensor& self, const scalar_t p) { + if (p == 0.0) { + run_parallel_pdist>(result, self, p); + } else if (p == 1.0) { + run_parallel_pdist>(result, self, p); + } else if (p == 2.0) { + run_parallel_pdist>(result, self, p); + } else if (std::isinf(p)) { + run_parallel_pdist>(result, self, p); + } else { + run_parallel_pdist>(result, self, p); + } + } +``` + +`run_parallel_pdist`具体实现 + +```cpp + template + static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) { + const scalar_t * const self_start = self.data_ptr(); + const scalar_t * const self_end = self_start + self.numel(); + int64_t n = self.size(0); + int64_t m = self.size(1); + + scalar_t * const res_start = result.data_ptr(); + int64_t combs = result.numel(); // n * (n - 1) / 2 + + // We conceptually iterate over tuples of (i, j, k) where i is the first + // vector from the input, j is the second, and k is the result index. This + // parallelizes over the range of k and infers what i and j are from the + // value of k. 
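+    // 对满足 i < j 的点对 (i, j),压缩后的下标为 k = i*n - i*(i+1)/2 + (j - i - 1);
+    // 下面的闭式解由该二次关系对 i 反解而来(其中 n2 = n - 0.5),j 再由同一恒等式解出。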
+ parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) { + const Vec pvec(p); + double n2 = n - .5; + // The -1 accounts for floating point truncation issues + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); + int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; + + const scalar_t * self_i = self_start + i * m; + const scalar_t * self_j = self_start + j * m; + scalar_t * res = res_start + k; + const scalar_t * const res_end = res_start + end; + + while (res != res_end) { + *res = F::finish(vec::map2_reduce_all( + [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); }, + F::red, self_i, self_j, m), p); + + res += 1; + self_j += m; + if (self_j == self_end) { + self_i += m; + self_j = self_i + m; + } + } + }); + } +``` + + + +# 四、对比分析 + +Scipy利用现有API组合实现,PyTorch则在底层重写cpp算子。 + +# 五、设计思路与实现方案 + +## 命名与参数设计 + +API的设计为paddle.cdist(x, y, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor + +## API实现方案 + +参考`Paddle.cdist`和与`Scipy`中的设计,组合已有API实现功能 + +# 六、测试和验收的考量 + +测试考虑的case如下: + +1. 当`x`、`y` 2D 的 Tensor,并如PyTorch给出合理提示 + + ```python + >>> a = [] + >>> a = torch.tensor(a) + >>> b = torch.nn.functional.pdist(a) + Traceback (most recent call last): + File "", line 1, in + RuntimeError: pdist only supports 2D tensors, got: 1D + >>> b + ``` + + + +2. 结果一致性,和 SciPy 以及 PyTorch 结果的数值的一致性 + +# 七、可行性分析及规划排期 + +有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 + +# 八、影响面 + +为独立新增API,对其他模块没有影响 + +# 名词解释 + +无 + +# 附件及参考资料 + +[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.nn.functional.pdist.html?highlight=pdist#torch.nn.functional.pdist) + +[Scipy文档](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html) \ No newline at end of file From 9d2918be55daeb72b367b116ae266d6827e2c83c Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Wed, 27 Sep 2023 16:42:51 +0800 Subject: [PATCH 06/16] fix typo --- rfcs/{ => APIs}/20230926_api_design_for_pdist.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename rfcs/{ => APIs}/20230926_api_design_for_pdist.md (98%) diff --git a/rfcs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md similarity index 98% rename from rfcs/20230926_api_design_for_pdist.md rename to rfcs/APIs/20230926_api_design_for_pdist.md index 4ffe30875..ee64c3148 100644 --- a/rfcs/20230926_api_design_for_pdist.md +++ b/rfcs/APIs/20230926_api_design_for_pdist.md @@ -364,11 +364,11 @@ Scipy利用现有API组合实现,PyTorch则在底层重写cpp算子。 ## 命名与参数设计 -API的设计为paddle.cdist(x, y, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor +API的设计为paddle.pdist(x, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor ## API实现方案 -参考`Paddle.cdist`和与`Scipy`中的设计,组合已有API实现功能 +参考`Paddle.pdist`和与`Scipy`中的设计,组合已有API实现功能 # 六、测试和验收的考量 From 3b3e350c49fe09b99f96cee32d4e2e46c5f4e2c7 Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 06:39:54 +0800 Subject: [PATCH 07/16] fix --- rfcs/APIs/20230926_api_design_for_pdist.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md index ee64c3148..71b46d1d3 100644 --- a/rfcs/APIs/20230926_api_design_for_pdist.md +++ b/rfcs/APIs/20230926_api_design_for_pdist.md @@ -368,7 +368,7 @@ API的设计为paddle.pdist(x, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Te ## 
API实现方案 -参考`Paddle.pdist`和与`Scipy`中的设计,组合已有API实现功能 +参考`PyTorch`与`Scipy`中的设计,组合已有API实现功能 # 六、测试和验收的考量 From ab440c29329bba7e83f161da48b8a6183110a3de Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 06:58:23 +0800 Subject: [PATCH 08/16] add bitwise_shift rfc --- .../20230927_api_design_for_bitwise_shift.md | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md new file mode 100644 index 000000000..2f1f6911b --- /dev/null +++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md @@ -0,0 +1,186 @@ +# paddle.pdist设计文档 + +| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | +| ------------ | --------------------------------------------------------- | +| 提交作者 | coco | +| 提交时间 | 2023-09-27 | +| 版本号 | V1.0 | +| 依赖飞桨版本 | develop | +| 文件名 | 20230927_api_defign_for_bitwise_shift | + +# 一、概述 + +## 1、相关背景 + +为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 + +## 2、功能目标 + +通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 + +## 3、意义 + +飞桨支持直接对Tensor进行元素粒度的左移右移。 + +# 二、飞桨现状 + +目前paddle缺少相关功能实现。 + +# 三、业内方案调研 + +## PyTorch + +PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` + +介绍为: + +``` +Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. +``` + +## 实现方法 + +从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) + +```python +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) +``` + +具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): + +```python +# TODO(fdrocha): this is currently not being used anywhere, +# pending on moving triton pin past 972b761 +@staticmethod +def bitwise_right_shift(x, y): + return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" +``` + + + +## Numpy + +- Parameters: + + - **x1**:array_like, int + + Input values. + + - **x2**:array_like, int + + Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). + + - **out**:ndarray, None, or tuple of ndarray and None, optional + + A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. + + - **where**:array_like, optional + + This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. + + - **kwargs: + + For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). + +Returns: + +- **out**:ndarray, int + + Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. + + + +相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) + +```cpp +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. 
+ npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +``` + +`npy_rshift`相关调用 + +```cpp +NPY_INPLACE npy_@u@@type@ +npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) +{ + if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { + return a >> b; + } +#if @is_signed@ + else if (a < 0) { + return (npy_@u@@type@)-1; /* preserve the sign bit */ + } +#endif + else { + return 0; + } +} +``` + +# 四、对比分析 + +PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 + +# 五、设计思路与实现方案 + +## 命名与参数设计 + +API的设计为`paddle.bitwise_right_shift(x, y)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int。 + +## API实现方案 + +参考`PyTorch`和与`Numpy`中的设计,组合已有API实现功能 + +# 六、测试和验收的考量 + +测试考虑的case如下: + +1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 + +2. 结果一致性,和 PyTorch、Numpy 结果的数值的一致性 + +# 七、可行性分析及规划排期 + +有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 + +# 八、影响面 + +为独立新增API,对其他模块没有影响 + +# 名词解释 + +无 + +# 附件及参考资料 + +[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift) + +[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift) \ No newline at end of file From e3fe25c69c4dd868f98a3474a72afe70c132130a Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 07:24:06 +0800 Subject: [PATCH 09/16] update --- .../20230927_api_design_for_bitwise_shift.md | 186 ------------------ 1 file changed, 186 deletions(-) delete mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md deleted file mode 100644 index 2f1f6911b..000000000 --- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md +++ /dev/null @@ -1,186 +0,0 @@ -# paddle.pdist设计文档 - -| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | -| ------------ | --------------------------------------------------------- | -| 提交作者 | coco | -| 提交时间 | 2023-09-27 | -| 版本号 | V1.0 | -| 依赖飞桨版本 | develop | -| 文件名 | 20230927_api_defign_for_bitwise_shift | - -# 一、概述 - -## 1、相关背景 - -为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 - -## 2、功能目标 - -通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 - -## 3、意义 - -飞桨支持直接对Tensor进行元素粒度的左移右移。 - -# 二、飞桨现状 - -目前paddle缺少相关功能实现。 - -# 三、业内方案调研 - -## PyTorch - -PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` - -介绍为: - -``` -Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. -``` - -## 实现方法 - -从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) - -```python -shift_right_arithmetic = _make_elementwise_binary_prim( - "shift_right_arithmetic", - impl_aten=torch.bitwise_right_shift, - doc="", - type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, -) -``` - -具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): - -```python -# TODO(fdrocha): this is currently not being used anywhere, -# pending on moving triton pin past 972b761 -@staticmethod -def bitwise_right_shift(x, y): - return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" -``` - - - -## Numpy - -- Parameters: - - - **x1**:array_like, int - - Input values. - - - **x2**:array_like, int - - Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). - - - **out**:ndarray, None, or tuple of ndarray and None, optional - - A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. - - - **where**:array_like, optional - - This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. - - - **kwargs: - - For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). - -Returns: - -- **out**:ndarray, int - - Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. - - - -相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) - -```cpp -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. 
- npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -#else - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - @type@ in2 = *(@type@ *)ip2; - *(@type@ *)op1 = npy_rshift@c@(in1, in2); - } -#endif -} -``` - -`npy_rshift`相关调用 - -```cpp -NPY_INPLACE npy_@u@@type@ -npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) -{ - if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { - return a >> b; - } -#if @is_signed@ - else if (a < 0) { - return (npy_@u@@type@)-1; /* preserve the sign bit */ - } -#endif - else { - return 0; - } -} -``` - -# 四、对比分析 - -PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 - -# 五、设计思路与实现方案 - -## 命名与参数设计 - -API的设计为`paddle.bitwise_right_shift(x, y)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int。 - -## API实现方案 - -参考`PyTorch`和与`Numpy`中的设计,组合已有API实现功能 - -# 六、测试和验收的考量 - -测试考虑的case如下: - -1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 - -2. 结果一致性,和 PyTorch、Numpy 结果的数值的一致性 - -# 七、可行性分析及规划排期 - -有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 - -# 八、影响面 - -为独立新增API,对其他模块没有影响 - -# 名词解释 - -无 - -# 附件及参考资料 - -[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift) - -[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift) \ No newline at end of file From 68d27cf599fae8f826bd1efd76fc2e23b545388f Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 07:25:16 +0800 Subject: [PATCH 10/16] add bitwise rfc --- .../20230927_api_design_for_bitwise_shift.md | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md new file mode 100644 index 000000000..2f1f6911b --- /dev/null +++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md @@ -0,0 +1,186 @@ +# paddle.pdist设计文档 + +| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | +| ------------ | --------------------------------------------------------- | +| 提交作者 | coco | +| 提交时间 | 2023-09-27 | +| 版本号 | V1.0 | +| 依赖飞桨版本 | develop | +| 文件名 | 20230927_api_defign_for_bitwise_shift | + +# 一、概述 + +## 1、相关背景 + +为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 + +## 2、功能目标 + +通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 + +## 3、意义 + +飞桨支持直接对Tensor进行元素粒度的左移右移。 + +# 二、飞桨现状 + +目前paddle缺少相关功能实现。 + +# 三、业内方案调研 + +## PyTorch + +PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` + +介绍为: + +``` +Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. +``` + +## 实现方法 + +从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) + +```python +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) +``` + +具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): + +```python +# TODO(fdrocha): this is currently not being used anywhere, +# pending on moving triton pin past 972b761 +@staticmethod +def bitwise_right_shift(x, y): + return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" +``` + + + +## Numpy + +- Parameters: + + - **x1**:array_like, int + + Input values. + + - **x2**:array_like, int + + Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). + + - **out**:ndarray, None, or tuple of ndarray and None, optional + + A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. + + - **where**:array_like, optional + + This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. + + - **kwargs: + + For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). + +Returns: + +- **out**:ndarray, int + + Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. + + + +相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) + +```cpp +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. 
+ npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +``` + +`npy_rshift`相关调用 + +```cpp +NPY_INPLACE npy_@u@@type@ +npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) +{ + if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { + return a >> b; + } +#if @is_signed@ + else if (a < 0) { + return (npy_@u@@type@)-1; /* preserve the sign bit */ + } +#endif + else { + return 0; + } +} +``` + +# 四、对比分析 + +PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 + +# 五、设计思路与实现方案 + +## 命名与参数设计 + +API的设计为`paddle.bitwise_right_shift(x, y)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int。 + +## API实现方案 + +参考`PyTorch`和与`Numpy`中的设计,组合已有API实现功能 + +# 六、测试和验收的考量 + +测试考虑的case如下: + +1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 + +2. 结果一致性,和 PyTorch、Numpy 结果的数值的一致性 + +# 七、可行性分析及规划排期 + +有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 + +# 八、影响面 + +为独立新增API,对其他模块没有影响 + +# 名词解释 + +无 + +# 附件及参考资料 + +[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift) + +[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift) \ No newline at end of file From 7aff2ebd0a96323128886edc4a46b13c90278d9c Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 07:29:15 +0800 Subject: [PATCH 11/16] update --- rfcs/APIs/20230926_api_design_for_pdist.md | 409 --------------------- 1 file changed, 409 deletions(-) delete mode 100644 rfcs/APIs/20230926_api_design_for_pdist.md diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md deleted file mode 100644 index 71b46d1d3..000000000 --- a/rfcs/APIs/20230926_api_design_for_pdist.md +++ /dev/null @@ -1,409 +0,0 @@ -# paddle.pdist设计文档 - -| API 名称 | paddle.pdist | -| ------------ | ----------------------------- | -| 提交作者 | coco | -| 提交时间 | 2023-09-26 | -| 版本号 | V1.0 | -| 依赖飞桨版本 | develop | -| 文件名 | 20230926_api_defign_for_pdist | - -# 一、概述 - -## 1、相关背景 - -为paddle新增该API,为计算N个向量两两之间的p-norm距离。 - -## 2、功能目标 - -一个矩阵`A`的大小为`MxN`,那么`B=pdist(A)`得到的矩阵B的大小为1行`M*(M-1)/2`列,表示的意义是M行数据,每两行计算一下p-norm距离,默认欧式距离。例如a = [[0.0, 1.0],[2.0,3.0],[4.0,5.0],[6.0,7.0]],输出为[2.8284, 5.6569, 8.4853, 2.8284, 5.6569, 2.8284]。输出顺序为distance(第一行,第二行), distance(第一行,第三行), ... distance(第二行,第三行)... - -## 3、意义 - -飞桨支持直接两两计算向量间的距离。 - -# 二、飞桨现状 - -目前paddle缺少相关功能实现。 - -# 三、业内方案调研 - -## Scipy - -Scipy中有API`scipy.spatial.distance.pdist` - -在Scipy中介绍为: - -``` -Pairwise distances between observations in n-dimensional space. 
-``` - -## 实现方法 - -从实现方法上,Scipy是通过py实现的,[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L2195-L2233) - -```python - X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True, - check_finite=False) - - s = X.shape - if len(s) != 2: - raise ValueError('A 2-dimensional array must be passed.') - - m, n = s - - if callable(metric): - mstr = getattr(metric, '__name__', 'UnknownCustomMetric') - metric_info = _METRIC_ALIAS.get(mstr, None) - - if metric_info is not None: - X, typ, kwargs = _validate_pdist_input( - X, m, n, metric_info, **kwargs) - - return _pdist_callable(X, metric=metric, out=out, **kwargs) - elif isinstance(metric, str): - mstr = metric.lower() - metric_info = _METRIC_ALIAS.get(mstr, None) - - if metric_info is not None: - pdist_fn = metric_info.pdist_func - _extra_windows_error_checks(X, out, (m * (m - 1) / 2,), **kwargs) - return pdist_fn(X, out=out, **kwargs) - elif mstr.startswith("test_"): - metric_info = _TEST_METRICS.get(mstr, None) - if metric_info is None: - raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}') - X, typ, kwargs = _validate_pdist_input( - X, m, n, metric_info, **kwargs) - return _pdist_callable( - X, metric=metric_info.dist_func, out=out, **kwargs) - else: - raise ValueError('Unknown Distance Metric: %s' % mstr) - else: - raise TypeError('2nd argument metric must be a string identifier ' - 'or a function.') -``` - -先找到`mertric`对应的函数,然后call调用,例如`metric`为`euclidean`时,调用`euclidean`的函数。[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L1781C1-L1787C7) - - - -```python - MetricInfo( - canonical_name='euclidean', - aka={'euclidean', 'euclid', 'eu', 'e'}, - dist_func=euclidean, - cdist_func=_distance_pybind.cdist_euclidean, - pdist_func=_distance_pybind.pdist_euclidean, - ), -``` - -[euclidean调用minkowski](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L500-L536)和[minkowski实现](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L429-L497) - -```python -def euclidean(u, v, w=None): - return minkowski(u, v, p=2, w=w) - - -def minkowski(u, v, p=2, w=None): - u = _validate_vector(u) - v = _validate_vector(v) - if p <= 0: - raise ValueError("p must be greater than 0") - u_v = u - v - if w is not None: - w = _validate_weights(w) - if p == 1: - root_w = w - elif p == 2: - # better precision and speed - root_w = np.sqrt(w) - elif p == np.inf: - root_w = (w != 0) - else: - root_w = np.power(w, 1/p) - u_v = root_w * u_v - dist = norm(u_v, ord=p) - return dist -``` - -主要是调用`norm`实现计算 - -```python -def norm(x, ord=None, axis=None): - if not issparse(x): - raise TypeError("input is not sparse. use numpy.linalg.norm") - - # Check the default case first and handle it immediately. - if axis is None and ord in (None, 'fro', 'f'): - return _sparse_frobenius_norm(x) - - # Some norms require functions that are not implemented for all types. 
- x = x.tocsr() - - if axis is None: - axis = (0, 1) - elif not isinstance(axis, tuple): - msg = "'axis' must be None, an integer or a tuple of integers" - try: - int_axis = int(axis) - except TypeError as e: - raise TypeError(msg) from e - if axis != int_axis: - raise TypeError(msg) - axis = (int_axis,) - - nd = 2 - if len(axis) == 2: - row_axis, col_axis = axis - if not (-nd <= row_axis < nd and -nd <= col_axis < nd): - raise ValueError('Invalid axis %r for an array with shape %r' % - (axis, x.shape)) - if row_axis % nd == col_axis % nd: - raise ValueError('Duplicate axes given.') - if ord == 2: - # Only solver="lobpcg" supports all numpy dtypes - _, s, _ = svds(x, k=1, solver="lobpcg") - return s[0] - elif ord == -2: - raise NotImplementedError - #return _multi_svd_norm(x, row_axis, col_axis, amin) - elif ord == 1: - return abs(x).sum(axis=row_axis).max(axis=col_axis)[0,0] - elif ord == np.inf: - return abs(x).sum(axis=col_axis).max(axis=row_axis)[0,0] - elif ord == -1: - return abs(x).sum(axis=row_axis).min(axis=col_axis)[0,0] - elif ord == -np.inf: - return abs(x).sum(axis=col_axis).min(axis=row_axis)[0,0] - elif ord in (None, 'f', 'fro'): - # The axis order does not matter for this norm. - return _sparse_frobenius_norm(x) - else: - raise ValueError("Invalid norm order for matrices.") - elif len(axis) == 1: - a, = axis - if not (-nd <= a < nd): - raise ValueError('Invalid axis %r for an array with shape %r' % - (axis, x.shape)) - if ord == np.inf: - M = abs(x).max(axis=a) - elif ord == -np.inf: - M = abs(x).min(axis=a) - elif ord == 0: - # Zero norm - M = (x != 0).sum(axis=a) - elif ord == 1: - # special case for speedup - M = abs(x).sum(axis=a) - elif ord in (2, None): - M = sqrt(abs(x).power(2).sum(axis=a)) - else: - try: - ord + 1 - except TypeError as e: - raise ValueError('Invalid norm order for vectors.') from e - M = np.power(abs(x).power(ord).sum(axis=a), 1 / ord) - if hasattr(M, 'toarray'): - return M.toarray().ravel() - elif hasattr(M, 'A'): - return M.A.ravel() - else: - return M.ravel() - else: - raise ValueError("Improper number of dimensions to norm.") -``` - - - - - - - - - -## PyTorch - -Parameters: - -- **input** – input tensor of shape N×M. -- **p** – p value for the p-norm distance to calculate between each vector pair ∈[0,∞]∈[0,∞]. - -并且有相关描述: - -This function is equivalent to `scipy.spatial.distance.pdist(input, 'minkowski', p=p)` if p∈(0,∞). When p=0 it is equivalent to `scipy.spatial.distance.pdist(input, 'hamming') * M`. When p=∞, the closest scipy function is `scipy.spatial.distance.pdist(xn, lambda x, y: np.abs(x - y).max())`. 
- - - -相关[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L58-L64) - -```cpp -Tensor pdist(const Tensor& self, const double p) { - TORCH_CHECK(self.dim() == 2, - "pdist only supports 2D tensors, got: ", self.dim(), "D"); - TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); - TORCH_CHECK(p >= 0, "pdist only supports non-negative p values"); - return at::_pdist_forward(self.contiguous(), p); -} -``` - -调用`_pdist_forward`,[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L244-L262) - -```cpp -Tensor _pdist_forward(const Tensor& self, const double p) { - TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); - auto device = self.device().type(); - TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); - Tensor result = at::empty({0}, self.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT); - if (self.size(0) <= 1) { - result.resize_({0}); - } else { - int64_t n = self.size(0); - int64_t c = n * (n - 1) / 2; - result.resize_({c}); - if (self.size(1) == 0) { - result.fill_(0); - } else { - pdist_forward_stub(device, result, self, p); - } - } - return result; -} -``` - -主要调用`pdist_forward_stub`,绑定了具体的`pdist_forward_kernel_impl` - -```cpp -REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl); -``` - -([CPU](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L446)和[CUDA](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cuda/DistanceKernel.cu#L360)实现绑定了同一个`pdist_forward_kernel_impl`) - -而后`pdist_forward_kernel_impl`的[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L419C1-L423C2) - -```cpp -void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) { - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist", [&] { - Dist::apply_pdist(result, self, p); - }); -} -``` - -调用`apply_pdist`,[代码位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L190-L202) - -```cpp - // Assumes self is nonempty, contiguous, and 2D - static void apply_pdist(Tensor& result, const Tensor& self, const scalar_t p) { - if (p == 0.0) { - run_parallel_pdist>(result, self, p); - } else if (p == 1.0) { - run_parallel_pdist>(result, self, p); - } else if (p == 2.0) { - run_parallel_pdist>(result, self, p); - } else if (std::isinf(p)) { - run_parallel_pdist>(result, self, p); - } else { - run_parallel_pdist>(result, self, p); - } - } -``` - -`run_parallel_pdist`具体实现 - -```cpp - template - static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) { - const scalar_t * const self_start = self.data_ptr(); - const scalar_t * const self_end = self_start + self.numel(); - int64_t n = self.size(0); - int64_t m = self.size(1); - - scalar_t * const res_start = result.data_ptr(); - int64_t combs = result.numel(); // n * (n - 1) / 2 - - // We conceptually iterate over tuples of (i, j, k) where i is the first - // vector from the input, j is the second, and k is the result index. This - // parallelizes over the range of k and infers what i and j are from the - // value of k. 
- parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) { - const Vec pvec(p); - double n2 = n - .5; - // The -1 accounts for floating point truncation issues - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); - int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; - - const scalar_t * self_i = self_start + i * m; - const scalar_t * self_j = self_start + j * m; - scalar_t * res = res_start + k; - const scalar_t * const res_end = res_start + end; - - while (res != res_end) { - *res = F::finish(vec::map2_reduce_all( - [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); }, - F::red, self_i, self_j, m), p); - - res += 1; - self_j += m; - if (self_j == self_end) { - self_i += m; - self_j = self_i + m; - } - } - }); - } -``` - - - -# 四、对比分析 - -Scipy利用现有API组合实现,PyTorch则在底层重写cpp算子。 - -# 五、设计思路与实现方案 - -## 命名与参数设计 - -API的设计为paddle.pdist(x, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor - -## API实现方案 - -参考`PyTorch`与`Scipy`中的设计,组合已有API实现功能 - -# 六、测试和验收的考量 - -测试考虑的case如下: - -1. 当`x`、`y` 2D 的 Tensor,并如PyTorch给出合理提示 - - ```python - >>> a = [] - >>> a = torch.tensor(a) - >>> b = torch.nn.functional.pdist(a) - Traceback (most recent call last): - File "", line 1, in - RuntimeError: pdist only supports 2D tensors, got: 1D - >>> b - ``` - - - -2. 结果一致性,和 SciPy 以及 PyTorch 结果的数值的一致性 - -# 七、可行性分析及规划排期 - -有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 - -# 八、影响面 - -为独立新增API,对其他模块没有影响 - -# 名词解释 - -无 - -# 附件及参考资料 - -[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.nn.functional.pdist.html?highlight=pdist#torch.nn.functional.pdist) - -[Scipy文档](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html) \ No newline at end of file From 703a1db361228e0fcc793a083da7640ef94eeb11 Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 21:05:31 +0800 Subject: [PATCH 12/16] update jax shift doc --- .../20230927_api_design_for_bitwise_shift.md | 86 ++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md index 2f1f6911b..376a96fb5 100644 --- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md +++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md @@ -40,7 +40,7 @@ Computes the right arithmetic shift of input by other bits. The input tensor mus ## 实现方法 -从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) +从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1151) ```python shift_right_arithmetic = _make_elementwise_binary_prim( @@ -49,6 +49,8 @@ shift_right_arithmetic = _make_elementwise_binary_prim( doc="", type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, ) + +shift_right_logical = _not_impl # 可见pytorch中仅支持算数位移 ``` 具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): @@ -145,19 +147,97 @@ npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) } ``` + + +## Jax + +算数位移 + +- jax.lax.**shift_right_arithmetic**(*x*, *y*)[[source\]](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_arithmetic) + + Elementwise arithmetic right shift: x ≫ y. 
+
+  Parameters
+
+  - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+  - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+
+  Return type: [`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array)
+
+具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1748-L1760)
+
+```python
+# Note: Bitwise operations only yield identical results on unsigned integers!
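+# (tf.bitwise.right_shift 对有符号类型执行算术位移、对无符号类型执行逻辑位移,
+#  因此下面需要按目标语义先做 dtype 转换。)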
+# pylint: disable=protected-access
+def _shift_right_arithmetic_raw(x, y):
+  if x.dtype.is_unsigned:
+    assert x.dtype == y.dtype
+    orig_dtype = x.dtype
+    signed_dtype = _UNSIGNED_TO_SIGNED_TABLE[orig_dtype]
+    x = tf.cast(x, signed_dtype)
+    y = tf.cast(y, signed_dtype)
+    res = tf.bitwise.right_shift(x, y)
+    return tf.cast(res, orig_dtype)
+  else:
+    return tf.bitwise.right_shift(x, y)
+```
+
+> 在算术位移过程中,如果是有符号数,则直接进行算术位移;如果是无符号数,则先转换成对应的有符号数,再进行位移(因为算术右移时常常需要补符号位,只有有符号数才能保证结果正确性),最后转回无符号数。
+
+
+
+逻辑位移
+
++ jax.lax.**shift_right_logical**(*x*, *y*) [source](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_logical)
+
+  Elementwise logical right shift: x ≫ y.
+
+  Parameters
+
+  - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+  - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+
+  Return type[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array)
+
+
+
+具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1776-L1786)
+
+```python
+def _shift_right_logical_raw(x, y):
+  if x.dtype.is_unsigned:
+    return tf.bitwise.right_shift(x, y)
+  else:
+    assert x.dtype == y.dtype
+    orig_dtype = x.dtype
+    unsigned_dtype = _SIGNED_TO_UNSIGNED_TABLE[orig_dtype]
+    x = tf.cast(x, unsigned_dtype)
+    y = tf.cast(y, unsigned_dtype)
+    res = tf.bitwise.right_shift(x, y)
+    return tf.cast(res, orig_dtype)
+```
+
+> 在逻辑位移过程中,如果是无符号数,则直接进行位移(此时算术位移与逻辑位移等价);如果是有符号数,则先转换成无符号数,再进行位移(因为逻辑右移时常常需要补0,只有无符号数才能保证结果正确性),最后转回有符号数。
+
+
+
+
+
# 四、对比分析

PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。

+同时,PyTorch与Numpy中都仅支持算术位移,不支持逻辑位移,而JAX中实现了算术位移和逻辑位移。
+
+
+
# 五、设计思路与实现方案

## 命名与参数设计

API的设计为`paddle.bitwise_right_shift(x, y, is_arithmetic=True)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int;`is_arithmetic` 为bool类型,默认为 `True` 表示算术位移,当其为 `False` 时则为逻辑位移。

## API实现方案

参考`PyTorch`、`Numpy`、`JAX`中的设计,组合已有API实现功能

# 六、测试和验收的考量

From 23a6b8db5d904860c810cee9c270f27b0ee2cd79 Mon Sep 17 00:00:00 2001
From:
coco <1228759711@qq.com> Date: Mon, 2 Oct 2023 00:23:37 +0800 Subject: [PATCH 13/16] add histogramdd rfc --- .../20230927_api_design_for_bitwise_shift.md | 266 ---------------- .../20231001_api_design_for_histogramdd.md | 294 ++++++++++++++++++ 2 files changed, 294 insertions(+), 266 deletions(-) delete mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md create mode 100644 rfcs/APIs/20231001_api_design_for_histogramdd.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md deleted file mode 100644 index 376a96fb5..000000000 --- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md +++ /dev/null @@ -1,266 +0,0 @@ -# paddle.pdist设计文档 - -| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | -| ------------ | --------------------------------------------------------- | -| 提交作者 | coco | -| 提交时间 | 2023-09-27 | -| 版本号 | V1.0 | -| 依赖飞桨版本 | develop | -| 文件名 | 20230927_api_defign_for_bitwise_shift | - -# 一、概述 - -## 1、相关背景 - -为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 - -## 2、功能目标 - -通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 - -## 3、意义 - -飞桨支持直接对Tensor进行元素粒度的左移右移。 - -# 二、飞桨现状 - -目前paddle缺少相关功能实现。 - -# 三、业内方案调研 - -## PyTorch - -PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` - -介绍为: - -``` -Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. -``` - -## 实现方法 - -从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1151) - -```python -shift_right_arithmetic = _make_elementwise_binary_prim( - "shift_right_arithmetic", - impl_aten=torch.bitwise_right_shift, - doc="", - type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, -) - -shift_right_logical = _not_impl # 可见pytorch中仅支持算数位移 -``` - -具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): - -```python -# TODO(fdrocha): this is currently not being used anywhere, -# pending on moving triton pin past 972b761 -@staticmethod -def bitwise_right_shift(x, y): - return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" -``` - - - -## Numpy - -- Parameters: - - - **x1**:array_like, int - - Input values. - - - **x2**:array_like, int - - Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). - - - **out**:ndarray, None, or tuple of ndarray and None, optional - - A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. - - - **where**:array_like, optional - - This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. - - - **kwargs: - - For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). - -Returns: - -- **out**:ndarray, int - - Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. - - - -相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) - -```cpp -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. 
- npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -#else - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - @type@ in2 = *(@type@ *)ip2; - *(@type@ *)op1 = npy_rshift@c@(in1, in2); - } -#endif -} -``` - -`npy_rshift`相关调用 - -```cpp -NPY_INPLACE npy_@u@@type@ -npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) -{ - if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { - return a >> b; - } -#if @is_signed@ - else if (a < 0) { - return (npy_@u@@type@)-1; /* preserve the sign bit */ - } -#endif - else { - return 0; - } -} -``` - - - -## Jax - -算数位移 - -- jax.lax.**shift_right_arithmetic**(*x*, *y*)[[source\]](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_arithmetic) - - Elementwise arithmetic right shift: x ≫ y. - - Parameters - - - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - Return type[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array) - -具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1748-L1760) - -```python -# Note: Bitwise operations only yield identical results on unsigned integers! 
-# pylint: disable=protected-access -def _shift_right_arithmetic_raw(x, y): - if x.dtype.is_unsigned: - assert x.dtype == y.dtype - orig_dtype = x.dtype - signed_dtype = _UNSIGNED_TO_SIGNED_TABLE[orig_dtype] - x = tf.cast(x, signed_dtype) - y = tf.cast(y, signed_dtype) - res = tf.bitwise.right_shift(x, y) - return tf.cast(res, orig_dtype) - else: - return tf.bitwise.right_shift(x, y) -``` - -> 在算术位移过程中,如果是有符号数,则直接进行算数位移;如果是无符号数,则先转换成对应的有符号数,再进行位移(因为算术右移时常常需要补符号位,只有有符号数才能保证结果正确性),最后转回无符号数。 - - - -逻辑位移 - -+ jax.lax.**shift_right_logical**(*x*, *y*) [source](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_logical) - - Elementwise logical right shift: �≫�.Parameters - - - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - Return type[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array) - - - -具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1776-L1786) - -```python -def _shift_right_logical_raw(x, y): - if x.dtype.is_unsigned: - return tf.bitwise.right_shift(x, y) - else: - assert x.dtype == y.dtype - orig_dtype = x.dtype - unsigned_dtype = _SIGNED_TO_UNSIGNED_TABLE[orig_dtype] - x = tf.cast(x, unsigned_dtype) - y = tf.cast(y, unsigned_dtype) - res = tf.bitwise.right_shift(x, y) - return tf.cast(res, orig_dtype) -``` - -> 在逻辑位移过程中,如果是无符号数,则直接进行算数位移;如果是有符号数,则先转换成无符号数,再进行位移(因为逻辑右移时常常需要补0,只有无符号数才能保证结果正确性),最后转回有符号数。 - - - - - -# 四、对比分析 - -PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 - -同时,PyTorch与Numpy中都仅支持算术位移,不支持逻辑位移,而JAX中实现了算术位移和逻辑位移。 - - - -# 五、设计思路与实现方案 - -## 命名与参数设计 - -API的设计为`paddle.bitwise_right_shift(x, y, is_arithmetic=True)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int;`is_arithmetic` 为bool类型,默认为 `True` 表示算术位移,当其为 `False` 时则为逻辑位移。 - -## API实现方案 - -参考`PyTorch`、`Numpy`、`JAX`中的设计,组合已有API实现功能 - -# 六、测试和验收的考量 - -测试考虑的case如下: - -1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 - -2. 
结果一致性,和 PyTorch、Numpy 结果的数值的一致性
-
-# 七、可行性分析及规划排期
-
-有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。
-
-# 八、影响面
-
-为独立新增API,对其他模块没有影响
-
-# 名词解释
-
-无
-
-# 附件及参考资料
-
-[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift)
-
-[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift)
\ No newline at end of file
diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
new file mode 100644
index 000000000..f1b483367
--- /dev/null
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -0,0 +1,294 @@
+# paddle.histogramdd设计文档
+
+| API 名称     | paddle.histogramdd                  |
+| ------------ | ----------------------------------- |
+| 提交作者     | coco                                |
+| 提交时间     | 2023-10-01                          |
+| 版本号       | V1.0                                |
+| 依赖飞桨版本 | develop                             |
+| 文件名       | 20231001_api_design_for_histogramdd |
+
+# 一、概述
+
+## 1、相关背景
+
+为了提升飞桨API丰富度,Paddle需要扩充API,调用路径为:
+
+- paddle.histogramdd
+
+## 2、功能目标
+
+实现多维数据的直方图(histogram)计算
+
+## 3、意义
+
+飞桨支持为多维tensor进行直方图计算
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## PyTorch
+
+PyTorch中有API `torch.histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[])`
+
+## 实现方法
+
+从实现方法上,PyTorch是通过c++实现的,[代码位置](https://github.com/pytorch/pytorch/blob/e4414716d55c14d86298d6434e23d47605532cca/aten/src/ATen/native/cpu/HistogramKernel.cpp#L207-L255)
+
+```cpp
+/* Some pre- and post- processing steps for the main algorithm.
+ * Initializes hist to 0, calls into the main algorithm, and normalizes output if necessary.
+ */
+template<BIN_SELECTION_ALGORITHM algorithm>
+void histogramdd_out_cpu_template(const Tensor& self, const c10::optional<Tensor>& weight, bool density,
+        Tensor& hist, const TensorList& bin_edges) {
+    hist.fill_(0);
+
+    const int64_t N = self.size(-1);
+    const int64_t M = std::accumulate(self.sizes().begin(), self.sizes().end() - 1,
+            (int64_t)1, std::multiplies<int64_t>());
+
+    const Tensor reshaped_input = self.reshape({M, N});
+
+    const auto reshaped_weight = weight.has_value()
+        ? c10::optional<Tensor>(weight.value().reshape({M}))
+        : c10::optional<Tensor>();
+
+    std::vector<Tensor> bin_edges_contig(bin_edges.size());
+    for (const auto dim : c10::irange(bin_edges_contig.size())) {
+        bin_edges_contig[dim] = bin_edges[dim].contiguous();
+    }
+
+    AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, self.scalar_type(), "histogram_cpu", [&]() {
+        histogramdd_cpu_contiguous<scalar_t, algorithm>(
+            hist, bin_edges_contig, reshaped_input, reshaped_weight);
+    });
+
+    /* Divides each bin's value by the total count/weight in all bins,
+     * and by the bin's volume.
+     */
+    if (density) {
+        const auto hist_sum = hist.sum().item<double>();
+        hist.div_(hist_sum);
+
+        /* For each dimension, divides each bin's value
+         * by the bin's length in that dimension.
+         */
+        for (const auto dim : c10::irange(N)) {
+            const auto bin_lengths = bin_edges[dim].diff();
+
+            // Used to reshape bin_lengths to align with the corresponding dimension of hist.
+            std::vector<int64_t> shape(N, 1);
+            shape[dim] = bin_lengths.numel();
+
+            hist.div_(bin_lengths.reshape(shape));
+        }
+    }
+}
+```
+
+
+
+
+
+## TensorFlow
+
+无`histogramdd`实现
+
+## Numpy
+
+numpy.**histogramdd**(*sample*, *bins=10*, *range=None*, *density=None*, *weights=None*)[[source\]](https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/histograms.py#L901-L1072)
+
+Compute the multidimensional histogram of some data.
+
+- Parameters:
+
+  **sample**: (N, D) array, or (N, D) array_like
+
+  The data to be histogrammed. Note the unusual interpretation of sample when an array_like:
+
+  - When an array, each row is a coordinate in a D-dimensional space - such as `histogramdd(np.array([p1, p2, p3]))`.
+  - When an array_like, each element is the list of values for single coordinate - such as `histogramdd((X, Y, Z))`.
+
+  The first form should be preferred.
+
+  **bins**: sequence or int, optional
+
+  The bin specification:
+
+  - A sequence of arrays describing the monotonically increasing bin edges along each dimension.
+  - The number of bins for each dimension (nx, ny, … = bins)
+  - The number of bins for all dimensions (nx = ny = … = bins).
+
+  **range**: sequence, optional
+
+  A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges are not given explicitly in *bins*. An entry of None in the sequence results in the minimum and maximum values being used for the corresponding dimension. The default, None, is equivalent to passing a tuple of D None values.
+
+  **density**: bool, optional
+
+  If False, the default, returns the number of samples in each bin. If True, returns the probability *density* function at the bin, `bin_count / sample_count / bin_volume`.
+
+  **weights**: (N,) array_like, optional
+
+  An array of values *w_i* weighing each sample *(x_i, y_i, z_i, …)*. Weights are normalized to 1 if density is True. If density is False, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin.
+
+- Returns:
+
+  **H**: ndarray
+
+  The multidimensional histogram of sample x. See density and weights for the different possible semantics.
+
+  **edges**: list
+
+  A list of D arrays describing the bin edges for each dimension.
+
+### 实现方法
+
+numpy 的 `histogramdd` 无需底层 c++ kernel,直接在 Python 层实现,[代码位置](https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/histograms.py#L901-L1072)
+
+```python
+@array_function_dispatch(_histogramdd_dispatcher)
+def histogramdd(sample, bins=10, range=None, density=None, weights=None):
+    try:
+        # Sample is an ND-array.
+        N, D = sample.shape
+    except (AttributeError, ValueError):
+        # Sample is a sequence of 1D arrays.
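+        # (注:本行注释为本文补充——对应 histogramdd((X, Y, Z)) 形式的输入,
+        # np.atleast_2d 先堆叠为 (D, N),转置后得到 (N, D)。)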
+        sample = np.atleast_2d(sample).T
+        N, D = sample.shape
+
+    nbin = np.empty(D, np.intp)
+    edges = D*[None]
+    dedges = D*[None]
+    if weights is not None:
+        weights = np.asarray(weights)
+
+    try:
+        M = len(bins)
+        if M != D:
+            raise ValueError(
+                'The dimension of bins must be equal to the dimension of the '
+                ' sample x.')
+    except TypeError:
+        # bins is an integer
+        bins = D*[bins]
+
+    # normalize the range argument
+    if range is None:
+        range = (None,) * D
+    elif len(range) != D:
+        raise ValueError('range argument must have one entry per dimension')
+
+    # Create edge arrays
+    for i in _range(D):
+        if np.ndim(bins[i]) == 0:
+            if bins[i] < 1:
+                raise ValueError(
+                    '`bins[{}]` must be positive, when an integer'.format(i))
+            smin, smax = _get_outer_edges(sample[:,i], range[i])
+            try:
+                n = operator.index(bins[i])
+
+            except TypeError as e:
+                raise TypeError(
+                    "`bins[{}]` must be an integer, when a scalar".format(i)
+                ) from e
+
+            edges[i] = np.linspace(smin, smax, n + 1)
+        elif np.ndim(bins[i]) == 1:
+            edges[i] = np.asarray(bins[i])
+            if np.any(edges[i][:-1] > edges[i][1:]):
+                raise ValueError(
+                    '`bins[{}]` must be monotonically increasing, when an array'
+                    .format(i))
+        else:
+            raise ValueError(
+                '`bins[{}]` must be a scalar or 1d array'.format(i))
+
+        nbin[i] = len(edges[i]) + 1  # includes an outlier on each end
+        dedges[i] = np.diff(edges[i])
+
+    # Compute the bin number each sample falls into.
+    Ncount = tuple(
+        # avoid np.digitize to work around gh-11022
+        np.searchsorted(edges[i], sample[:, i], side='right')
+        for i in _range(D)
+    )
+
+    # Using digitize, values that fall on an edge are put in the right bin.
+    # For the rightmost bin, we want values equal to the right edge to be
+    # counted in the last bin, and not as an outlier.
+    for i in _range(D):
+        # Find which points are on the rightmost edge.
+        on_edge = (sample[:, i] == edges[i][-1])
+        # Shift these points one bin to the left.
+        Ncount[i][on_edge] -= 1
+
+    # Compute the sample indices in the flattened histogram matrix.
+    # This raises an error if the array is too large.
+    xy = np.ravel_multi_index(Ncount, nbin)
+
+    # Compute the number of repetitions in xy and assign it to the
+    # flattened histmat.
+    hist = np.bincount(xy, weights, minlength=nbin.prod())
+
+    # Shape into a proper matrix
+    hist = hist.reshape(nbin)
+
+    # This preserves the (bad) behavior observed in gh-7845, for now.
+    hist = hist.astype(float, casting='safe')
+
+    # Remove outliers (indices 0 and -1 for each dimension).
+    core = D*(slice(1, -1),)
+    hist = hist[core]
+
+    if density:
+        # calculate the probability density function
+        s = hist.sum()
+        for i in _range(D):
+            shape = np.ones(D, int)
+            shape[i] = nbin[i] - 2
+            hist = hist / dedges[i].reshape(shape)
+        hist /= s
+
+    if (hist.shape != nbin - 2).any():
+        raise RuntimeError(
+            "Internal Shape Error")
+    return hist, edges
+```
+
+
+
+# 四、对比分析
+
+PyTorch底层用cpp实现kernel,Numpy通过API在Python层直接实现。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为:
+
+- paddle.histogramdd(sample, bins, range=None, density=None, weights=None, name=None)
+
+其中
+
++ sample(Tensor) - 输入的多维 tensor
++ bins(Tensor[], int[], int) 若为`Tensor[]`,则定义了bin的边缘序列;若为`int[]`,则每个值分别定义了每个维度的等宽bin的数量;若为`int`,则定义了所有维度的等宽bin的数量。
++ range(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
++ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入sample除去最内部维度的shape相同,例如当sample的shape为[M,N]时,weight的shape必须为[M]。
+
+## 底层OP设计
+
+参考PyTorch与Numpy中的设计,通过组合Python API实现功能。
+
+## API实现方案
+
+1. 在 Paddle repo 的 python/paddle/linalg.py 文件中实现Python。
+2. 编写文档
+3. 编写单测
+
+# 六、测试和验收的考量
+
+测试考虑的case如下:
+
++ 验证正确性,与PyTorch中的API计算结果进行对比。
++ 修改参数,对不同参数的功能以及边界值进行测试。
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API,对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.histogramdd.html?highlight=histogramdd#torch.histogramdd)
+
+[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.histogramdd.html#numpy-histogramdd)
\ No newline at end of file

From d3770937b6c3e833ed464e83255d57f8ce54ac75 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Wed, 4 Oct 2023 23:40:47 +0800
Subject: [PATCH 14/16] add some detail

---
 .../20231001_api_design_for_histogramdd.md    | 132 ++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
index f1b483367..05bf2b087 100644
--- a/rfcs/APIs/20231001_api_design_for_histogramdd.md
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -90,6 +90,138 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional
}
```

+
+
+```cpp
+template<typename input_t, BIN_SELECTION_ALGORITHM algorithm>
+void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges,
+    const Tensor& input, const c10::optional<Tensor>& weight) {
+  TORCH_INTERNAL_ASSERT(input.dim() == 2);
+
+  const int64_t N = input.size(0);
+  if (weight.has_value()) {
+    TORCH_INTERNAL_ASSERT(weight.value().dim() == 1 && weight.value().numel() == N);
+  }
+
+  const int64_t D = input.size(1);
+  TORCH_INTERNAL_ASSERT(int64_t(bin_edges.size()) == D);
+  for (const auto dim : c10::irange(D)) {
+    TORCH_INTERNAL_ASSERT(bin_edges[dim].is_contiguous());
+    TORCH_INTERNAL_ASSERT(hist.size(dim) + 1 == bin_edges[dim].numel());
+  }
+
+  if (D == 0) {
+    // hist is an empty tensor in this case; nothing to do here
+    return;
+  }
+
+  TensorAccessor<const input_t, 2> accessor_in = input.accessor<const input_t, 2>();
+
+  /* Constructs a c10::optional containing an accessor iff
+   * the optional weight tensor has a value.
+   */
+  const auto accessor_wt = weight.has_value()
+      ? c10::optional<TensorAccessor<const input_t, 1>>(weight.value().accessor<const input_t, 1>())
+      : c10::optional<TensorAccessor<const input_t, 1>>();
+
+  std::vector<input_t*> bin_seq(D);
+  std::vector<int64_t> num_bin_edges(D);
+  std::vector<input_t> leftmost_edge(D), rightmost_edge(D);
+
+  for (const auto dim : c10::irange(D)) {
+    bin_seq[dim] = bin_edges[dim].data_ptr<input_t>();
+    num_bin_edges[dim] = bin_edges[dim].numel();
+    leftmost_edge[dim] = bin_seq[dim][0];
+    rightmost_edge[dim] = bin_seq[dim][num_bin_edges[dim] - 1];
+  }
+
+  int64_t GRAIN_SIZE = std::max(int64_t(1), HISTOGRAM_GRAIN_SIZE / D);
+
+  /* Parallelizes processing of input using at::parallel_for.
+   * Each thread accumulates a local result into their own slice of
+   * thread_histograms which get summed together at the end.
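+   * (注:本行为本文补充说明——各线程只写入自己的局部直方图切片,避免对共享输出
+   * 的原子操作;所有线程结束后再由 at::sum_out 沿第 0 维求和归并。)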
+   */
+  const auto num_threads = at::get_num_threads();
+  const auto hist_sizes = hist.sizes();
+  DimVector thread_hist_sizes(hist_sizes.size() + 1);
+  thread_hist_sizes[0] = num_threads;
+  std::copy(hist_sizes.begin(), hist_sizes.end(),
+      thread_hist_sizes.begin() + 1);
+  Tensor thread_histograms = at::zeros(thread_hist_sizes, hist.dtype());
+  TORCH_INTERNAL_ASSERT(thread_histograms.is_contiguous());
+
+  at::parallel_for(0, N, GRAIN_SIZE, [&](int64_t start, int64_t end) {
+    const auto tid = at::get_thread_num();
+    auto hist_strides = thread_histograms.strides();
+    input_t *hist_local_data = thread_histograms.data_ptr<input_t>();
+
+    // View only this thread's local results
+    hist_local_data += hist_strides[0] * tid;
+    hist_strides = hist_strides.slice(1);
+
+    for (const auto i : c10::irange(start, end)) {
+      bool skip_elt = false;
+      int64_t hist_index = 0;
+
+      for (const auto dim : c10::irange(D)) {
+        const input_t elt = accessor_in[i][dim];
+
+        // Skips elements which fall outside the specified bins and NaN elements
+        if (!(elt >= leftmost_edge[dim] && elt <= rightmost_edge[dim])) {
+          skip_elt = true;
+          break;
+        }
+
+        int64_t pos = -1;
+
+        if (algorithm == BINARY_SEARCH) {
+          // Handles the general case via binary search on the bin edges.
+          pos = std::upper_bound(bin_seq[dim], bin_seq[dim] + num_bin_edges[dim], elt)
+              - bin_seq[dim] - 1;
+        } else if (algorithm == LINEAR_INTERPOLATION
+            || algorithm == LINEAR_INTERPOLATION_WITH_LOCAL_SEARCH) {
+          /* When bin_edges is known to be a linear progression, maps elt to
+           * the appropriate bin via simple division.
+           */
+          pos = static_cast<int64_t>((elt - leftmost_edge[dim])
+              * (num_bin_edges[dim] - 1)
+              / (rightmost_edge[dim] - leftmost_edge[dim]));
+
+          /* Ensures consistency with bin_edges by checking the bins to the left and right
+           * of the selected position. Necessary for cases in which an element very close
+           * to a bin edge may be misclassified by simple division.
+           */
+          if (algorithm == LINEAR_INTERPOLATION_WITH_LOCAL_SEARCH) {
+            int64_t pos_min = std::max(static_cast<int64_t>(0), pos - 1);
+            int64_t pos_max = std::min(pos + 2, num_bin_edges[dim]);
+            pos = std::upper_bound(bin_seq[dim] + pos_min, bin_seq[dim] + pos_max, elt)
+                - bin_seq[dim] - 1;
+          }
+        } else {
+          TORCH_INTERNAL_ASSERT(false);
+        }
+
+        // Unlike other bins, the rightmost bin includes its right boundary
+        if (pos == (num_bin_edges[dim] - 1)) {
+          pos -= 1;
+        }
+
+        hist_index += hist_strides[dim] * pos;
+      }
+
+      if (!skip_elt) {
+        // In the unweighted case, the default weight is 1
+        input_t wt = accessor_wt.has_value() ? accessor_wt.value()[i] : static_cast<input_t>(1);
+
+        hist_local_data[hist_index] += wt;
+      }
+    }
+  });
+
+  at::sum_out(hist, thread_histograms, /*dim=*/{0});
+}
+```

From 1e2562f3acd4acd360f0e69d696b7a14aa6b8c83 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Sat, 7 Oct 2023 18:49:49 +0800
Subject: [PATCH 15/16] add test path, fix args desc

---
 rfcs/APIs/20231001_api_design_for_histogramdd.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
index 05bf2b087..29507448e 100644
--- a/rfcs/APIs/20231001_api_design_for_histogramdd.md
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -381,14 +381,16 @@ PyTorch底层用cpp实现kernel,Numpy通过API在Python层直接实现。

API的设计为:

-- paddle.histogramdd(sample, bins, range=None, density=None, weights=None, name=None)
+- paddle.histogramdd(sample, bins, range=None, density=False, weights=None, name=None)

其中

+ sample(Tensor) - 输入的多维 tensor
+ bins(Tensor[], int[], int) 若为`Tensor[]`,则定义了bin的边缘序列;若为`int[]`,则每个值分别定义了每个维度的等宽bin的数量;若为`int`,则定义了所有维度的等宽bin的数量。
+ range(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
++ density (bool) – 默认为 False,结果将包含每个bin中的计数。如果设置为 True,则每个计数(权重)将除以总计数,然后除以其所在bin的范围宽度。
+ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入sample除去最内部维度的shape相同,例如当sample的shape为[M,N]时,weight的shape必须为[M]。
++ name(str, 可选)- 操作的名称(默认值为None)。

## 底层OP设计
@@ -402,6 +404,8 @@ API的设计为:

# 六、测试和验收的考量

+单测代码位置:Paddle repo 的 test/legacy_test/test_histogramdd_op 目录
+
测试考虑的case如下:

From 04ff24163b4bef0a13e4e24a5f6f6b79fda5c654 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Mon, 20 Nov 2023 16:21:24 +0800
Subject: [PATCH 16/16] modify params in api: sample to x, range to ranges.

---
 rfcs/APIs/20231001_api_design_for_histogramdd.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
index 29507448e..52775ab12 100644
--- a/rfcs/APIs/20231001_api_design_for_histogramdd.md
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -381,13 +381,13 @@ PyTorch底层用cpp实现kernel,Numpy通过API在Python层直接实现。

API的设计为:

-- paddle.histogramdd(sample, bins, range=None, density=False, weights=None, name=None)
+- paddle.histogramdd(x, bins, ranges=None, density=False, weights=None, name=None)

其中

-+ sample(Tensor) - 输入的多维 tensor
++ x(Tensor) - 输入的多维 tensor
+ bins(Tensor[], int[], int) 若为`Tensor[]`,则定义了bin的边缘序列;若为`int[]`,则每个值分别定义了每个维度的等宽bin的数量;若为`int`,则定义了所有维度的等宽bin的数量。
-+ range(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
++ ranges(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
+ density (bool) – 默认为 False,结果将包含每个bin中的计数。如果设置为 True,则每个计数(权重)将除以总计数,然后除以其所在bin的范围宽度。
-+ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入sample除去最内部维度的shape相同,例如当sample的shape为[M,N]时,weight的shape必须为[M]。
++ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入x除去最内部维度的shape相同,例如当x的shape为[M,N]时,weight的shape必须为[M]。
+ name(str, 可选)- 操作的名称(默认值为None)。
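+
+作为补充,下面给出一个基于上述参数设计的调用示意(注:本文补充的假设性示例,返回值形式参照 PyTorch 的 (hist, bin_edges) 约定,具体行为以最终实现为准):
+
+```python
+import paddle
+
+# 3 个二维样本点,统计到 2x2 的直方图网格
+x = paddle.to_tensor([[0., 1.], [1., 0.], [1., 1.]])
+
+# ranges 按维度依次给出 (low, high):[low_0, high_0, low_1, high_1]
+hist, edges = paddle.histogramdd(x, bins=[2, 2], ranges=[0., 2., 0., 2.])
+
+# 预期:hist 的 shape 为 [2, 2];edges 包含 2 个一维 Tensor,
+# 分别为对应维度的 3 个 bin 边界(等宽划分 [0, 1, 2])
+```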