From 7d2e856ec0d1af7fb5b0cbe561574cab7ebb6db9 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Thu, 14 Sep 2023 02:52:59 +0800
Subject: [PATCH 01/16] add copysign rfc

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 257 ++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 rfcs/APIs/20230914_api_design_for_copysign.md

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
new file mode 100644
index 000000000..d320f5efc
--- /dev/null
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -0,0 +1,257 @@
+# paddle.copysign 设计文档
+
+| API 名称     | paddle.copysign                  |
+| ------------ | -------------------------------- |
+| 提交作者     | coco                             |
+| 提交时间     | 2023-09-14                       |
+| 版本号       | V1.0                             |
+| 依赖飞桨版本 | develop                          |
+| 文件名       | 20230914_api_design_for_copysign |
+
+# 一、概述
+
+## 1、相关背景
+
+为了提升飞桨API丰富度,Paddle需要扩充API,调用路径为:
+
+- paddle.copysign 作为独立的函数调用,非 inplace;
+- paddle.copysign_,作为独立的函数,inplace 地修改输入;
+- Tensor.copysign 作为 Tensor 的方法使用,非 inplace;
+- Tensor.copysign_ 作为 Tensor 的方法使用,inplace 修改输入。
+
+## 2、功能目标
+
+根据两个输入逐元素地计算结果张量,其结果由第一个输入的绝对值大小及第二个输入的符号组成。
+
+## 3、意义
+
+飞桨支持直接通过张量进行批量的正负符号复制。
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## PyTorch
+
+PyTorch中有API `torch.copysign(input, other, *, out=None) → Tensor`,以及对应的 `torch.Tensor.copysign`。
+
+在PyTorch中介绍为:
+
+```
+Create a new floating-point tensor with the magnitude of input and the sign of other, elementwise.
+
+Supports broadcasting to a common shape, and integer and float inputs.
+```
+
+## 实现方法
+
+从实现方法上,PyTorch是通过 C++ 实现的,[CPU kernel代码位置](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L1148-L1158)
+
+```cpp
+void copysign_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
+    cpu_kernel_vec(iter,
+      [](scalar_t a, scalar_t b) -> scalar_t {
+        return c10::copysign(a, b);
+      },
+      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
+        return a.copysign(b);
+      });
+  });
+}
+```
+
+在 c10 namespace 中,[代码位置](https://github.com/pytorch/pytorch/blob/main/c10/util/copysign.h#L12-L15):
+
+```cpp
+namespace c10 {
+
+// Note: Explicit implementation of copysign for Half and BFloat16
+// is needed to workaround g++-7/8 crash on aarch64, but also makes
+// copysign faster for the half-precision types
+template <typename T, typename U>
+inline auto copysign(const T& a, const U& b) {
+  return std::copysign(a, b);
+}
+...
+} // namespace c10
+```
+
+[cuda kernel代码位置](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/CopysignKernel.cu#L23-L29)
+
+```cpp
+namespace at::native {
+
+void copysign_kernel_cuda(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
+      return c10::cuda::compat::copysign(a, b);
+    });
+  });
+}
+
+REGISTER_DISPATCH(copysign_stub, &copysign_kernel_cuda);
+
+} // namespace at::native
+```
+
+namespace 中的 `copysign` 调用,[代码位置](https://github.com/pytorch/pytorch/blob/main/c10/cuda/CUDAMathCompat.h#L46-L65)
+
+```cpp
+__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+  return ::copysignf(x, y);
+#else
+  // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
+  // (e.g. Jetson), see PyTorch PR #51834
+  // This host function needs to be here for the compiler but is never used
+  TORCH_INTERNAL_ASSERT(
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+  return ::copysign(x, y);
+#else
+  // see above
+  TORCH_INTERNAL_ASSERT(
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+```
+
+以上各实现最终都由底层 C++ 调用 copysign 函数完成。
+
+**反向backward:**
+
+算子配置[代码位置](https://github.com/pytorch/pytorch/blob/main/tools/autograd/derivatives.yaml#L474-L481C28)
+
+```yaml
+- name: copysign.Tensor(Tensor self, Tensor other) -> Tensor
+  self: copysign_tensor_self_backward(grad, self, result)
+  other: zeros_like(other)
+  result: copysign_tensor_self_backward(self_t, self_p, result)
+
+- name: copysign.Scalar(Tensor self, Scalar other) -> Tensor
+  self: copysign_tensor_self_backward(grad, self, result)
+  result: auto_element_wise
+```
+
+backward 反向[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/csrc/autograd/FunctionsManual.cpp#L94-L101)
+
+```cpp
+Tensor copysign_tensor_self_backward(
+    const Tensor& grad,
+    const Tensor& self,
+    const Tensor& result) {
+  auto ratio = result / self;
+  ratio.masked_fill_(self == 0, 0);
+  return grad * ratio;
+}
+```
+
+## TensorFlow
+
+无`copysign`实现
+
+## Numpy
+
+numpy.**copysign**(*x1*, *x2*, */*, *out=None*, ***, *where=True*, *casting='same_kind'*, *order='K'*, *dtype=None*, *subok=True*[, *signature*, *extobj*]) *= <ufunc 'copysign'>*
+
+Change the sign of x1 to that of x2, element-wise. If *x2* is a scalar, its sign will be copied to all elements of *x1*.
+
+### 实现方法
+
+先通过模板生成各类型的函数,底层由 C++ 调用实现,[代码位置](https://github.com/numpy/numpy/blob/main/numpy/core/src/umath/loops.c.src#L1213-L1221)
+
+```
+NPY_NO_EXPORT void
+@TYPE@_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *((@type@ *)op1)= npy_copysign@c@(in1, in2);
+    }
+}
+```
+
+实际调用 C++ 的 math 库,[代码位置](https://github.com/numpy/numpy/blob/main/numpy/core/include/numpy/npy_math.h#L199)
+
+```cpp
+#include <math.h>
+
+...
+#define npy_copysign copysign
+...
+```
+
+# 四、对比分析
+
+PyTorch和Numpy实现方式基本一致,都是在底层调用 C++ math 库实现 `copysign`;不同的是,PyTorch 还支持 backward 反向计算。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为:
+
+- paddle.copysign(x, y) 作为独立的函数调用,非 inplace;
+- paddle.copysign_(x, y),作为独立的函数,inplace 地修改输入;
+- Tensor.copysign(y) 作为 Tensor 的方法使用,非 inplace;
+- Tensor.copysign_(y) 作为 Tensor 的方法使用,inplace 修改输入。
+
+其中
+
++ x(Tensor) - 需要取用绝对值作为输出数值部分的Tensor
++ y(Tensor, int, float 等 number)
+
+## 底层OP设计
+
+参考PyTorch与Numpy中的设计,调用底层cpp实现OP
+
+## API实现方案
+
+1. 配置算子的yaml,注意配置inplace
+2. 实现`CopySignInferMeta`,在调用kernel之前计算好`out`的`shape`和`dtype`
+3. 实现`CopySignKernel`的CPU和GPU代码以及forward、backward
+4. 封装Python的API,支持动态图和静态图,编写文档
+5. 编写单测
+
+# 六、测试和验收的考量
+
+测试考虑的case如下:
+
++ **编程范式场景**:常规覆盖动态图和静态图的测试场景
+
++ **硬件场景**:常规需覆盖 CPU、GPU 两种测试场景
++ **参数组合场景**:常规覆盖 API 的全部入参,需要对全部入参进行参数有效性和边界值测试,同时可选参数也需有相应的测试覆盖
++ **计算精度**:需要保证前向计算、反向计算的精度正确性
+  + 前向计算:与通过 numpy 实现的函数的结果进行对比
+  + 反向计算:通过 numpy 推导,验证反向计算结果的正确性
++ **维度测试**:Paddle API 支持的最低维度为 0 维,单测中应编写相应的 0 维尺寸测试 case
++ **边界测试**:y为0、+0、-0时,测试与numpy结果的一致性
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API,对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.copysign.html?highlight=copysign#torch.copysign)
+
+[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.copysign.html#numpy-copysign)
\ No newline at end of file

From 4c0022db6fef9cd3c0029f630a265bc9ac64caed Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 12:02:14 +0800
Subject: [PATCH 02/16] fix input args, add backward kernel, fix python api

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 50 ++++++++++++++++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
index d320f5efc..68a47cb3e 100644
--- a/rfcs/APIs/20230914_api_design_for_copysign.md
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -202,19 +202,55 @@ API的设计为:
 
-- paddle.copysign(x, y) 作为独立的函数调用,非 inplace;
-- paddle.copysign_(x, y),作为独立的函数,inplace 地修改输入;
-- Tensor.copysign(y) 作为 Tensor 的方法使用,非 inplace;
-- Tensor.copysign_(y) 作为 Tensor 的方法使用,inplace 修改输入。
+- paddle.copysign(x, y, name=None) 作为独立的函数调用,非 inplace;
+- paddle.copysign_(x, y, name=None),作为独立的函数,inplace 地修改输入;
+- Tensor.copysign(y, name=None) 作为 Tensor 的方法使用,非 inplace;
+- Tensor.copysign_(y, name=None) 作为 Tensor 的方法使用,inplace 修改输入。
 
 其中
 
-+ x(Tensor) - 需要取用绝对值作为输出数值部分的Tensor
-+ y(Tensor, int, float 等 number)
++ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `int32`、`int64`、`float32`、`float64`
++ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `int32`、`int64`、`float32`、`float64`
 
 ## 底层OP设计
 
-参考PyTorch与Numpy中的设计,调用底层cpp实现OP
+参考PyTorch与Numpy中的设计,调用底层cpp实现OP,反向 kernel impl 大致如下:
+
+```cpp
+template <typename T>
+struct CopySignGradFunctor {
+  CopySignGradFunctor(const T* x_data, const T* y_data, const T* dout, T* dx, int64_t numel)
+      : x_data_(x_data), y_data_(y_data), dout_(dout), dx_(dx), numel_(numel) {}
+
+  // backward 逻辑如下
+  HOSTDEVICE void operator()(int64_t idx) const {
+    if (x_data_[idx] == T(0)) dx_[idx] = T(0);
+    else dx_[idx] = T(dout_[idx]) * (T(std::copysign(x_data_[idx], y_data_[idx]) / x_data_[idx]));
+  }
+
+  const T* x_data_;
+  const T* y_data_;
+  const T* dout_;
+  T* dx_;
+  int64_t numel_;
+};
+
+template <typename T, typename Context>
+void CopySignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        const DenseTensor& out_grad,
+                        DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  auto x_data = x.data<T>(), y_data = y.data<T>(), out_grad_data = out_grad.data<T>();
+  auto x_grad_data = x_grad->data<T>();
+  phi::funcs::ForRange<Context> for_range(dev_ctx, x.numel());
+  phi::CopySignGradFunctor<T> functor(x_data, y_data, out_grad_data, x_grad_data, x.numel());
+  for_range(functor);
+}
+```
+
 
 ## API实现方案
 

From 5fff675576d71b8f17f07f1cad7bbd28a9c0412e Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 14:37:32 +0800
Subject: [PATCH 03/16] fix types

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md 
b/rfcs/APIs/20230914_api_design_for_copysign.md index 68a47cb3e..742299e43 100644 --- a/rfcs/APIs/20230914_api_design_for_copysign.md +++ b/rfcs/APIs/20230914_api_design_for_copysign.md @@ -209,8 +209,8 @@ API的设计为: 其中 -+ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `int32`、`int64`、`float32`、`float64` -+ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `int32`、`int64`、`float32`、`float64` ++ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` ++ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` ## 底层OP设计 From bfa46fec6518b3784294cfd2ad6faaeb3c814f1e Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Fri, 15 Sep 2023 14:56:00 +0800 Subject: [PATCH 04/16] fix Number types --- rfcs/APIs/20230914_api_design_for_copysign.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md index 742299e43..f2a816d4d 100644 --- a/rfcs/APIs/20230914_api_design_for_copysign.md +++ b/rfcs/APIs/20230914_api_design_for_copysign.md @@ -210,7 +210,7 @@ API的设计为: 其中 + x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` -+ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape;为 Number 时,支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16` ++ y(Tensor | Number) - 为 Tensor 时,shape 需要与 x 相同,或者可广播成 x.shape,支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`;为 Number 时,支持 `bool`、`int`、`float` ## 底层OP设计 From 72fbcfc9ede2814b78780372175eaf35402187b9 Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Wed, 27 Sep 2023 06:40:36 +0800 Subject: [PATCH 05/16] add pdist api design --- rfcs/20230926_api_design_for_pdist.md | 409 ++++++++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 rfcs/20230926_api_design_for_pdist.md diff --git a/rfcs/20230926_api_design_for_pdist.md b/rfcs/20230926_api_design_for_pdist.md new file mode 100644 index 000000000..4ffe30875 --- /dev/null +++ b/rfcs/20230926_api_design_for_pdist.md @@ -0,0 +1,409 @@ +# paddle.pdist设计文档 + +| API 名称 | paddle.pdist | +| ------------ | ----------------------------- | +| 提交作者 | coco | +| 提交时间 | 2023-09-26 | +| 版本号 | V1.0 | +| 依赖飞桨版本 | develop | +| 文件名 | 20230926_api_defign_for_pdist | + +# 一、概述 + +## 1、相关背景 + +为paddle新增该API,为计算N个向量两两之间的p-norm距离。 + +## 2、功能目标 + +一个矩阵`A`的大小为`MxN`,那么`B=pdist(A)`得到的矩阵B的大小为1行`M*(M-1)/2`列,表示的意义是M行数据,每两行计算一下p-norm距离,默认欧式距离。例如a = [[0.0, 1.0],[2.0,3.0],[4.0,5.0],[6.0,7.0]],输出为[2.8284, 5.6569, 8.4853, 2.8284, 5.6569, 2.8284]。输出顺序为distance(第一行,第二行), distance(第一行,第三行), ... distance(第二行,第三行)... + +## 3、意义 + +飞桨支持直接两两计算向量间的距离。 + +# 二、飞桨现状 + +目前paddle缺少相关功能实现。 + +# 三、业内方案调研 + +## Scipy + +Scipy中有API`scipy.spatial.distance.pdist` + +在Scipy中介绍为: + +``` +Pairwise distances between observations in n-dimensional space. 
+``` + +## 实现方法 + +从实现方法上,Scipy是通过py实现的,[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L2195-L2233) + +```python + X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True, + check_finite=False) + + s = X.shape + if len(s) != 2: + raise ValueError('A 2-dimensional array must be passed.') + + m, n = s + + if callable(metric): + mstr = getattr(metric, '__name__', 'UnknownCustomMetric') + metric_info = _METRIC_ALIAS.get(mstr, None) + + if metric_info is not None: + X, typ, kwargs = _validate_pdist_input( + X, m, n, metric_info, **kwargs) + + return _pdist_callable(X, metric=metric, out=out, **kwargs) + elif isinstance(metric, str): + mstr = metric.lower() + metric_info = _METRIC_ALIAS.get(mstr, None) + + if metric_info is not None: + pdist_fn = metric_info.pdist_func + _extra_windows_error_checks(X, out, (m * (m - 1) / 2,), **kwargs) + return pdist_fn(X, out=out, **kwargs) + elif mstr.startswith("test_"): + metric_info = _TEST_METRICS.get(mstr, None) + if metric_info is None: + raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}') + X, typ, kwargs = _validate_pdist_input( + X, m, n, metric_info, **kwargs) + return _pdist_callable( + X, metric=metric_info.dist_func, out=out, **kwargs) + else: + raise ValueError('Unknown Distance Metric: %s' % mstr) + else: + raise TypeError('2nd argument metric must be a string identifier ' + 'or a function.') +``` + +先找到`mertric`对应的函数,然后call调用,例如`metric`为`euclidean`时,调用`euclidean`的函数。[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L1781C1-L1787C7) + + + +```python + MetricInfo( + canonical_name='euclidean', + aka={'euclidean', 'euclid', 'eu', 'e'}, + dist_func=euclidean, + cdist_func=_distance_pybind.cdist_euclidean, + pdist_func=_distance_pybind.pdist_euclidean, + ), +``` + +[euclidean调用minkowski](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L500-L536)和[minkowski实现](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L429-L497) + +```python +def euclidean(u, v, w=None): + return minkowski(u, v, p=2, w=w) + + +def minkowski(u, v, p=2, w=None): + u = _validate_vector(u) + v = _validate_vector(v) + if p <= 0: + raise ValueError("p must be greater than 0") + u_v = u - v + if w is not None: + w = _validate_weights(w) + if p == 1: + root_w = w + elif p == 2: + # better precision and speed + root_w = np.sqrt(w) + elif p == np.inf: + root_w = (w != 0) + else: + root_w = np.power(w, 1/p) + u_v = root_w * u_v + dist = norm(u_v, ord=p) + return dist +``` + +主要是调用`norm`实现计算 + +```python +def norm(x, ord=None, axis=None): + if not issparse(x): + raise TypeError("input is not sparse. use numpy.linalg.norm") + + # Check the default case first and handle it immediately. + if axis is None and ord in (None, 'fro', 'f'): + return _sparse_frobenius_norm(x) + + # Some norms require functions that are not implemented for all types. 
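+    # (先统一转换成 CSR,使下面各分支用到的 abs/sum/power 等操作都有可用实现)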
+ x = x.tocsr() + + if axis is None: + axis = (0, 1) + elif not isinstance(axis, tuple): + msg = "'axis' must be None, an integer or a tuple of integers" + try: + int_axis = int(axis) + except TypeError as e: + raise TypeError(msg) from e + if axis != int_axis: + raise TypeError(msg) + axis = (int_axis,) + + nd = 2 + if len(axis) == 2: + row_axis, col_axis = axis + if not (-nd <= row_axis < nd and -nd <= col_axis < nd): + raise ValueError('Invalid axis %r for an array with shape %r' % + (axis, x.shape)) + if row_axis % nd == col_axis % nd: + raise ValueError('Duplicate axes given.') + if ord == 2: + # Only solver="lobpcg" supports all numpy dtypes + _, s, _ = svds(x, k=1, solver="lobpcg") + return s[0] + elif ord == -2: + raise NotImplementedError + #return _multi_svd_norm(x, row_axis, col_axis, amin) + elif ord == 1: + return abs(x).sum(axis=row_axis).max(axis=col_axis)[0,0] + elif ord == np.inf: + return abs(x).sum(axis=col_axis).max(axis=row_axis)[0,0] + elif ord == -1: + return abs(x).sum(axis=row_axis).min(axis=col_axis)[0,0] + elif ord == -np.inf: + return abs(x).sum(axis=col_axis).min(axis=row_axis)[0,0] + elif ord in (None, 'f', 'fro'): + # The axis order does not matter for this norm. + return _sparse_frobenius_norm(x) + else: + raise ValueError("Invalid norm order for matrices.") + elif len(axis) == 1: + a, = axis + if not (-nd <= a < nd): + raise ValueError('Invalid axis %r for an array with shape %r' % + (axis, x.shape)) + if ord == np.inf: + M = abs(x).max(axis=a) + elif ord == -np.inf: + M = abs(x).min(axis=a) + elif ord == 0: + # Zero norm + M = (x != 0).sum(axis=a) + elif ord == 1: + # special case for speedup + M = abs(x).sum(axis=a) + elif ord in (2, None): + M = sqrt(abs(x).power(2).sum(axis=a)) + else: + try: + ord + 1 + except TypeError as e: + raise ValueError('Invalid norm order for vectors.') from e + M = np.power(abs(x).power(ord).sum(axis=a), 1 / ord) + if hasattr(M, 'toarray'): + return M.toarray().ravel() + elif hasattr(M, 'A'): + return M.A.ravel() + else: + return M.ravel() + else: + raise ValueError("Improper number of dimensions to norm.") +``` + + + + + + + + + +## PyTorch + +Parameters: + +- **input** – input tensor of shape N×M. +- **p** – p value for the p-norm distance to calculate between each vector pair ∈[0,∞]∈[0,∞]. + +并且有相关描述: + +This function is equivalent to `scipy.spatial.distance.pdist(input, 'minkowski', p=p)` if p∈(0,∞). When p=0 it is equivalent to `scipy.spatial.distance.pdist(input, 'hamming') * M`. When p=∞, the closest scipy function is `scipy.spatial.distance.pdist(xn, lambda x, y: np.abs(x - y).max())`. 
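+
+基于上述等价关系,下面给出一个纯 NumPy 的参考实现草图(仅为示意;函数名 `pdist_ref` 是本文假设的名称,并非 Paddle 或 SciPy 的现有接口),可作为单测中与 `paddle.pdist` 对比数值的基准:
+
+```python
+import numpy as np
+
+def pdist_ref(x, p=2.0):
+    # x: [M, N];返回长度为 M*(M-1)/2 的一维数组,
+    # 顺序为 (0,1), (0,2), ..., (0,M-1), (1,2), ...
+    i, j = np.triu_indices(x.shape[0], k=1)  # 上三角(不含对角线)的配对下标
+    diff = np.abs(x[i] - x[j])               # 每个配对的逐元素差的绝对值
+    if p == 0:                               # p=0:非零元素个数,即 hamming * N
+        return (diff != 0).sum(axis=1).astype(x.dtype)
+    if np.isinf(p):                          # p=inf:切比雪夫距离
+        return diff.max(axis=1)
+    return (diff ** p).sum(axis=1) ** (1.0 / p)
+
+a = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0]])
+print(pdist_ref(a))  # [2.8284 5.6569 8.4853 2.8284 5.6569 2.8284]
+```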
+ + + +相关[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L58-L64) + +```cpp +Tensor pdist(const Tensor& self, const double p) { + TORCH_CHECK(self.dim() == 2, + "pdist only supports 2D tensors, got: ", self.dim(), "D"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); + TORCH_CHECK(p >= 0, "pdist only supports non-negative p values"); + return at::_pdist_forward(self.contiguous(), p); +} +``` + +调用`_pdist_forward`,[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L244-L262) + +```cpp +Tensor _pdist_forward(const Tensor& self, const double p) { + TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); + auto device = self.device().type(); + TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); + Tensor result = at::empty({0}, self.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (self.size(0) <= 1) { + result.resize_({0}); + } else { + int64_t n = self.size(0); + int64_t c = n * (n - 1) / 2; + result.resize_({c}); + if (self.size(1) == 0) { + result.fill_(0); + } else { + pdist_forward_stub(device, result, self, p); + } + } + return result; +} +``` + +主要调用`pdist_forward_stub`,绑定了具体的`pdist_forward_kernel_impl` + +```cpp +REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl); +``` + +([CPU](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L446)和[CUDA](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cuda/DistanceKernel.cu#L360)实现绑定了同一个`pdist_forward_kernel_impl`) + +而后`pdist_forward_kernel_impl`的[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L419C1-L423C2) + +```cpp +void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) { + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist", [&] { + Dist::apply_pdist(result, self, p); + }); +} +``` + +调用`apply_pdist`,[代码位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L190-L202) + +```cpp + // Assumes self is nonempty, contiguous, and 2D + static void apply_pdist(Tensor& result, const Tensor& self, const scalar_t p) { + if (p == 0.0) { + run_parallel_pdist>(result, self, p); + } else if (p == 1.0) { + run_parallel_pdist>(result, self, p); + } else if (p == 2.0) { + run_parallel_pdist>(result, self, p); + } else if (std::isinf(p)) { + run_parallel_pdist>(result, self, p); + } else { + run_parallel_pdist>(result, self, p); + } + } +``` + +`run_parallel_pdist`具体实现 + +```cpp + template + static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) { + const scalar_t * const self_start = self.data_ptr(); + const scalar_t * const self_end = self_start + self.numel(); + int64_t n = self.size(0); + int64_t m = self.size(1); + + scalar_t * const res_start = result.data_ptr(); + int64_t combs = result.numel(); // n * (n - 1) / 2 + + // We conceptually iterate over tuples of (i, j, k) where i is the first + // vector from the input, j is the second, and k is the result index. This + // parallelizes over the range of k and infers what i and j are from the + // value of k. 
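+    // 对满足 i < j 的点对 (i, j),压缩后的下标为 k = i*n - i*(i+1)/2 + (j - i - 1);
+    // 下面的闭式解由该二次关系对 i 反解而来(其中 n2 = n - 0.5),j 再由同一恒等式解出。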
+ parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) { + const Vec pvec(p); + double n2 = n - .5; + // The -1 accounts for floating point truncation issues + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); + int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; + + const scalar_t * self_i = self_start + i * m; + const scalar_t * self_j = self_start + j * m; + scalar_t * res = res_start + k; + const scalar_t * const res_end = res_start + end; + + while (res != res_end) { + *res = F::finish(vec::map2_reduce_all( + [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); }, + F::red, self_i, self_j, m), p); + + res += 1; + self_j += m; + if (self_j == self_end) { + self_i += m; + self_j = self_i + m; + } + } + }); + } +``` + + + +# 四、对比分析 + +Scipy利用现有API组合实现,PyTorch则在底层重写cpp算子。 + +# 五、设计思路与实现方案 + +## 命名与参数设计 + +API的设计为paddle.cdist(x, y, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor + +## API实现方案 + +参考`Paddle.cdist`和与`Scipy`中的设计,组合已有API实现功能 + +# 六、测试和验收的考量 + +测试考虑的case如下: + +1. 当`x`、`y` 2D 的 Tensor,并如PyTorch给出合理提示 + + ```python + >>> a = [] + >>> a = torch.tensor(a) + >>> b = torch.nn.functional.pdist(a) + Traceback (most recent call last): + File "", line 1, in + RuntimeError: pdist only supports 2D tensors, got: 1D + >>> b + ``` + + + +2. 结果一致性,和 SciPy 以及 PyTorch 结果的数值的一致性 + +# 七、可行性分析及规划排期 + +有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 + +# 八、影响面 + +为独立新增API,对其他模块没有影响 + +# 名词解释 + +无 + +# 附件及参考资料 + +[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.nn.functional.pdist.html?highlight=pdist#torch.nn.functional.pdist) + +[Scipy文档](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html) \ No newline at end of file From 9d2918be55daeb72b367b116ae266d6827e2c83c Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Wed, 27 Sep 2023 16:42:51 +0800 Subject: [PATCH 06/16] fix typo --- rfcs/{ => APIs}/20230926_api_design_for_pdist.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename rfcs/{ => APIs}/20230926_api_design_for_pdist.md (98%) diff --git a/rfcs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md similarity index 98% rename from rfcs/20230926_api_design_for_pdist.md rename to rfcs/APIs/20230926_api_design_for_pdist.md index 4ffe30875..ee64c3148 100644 --- a/rfcs/20230926_api_design_for_pdist.md +++ b/rfcs/APIs/20230926_api_design_for_pdist.md @@ -364,11 +364,11 @@ Scipy利用现有API组合实现,PyTorch则在底层重写cpp算子。 ## 命名与参数设计 -API的设计为paddle.cdist(x, y, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor +API的设计为paddle.pdist(x, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor ## API实现方案 -参考`Paddle.cdist`和与`Scipy`中的设计,组合已有API实现功能 +参考`Paddle.pdist`和与`Scipy`中的设计,组合已有API实现功能 # 六、测试和验收的考量 From 3b3e350c49fe09b99f96cee32d4e2e46c5f4e2c7 Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 06:39:54 +0800 Subject: [PATCH 07/16] fix --- rfcs/APIs/20230926_api_design_for_pdist.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md index ee64c3148..71b46d1d3 100644 --- a/rfcs/APIs/20230926_api_design_for_pdist.md +++ b/rfcs/APIs/20230926_api_design_for_pdist.md @@ -368,7 +368,7 @@ API的设计为paddle.pdist(x, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Te ## 
API实现方案 -参考`Paddle.pdist`和与`Scipy`中的设计,组合已有API实现功能 +参考`PyTorch`与`Scipy`中的设计,组合已有API实现功能 # 六、测试和验收的考量 From ab440c29329bba7e83f161da48b8a6183110a3de Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 06:58:23 +0800 Subject: [PATCH 08/16] add bitwise_shift rfc --- .../20230927_api_design_for_bitwise_shift.md | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md new file mode 100644 index 000000000..2f1f6911b --- /dev/null +++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md @@ -0,0 +1,186 @@ +# paddle.pdist设计文档 + +| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | +| ------------ | --------------------------------------------------------- | +| 提交作者 | coco | +| 提交时间 | 2023-09-27 | +| 版本号 | V1.0 | +| 依赖飞桨版本 | develop | +| 文件名 | 20230927_api_defign_for_bitwise_shift | + +# 一、概述 + +## 1、相关背景 + +为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 + +## 2、功能目标 + +通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 + +## 3、意义 + +飞桨支持直接对Tensor进行元素粒度的左移右移。 + +# 二、飞桨现状 + +目前paddle缺少相关功能实现。 + +# 三、业内方案调研 + +## PyTorch + +PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` + +介绍为: + +``` +Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. +``` + +## 实现方法 + +从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) + +```python +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) +``` + +具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): + +```python +# TODO(fdrocha): this is currently not being used anywhere, +# pending on moving triton pin past 972b761 +@staticmethod +def bitwise_right_shift(x, y): + return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" +``` + + + +## Numpy + +- Parameters: + + - **x1**:array_like, int + + Input values. + + - **x2**:array_like, int + + Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). + + - **out**:ndarray, None, or tuple of ndarray and None, optional + + A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. + + - **where**:array_like, optional + + This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. + + - **kwargs: + + For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). + +Returns: + +- **out**:ndarray, int + + Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. + + + +相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) + +```cpp +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. 
+ npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +``` + +`npy_rshift`相关调用 + +```cpp +NPY_INPLACE npy_@u@@type@ +npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) +{ + if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { + return a >> b; + } +#if @is_signed@ + else if (a < 0) { + return (npy_@u@@type@)-1; /* preserve the sign bit */ + } +#endif + else { + return 0; + } +} +``` + +# 四、对比分析 + +PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 + +# 五、设计思路与实现方案 + +## 命名与参数设计 + +API的设计为`paddle.bitwise_right_shift(x, y)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int。 + +## API实现方案 + +参考`PyTorch`和与`Numpy`中的设计,组合已有API实现功能 + +# 六、测试和验收的考量 + +测试考虑的case如下: + +1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 + +2. 结果一致性,和 PyTorch、Numpy 结果的数值的一致性 + +# 七、可行性分析及规划排期 + +有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 + +# 八、影响面 + +为独立新增API,对其他模块没有影响 + +# 名词解释 + +无 + +# 附件及参考资料 + +[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift) + +[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift) \ No newline at end of file From e3fe25c69c4dd868f98a3474a72afe70c132130a Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 07:24:06 +0800 Subject: [PATCH 09/16] update --- .../20230927_api_design_for_bitwise_shift.md | 186 ------------------ 1 file changed, 186 deletions(-) delete mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md deleted file mode 100644 index 2f1f6911b..000000000 --- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md +++ /dev/null @@ -1,186 +0,0 @@ -# paddle.pdist设计文档 - -| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | -| ------------ | --------------------------------------------------------- | -| 提交作者 | coco | -| 提交时间 | 2023-09-27 | -| 版本号 | V1.0 | -| 依赖飞桨版本 | develop | -| 文件名 | 20230927_api_defign_for_bitwise_shift | - -# 一、概述 - -## 1、相关背景 - -为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 - -## 2、功能目标 - -通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 - -## 3、意义 - -飞桨支持直接对Tensor进行元素粒度的左移右移。 - -# 二、飞桨现状 - -目前paddle缺少相关功能实现。 - -# 三、业内方案调研 - -## PyTorch - -PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` - -介绍为: - -``` -Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. -``` - -## 实现方法 - -从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) - -```python -shift_right_arithmetic = _make_elementwise_binary_prim( - "shift_right_arithmetic", - impl_aten=torch.bitwise_right_shift, - doc="", - type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, -) -``` - -具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): - -```python -# TODO(fdrocha): this is currently not being used anywhere, -# pending on moving triton pin past 972b761 -@staticmethod -def bitwise_right_shift(x, y): - return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" -``` - - - -## Numpy - -- Parameters: - - - **x1**:array_like, int - - Input values. - - - **x2**:array_like, int - - Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). - - - **out**:ndarray, None, or tuple of ndarray and None, optional - - A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. - - - **where**:array_like, optional - - This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. - - - **kwargs: - - For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). - -Returns: - -- **out**:ndarray, int - - Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. - - - -相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) - -```cpp -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. 
- npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -#else - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - @type@ in2 = *(@type@ *)ip2; - *(@type@ *)op1 = npy_rshift@c@(in1, in2); - } -#endif -} -``` - -`npy_rshift`相关调用 - -```cpp -NPY_INPLACE npy_@u@@type@ -npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) -{ - if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { - return a >> b; - } -#if @is_signed@ - else if (a < 0) { - return (npy_@u@@type@)-1; /* preserve the sign bit */ - } -#endif - else { - return 0; - } -} -``` - -# 四、对比分析 - -PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 - -# 五、设计思路与实现方案 - -## 命名与参数设计 - -API的设计为`paddle.bitwise_right_shift(x, y)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int。 - -## API实现方案 - -参考`PyTorch`和与`Numpy`中的设计,组合已有API实现功能 - -# 六、测试和验收的考量 - -测试考虑的case如下: - -1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 - -2. 结果一致性,和 PyTorch、Numpy 结果的数值的一致性 - -# 七、可行性分析及规划排期 - -有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 - -# 八、影响面 - -为独立新增API,对其他模块没有影响 - -# 名词解释 - -无 - -# 附件及参考资料 - -[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift) - -[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift) \ No newline at end of file From 68d27cf599fae8f826bd1efd76fc2e23b545388f Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 07:25:16 +0800 Subject: [PATCH 10/16] add bitwise rfc --- .../20230927_api_design_for_bitwise_shift.md | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md new file mode 100644 index 000000000..2f1f6911b --- /dev/null +++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md @@ -0,0 +1,186 @@ +# paddle.pdist设计文档 + +| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | +| ------------ | --------------------------------------------------------- | +| 提交作者 | coco | +| 提交时间 | 2023-09-27 | +| 版本号 | V1.0 | +| 依赖飞桨版本 | develop | +| 文件名 | 20230927_api_defign_for_bitwise_shift | + +# 一、概述 + +## 1、相关背景 + +为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 + +## 2、功能目标 + +通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 + +## 3、意义 + +飞桨支持直接对Tensor进行元素粒度的左移右移。 + +# 二、飞桨现状 + +目前paddle缺少相关功能实现。 + +# 三、业内方案调研 + +## PyTorch + +PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` + +介绍为: + +``` +Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. +``` + +## 实现方法 + +从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) + +```python +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) +``` + +具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): + +```python +# TODO(fdrocha): this is currently not being used anywhere, +# pending on moving triton pin past 972b761 +@staticmethod +def bitwise_right_shift(x, y): + return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" +``` + + + +## Numpy + +- Parameters: + + - **x1**:array_like, int + + Input values. + + - **x2**:array_like, int + + Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). + + - **out**:ndarray, None, or tuple of ndarray and None, optional + + A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. + + - **where**:array_like, optional + + This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. + + - **kwargs: + + For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). + +Returns: + +- **out**:ndarray, int + + Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. + + + +相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) + +```cpp +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. 
+ npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +``` + +`npy_rshift`相关调用 + +```cpp +NPY_INPLACE npy_@u@@type@ +npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) +{ + if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { + return a >> b; + } +#if @is_signed@ + else if (a < 0) { + return (npy_@u@@type@)-1; /* preserve the sign bit */ + } +#endif + else { + return 0; + } +} +``` + +# 四、对比分析 + +PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 + +# 五、设计思路与实现方案 + +## 命名与参数设计 + +API的设计为`paddle.bitwise_right_shift(x, y)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int。 + +## API实现方案 + +参考`PyTorch`和与`Numpy`中的设计,组合已有API实现功能 + +# 六、测试和验收的考量 + +测试考虑的case如下: + +1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 + +2. 结果一致性,和 PyTorch、Numpy 结果的数值的一致性 + +# 七、可行性分析及规划排期 + +有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 + +# 八、影响面 + +为独立新增API,对其他模块没有影响 + +# 名词解释 + +无 + +# 附件及参考资料 + +[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift) + +[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift) \ No newline at end of file From 7aff2ebd0a96323128886edc4a46b13c90278d9c Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 07:29:15 +0800 Subject: [PATCH 11/16] update --- rfcs/APIs/20230926_api_design_for_pdist.md | 409 --------------------- 1 file changed, 409 deletions(-) delete mode 100644 rfcs/APIs/20230926_api_design_for_pdist.md diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md deleted file mode 100644 index 71b46d1d3..000000000 --- a/rfcs/APIs/20230926_api_design_for_pdist.md +++ /dev/null @@ -1,409 +0,0 @@ -# paddle.pdist设计文档 - -| API 名称 | paddle.pdist | -| ------------ | ----------------------------- | -| 提交作者 | coco | -| 提交时间 | 2023-09-26 | -| 版本号 | V1.0 | -| 依赖飞桨版本 | develop | -| 文件名 | 20230926_api_defign_for_pdist | - -# 一、概述 - -## 1、相关背景 - -为paddle新增该API,为计算N个向量两两之间的p-norm距离。 - -## 2、功能目标 - -一个矩阵`A`的大小为`MxN`,那么`B=pdist(A)`得到的矩阵B的大小为1行`M*(M-1)/2`列,表示的意义是M行数据,每两行计算一下p-norm距离,默认欧式距离。例如a = [[0.0, 1.0],[2.0,3.0],[4.0,5.0],[6.0,7.0]],输出为[2.8284, 5.6569, 8.4853, 2.8284, 5.6569, 2.8284]。输出顺序为distance(第一行,第二行), distance(第一行,第三行), ... distance(第二行,第三行)... - -## 3、意义 - -飞桨支持直接两两计算向量间的距离。 - -# 二、飞桨现状 - -目前paddle缺少相关功能实现。 - -# 三、业内方案调研 - -## Scipy - -Scipy中有API`scipy.spatial.distance.pdist` - -在Scipy中介绍为: - -``` -Pairwise distances between observations in n-dimensional space. 
-``` - -## 实现方法 - -从实现方法上,Scipy是通过py实现的,[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L2195-L2233) - -```python - X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True, - check_finite=False) - - s = X.shape - if len(s) != 2: - raise ValueError('A 2-dimensional array must be passed.') - - m, n = s - - if callable(metric): - mstr = getattr(metric, '__name__', 'UnknownCustomMetric') - metric_info = _METRIC_ALIAS.get(mstr, None) - - if metric_info is not None: - X, typ, kwargs = _validate_pdist_input( - X, m, n, metric_info, **kwargs) - - return _pdist_callable(X, metric=metric, out=out, **kwargs) - elif isinstance(metric, str): - mstr = metric.lower() - metric_info = _METRIC_ALIAS.get(mstr, None) - - if metric_info is not None: - pdist_fn = metric_info.pdist_func - _extra_windows_error_checks(X, out, (m * (m - 1) / 2,), **kwargs) - return pdist_fn(X, out=out, **kwargs) - elif mstr.startswith("test_"): - metric_info = _TEST_METRICS.get(mstr, None) - if metric_info is None: - raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}') - X, typ, kwargs = _validate_pdist_input( - X, m, n, metric_info, **kwargs) - return _pdist_callable( - X, metric=metric_info.dist_func, out=out, **kwargs) - else: - raise ValueError('Unknown Distance Metric: %s' % mstr) - else: - raise TypeError('2nd argument metric must be a string identifier ' - 'or a function.') -``` - -先找到`mertric`对应的函数,然后call调用,例如`metric`为`euclidean`时,调用`euclidean`的函数。[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L1781C1-L1787C7) - - - -```python - MetricInfo( - canonical_name='euclidean', - aka={'euclidean', 'euclid', 'eu', 'e'}, - dist_func=euclidean, - cdist_func=_distance_pybind.cdist_euclidean, - pdist_func=_distance_pybind.pdist_euclidean, - ), -``` - -[euclidean调用minkowski](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L500-L536)和[minkowski实现](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L429-L497) - -```python -def euclidean(u, v, w=None): - return minkowski(u, v, p=2, w=w) - - -def minkowski(u, v, p=2, w=None): - u = _validate_vector(u) - v = _validate_vector(v) - if p <= 0: - raise ValueError("p must be greater than 0") - u_v = u - v - if w is not None: - w = _validate_weights(w) - if p == 1: - root_w = w - elif p == 2: - # better precision and speed - root_w = np.sqrt(w) - elif p == np.inf: - root_w = (w != 0) - else: - root_w = np.power(w, 1/p) - u_v = root_w * u_v - dist = norm(u_v, ord=p) - return dist -``` - -主要是调用`norm`实现计算 - -```python -def norm(x, ord=None, axis=None): - if not issparse(x): - raise TypeError("input is not sparse. use numpy.linalg.norm") - - # Check the default case first and handle it immediately. - if axis is None and ord in (None, 'fro', 'f'): - return _sparse_frobenius_norm(x) - - # Some norms require functions that are not implemented for all types. 
- x = x.tocsr() - - if axis is None: - axis = (0, 1) - elif not isinstance(axis, tuple): - msg = "'axis' must be None, an integer or a tuple of integers" - try: - int_axis = int(axis) - except TypeError as e: - raise TypeError(msg) from e - if axis != int_axis: - raise TypeError(msg) - axis = (int_axis,) - - nd = 2 - if len(axis) == 2: - row_axis, col_axis = axis - if not (-nd <= row_axis < nd and -nd <= col_axis < nd): - raise ValueError('Invalid axis %r for an array with shape %r' % - (axis, x.shape)) - if row_axis % nd == col_axis % nd: - raise ValueError('Duplicate axes given.') - if ord == 2: - # Only solver="lobpcg" supports all numpy dtypes - _, s, _ = svds(x, k=1, solver="lobpcg") - return s[0] - elif ord == -2: - raise NotImplementedError - #return _multi_svd_norm(x, row_axis, col_axis, amin) - elif ord == 1: - return abs(x).sum(axis=row_axis).max(axis=col_axis)[0,0] - elif ord == np.inf: - return abs(x).sum(axis=col_axis).max(axis=row_axis)[0,0] - elif ord == -1: - return abs(x).sum(axis=row_axis).min(axis=col_axis)[0,0] - elif ord == -np.inf: - return abs(x).sum(axis=col_axis).min(axis=row_axis)[0,0] - elif ord in (None, 'f', 'fro'): - # The axis order does not matter for this norm. - return _sparse_frobenius_norm(x) - else: - raise ValueError("Invalid norm order for matrices.") - elif len(axis) == 1: - a, = axis - if not (-nd <= a < nd): - raise ValueError('Invalid axis %r for an array with shape %r' % - (axis, x.shape)) - if ord == np.inf: - M = abs(x).max(axis=a) - elif ord == -np.inf: - M = abs(x).min(axis=a) - elif ord == 0: - # Zero norm - M = (x != 0).sum(axis=a) - elif ord == 1: - # special case for speedup - M = abs(x).sum(axis=a) - elif ord in (2, None): - M = sqrt(abs(x).power(2).sum(axis=a)) - else: - try: - ord + 1 - except TypeError as e: - raise ValueError('Invalid norm order for vectors.') from e - M = np.power(abs(x).power(ord).sum(axis=a), 1 / ord) - if hasattr(M, 'toarray'): - return M.toarray().ravel() - elif hasattr(M, 'A'): - return M.A.ravel() - else: - return M.ravel() - else: - raise ValueError("Improper number of dimensions to norm.") -``` - - - - - - - - - -## PyTorch - -Parameters: - -- **input** – input tensor of shape N×M. -- **p** – p value for the p-norm distance to calculate between each vector pair ∈[0,∞]∈[0,∞]. - -并且有相关描述: - -This function is equivalent to `scipy.spatial.distance.pdist(input, 'minkowski', p=p)` if p∈(0,∞). When p=0 it is equivalent to `scipy.spatial.distance.pdist(input, 'hamming') * M`. When p=∞, the closest scipy function is `scipy.spatial.distance.pdist(xn, lambda x, y: np.abs(x - y).max())`. 
- - - -相关[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L58-L64) - -```cpp -Tensor pdist(const Tensor& self, const double p) { - TORCH_CHECK(self.dim() == 2, - "pdist only supports 2D tensors, got: ", self.dim(), "D"); - TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); - TORCH_CHECK(p >= 0, "pdist only supports non-negative p values"); - return at::_pdist_forward(self.contiguous(), p); -} -``` - -调用`_pdist_forward`,[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L244-L262) - -```cpp -Tensor _pdist_forward(const Tensor& self, const double p) { - TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); - auto device = self.device().type(); - TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); - Tensor result = at::empty({0}, self.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT); - if (self.size(0) <= 1) { - result.resize_({0}); - } else { - int64_t n = self.size(0); - int64_t c = n * (n - 1) / 2; - result.resize_({c}); - if (self.size(1) == 0) { - result.fill_(0); - } else { - pdist_forward_stub(device, result, self, p); - } - } - return result; -} -``` - -主要调用`pdist_forward_stub`,绑定了具体的`pdist_forward_kernel_impl` - -```cpp -REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl); -``` - -([CPU](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L446)和[CUDA](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cuda/DistanceKernel.cu#L360)实现绑定了同一个`pdist_forward_kernel_impl`) - -而后`pdist_forward_kernel_impl`的[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L419C1-L423C2) - -```cpp -void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) { - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist", [&] { - Dist::apply_pdist(result, self, p); - }); -} -``` - -调用`apply_pdist`,[代码位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L190-L202) - -```cpp - // Assumes self is nonempty, contiguous, and 2D - static void apply_pdist(Tensor& result, const Tensor& self, const scalar_t p) { - if (p == 0.0) { - run_parallel_pdist>(result, self, p); - } else if (p == 1.0) { - run_parallel_pdist>(result, self, p); - } else if (p == 2.0) { - run_parallel_pdist>(result, self, p); - } else if (std::isinf(p)) { - run_parallel_pdist>(result, self, p); - } else { - run_parallel_pdist>(result, self, p); - } - } -``` - -`run_parallel_pdist`具体实现 - -```cpp - template - static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) { - const scalar_t * const self_start = self.data_ptr(); - const scalar_t * const self_end = self_start + self.numel(); - int64_t n = self.size(0); - int64_t m = self.size(1); - - scalar_t * const res_start = result.data_ptr(); - int64_t combs = result.numel(); // n * (n - 1) / 2 - - // We conceptually iterate over tuples of (i, j, k) where i is the first - // vector from the input, j is the second, and k is the result index. This - // parallelizes over the range of k and infers what i and j are from the - // value of k. 
- parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) { - const Vec pvec(p); - double n2 = n - .5; - // The -1 accounts for floating point truncation issues - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); - int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; - - const scalar_t * self_i = self_start + i * m; - const scalar_t * self_j = self_start + j * m; - scalar_t * res = res_start + k; - const scalar_t * const res_end = res_start + end; - - while (res != res_end) { - *res = F::finish(vec::map2_reduce_all( - [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); }, - F::red, self_i, self_j, m), p); - - res += 1; - self_j += m; - if (self_j == self_end) { - self_i += m; - self_j = self_i + m; - } - } - }); - } -``` - - - -# 四、对比分析 - -Scipy利用现有API组合实现,PyTorch则在底层重写cpp算子。 - -# 五、设计思路与实现方案 - -## 命名与参数设计 - -API的设计为paddle.pdist(x, p=2.0),其中 `x` 严格为 shape=[M, N] 的 Tensor,`p` 为p-范数对应的p值,输出为一行 `Mx(M-1)/2` 列的 Tensor - -## API实现方案 - -参考`PyTorch`与`Scipy`中的设计,组合已有API实现功能 - -# 六、测试和验收的考量 - -测试考虑的case如下: - -1. 当`x`、`y` 2D 的 Tensor,并如PyTorch给出合理提示 - - ```python - >>> a = [] - >>> a = torch.tensor(a) - >>> b = torch.nn.functional.pdist(a) - Traceback (most recent call last): - File "", line 1, in - RuntimeError: pdist only supports 2D tensors, got: 1D - >>> b - ``` - - - -2. 结果一致性,和 SciPy 以及 PyTorch 结果的数值的一致性 - -# 七、可行性分析及规划排期 - -有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。 - -# 八、影响面 - -为独立新增API,对其他模块没有影响 - -# 名词解释 - -无 - -# 附件及参考资料 - -[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.nn.functional.pdist.html?highlight=pdist#torch.nn.functional.pdist) - -[Scipy文档](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html) \ No newline at end of file From 703a1db361228e0fcc793a083da7640ef94eeb11 Mon Sep 17 00:00:00 2001 From: coco <1228759711@qq.com> Date: Thu, 28 Sep 2023 21:05:31 +0800 Subject: [PATCH 12/16] update jax shift doc --- .../20230927_api_design_for_bitwise_shift.md | 86 ++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md index 2f1f6911b..376a96fb5 100644 --- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md +++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md @@ -40,7 +40,7 @@ Computes the right arithmetic shift of input by other bits. The input tensor mus ## 实现方法 -从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149) +从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1151) ```python shift_right_arithmetic = _make_elementwise_binary_prim( @@ -49,6 +49,8 @@ shift_right_arithmetic = _make_elementwise_binary_prim( doc="", type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, ) + +shift_right_logical = _not_impl # 可见pytorch中仅支持算数位移 ``` 具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): @@ -145,19 +147,97 @@ npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) } ``` + + +## Jax + +算数位移 + +- jax.lax.**shift_right_arithmetic**(*x*, *y*)[[source\]](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_arithmetic) + + Elementwise arithmetic right shift: x ≫ y. 
+
+  Parameters
+
+  - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+  - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+
+  Return type: [`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array)
+
+具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1748-L1760)
+
+```python
+# Note: Bitwise operations only yield identical results on unsigned integers!
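+# (tf.bitwise.right_shift 对有符号类型执行算术位移、对无符号类型执行逻辑位移,
+#  因此下面需要按目标语义先做 dtype 转换。)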
+# pylint: disable=protected-access
+def _shift_right_arithmetic_raw(x, y):
+  if x.dtype.is_unsigned:
+    assert x.dtype == y.dtype
+    orig_dtype = x.dtype
+    signed_dtype = _UNSIGNED_TO_SIGNED_TABLE[orig_dtype]
+    x = tf.cast(x, signed_dtype)
+    y = tf.cast(y, signed_dtype)
+    res = tf.bitwise.right_shift(x, y)
+    return tf.cast(res, orig_dtype)
+  else:
+    return tf.bitwise.right_shift(x, y)
+```
+
+> 在算术位移过程中,如果是有符号数,则直接进行算术位移;如果是无符号数,则先转换成对应的有符号数,再进行位移(因为算术右移时常常需要补符号位,只有有符号数才能保证结果正确性),最后转回无符号数。
+
+
+
+逻辑位移
+
++ jax.lax.**shift_right_logical**(*x*, *y*) [source](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_logical)
+
+  Elementwise logical right shift: x ≫ y.
+
+  Parameters
+
+  - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+  - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)])
+
+  Return type[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array)
+
+
+
+具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1776-L1786)
+
+```python
+def _shift_right_logical_raw(x, y):
+  if x.dtype.is_unsigned:
+    return tf.bitwise.right_shift(x, y)
+  else:
+    assert x.dtype == y.dtype
+    orig_dtype = x.dtype
+    unsigned_dtype = _SIGNED_TO_UNSIGNED_TABLE[orig_dtype]
+    x = tf.cast(x, unsigned_dtype)
+    y = tf.cast(y, unsigned_dtype)
+    res = tf.bitwise.right_shift(x, y)
+    return tf.cast(res, orig_dtype)
+```
+
+> 在逻辑位移过程中,如果是无符号数,则直接进行位移(此时算术位移与逻辑位移等价);如果是有符号数,则先转换成无符号数,再进行位移(因为逻辑右移时常常需要补0,只有无符号数才能保证结果正确性),最后转回有符号数。
+
+
+
+
+
# 四、对比分析

PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。

+同时,PyTorch与Numpy中都仅支持算术位移,不支持逻辑位移,而JAX中实现了算术位移和逻辑位移。
+
+
+
# 五、设计思路与实现方案

## 命名与参数设计

API的设计为`paddle.bitwise_right_shift(x, y, is_arithmetic=True)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int;`is_arithmetic` 为bool类型,默认为 `True` 表示算术位移,当其为 `False` 时则为逻辑位移。

## API实现方案

参考`PyTorch`、`Numpy`、`JAX`中的设计,组合已有API实现功能

# 六、测试和验收的考量

From 23a6b8db5d904860c810cee9c270f27b0ee2cd79 Mon Sep 17 00:00:00 2001
From:
coco <1228759711@qq.com> Date: Mon, 2 Oct 2023 00:23:37 +0800 Subject: [PATCH 13/16] add histogramdd rfc --- .../20230927_api_design_for_bitwise_shift.md | 266 ---------------- .../20231001_api_design_for_histogramdd.md | 294 ++++++++++++++++++ 2 files changed, 294 insertions(+), 266 deletions(-) delete mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md create mode 100644 rfcs/APIs/20231001_api_design_for_histogramdd.md diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md deleted file mode 100644 index 376a96fb5..000000000 --- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md +++ /dev/null @@ -1,266 +0,0 @@ -# paddle.pdist设计文档 - -| API 名称 | paddle.bitwise_right_shift
paddle.bitwise_left_shift | -| ------------ | --------------------------------------------------------- | -| 提交作者 | coco | -| 提交时间 | 2023-09-27 | -| 版本号 | V1.0 | -| 依赖飞桨版本 | develop | -| 文件名 | 20230927_api_defign_for_bitwise_shift | - -# 一、概述 - -## 1、相关背景 - -为paddle新增该API,给 Tensor 做 element wise 的算数(或逻辑)左移/右移。 - -## 2、功能目标 - -通过一个Tensor给定的bits计算另一个Tensor的的算术(或逻辑)右移/左移。 - -## 3、意义 - -飞桨支持直接对Tensor进行元素粒度的左移右移。 - -# 二、飞桨现状 - -目前paddle缺少相关功能实现。 - -# 三、业内方案调研 - -## PyTorch - -PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor` - -介绍为: - -``` -Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion. -``` - -## 实现方法 - -从实现方法上,PyTorch是将位运算注册到element_wise系列中实现的,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1151) - -```python -shift_right_arithmetic = _make_elementwise_binary_prim( - "shift_right_arithmetic", - impl_aten=torch.bitwise_right_shift, - doc="", - type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, -) - -shift_right_logical = _not_impl # 可见pytorch中仅支持算数位移 -``` - -具体元素尺度的实现,[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405): - -```python -# TODO(fdrocha): this is currently not being used anywhere, -# pending on moving triton pin past 972b761 -@staticmethod -def bitwise_right_shift(x, y): - return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" -``` - - - -## Numpy - -- Parameters: - - - **x1**:array_like, int - - Input values. - - - **x2**:array_like, int - - Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output). - - - **out**:ndarray, None, or tuple of ndarray and None, optional - - A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs. - - - **where**:array_like, optional - - This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized. - - - **kwargs: - - For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs). - -Returns: - -- **out**:ndarray, int - - Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars. - - - -相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105) - -```cpp -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. 
- npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -#else - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - @type@ in2 = *(@type@ *)ip2; - *(@type@ *)op1 = npy_rshift@c@(in1, in2); - } -#endif -} -``` - -`npy_rshift`相关调用 - -```cpp -NPY_INPLACE npy_@u@@type@ -npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) -{ - if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) { - return a >> b; - } -#if @is_signed@ - else if (a < 0) { - return (npy_@u@@type@)-1; /* preserve the sign bit */ - } -#endif - else { - return 0; - } -} -``` - - - -## Jax - -算数位移 - -- jax.lax.**shift_right_arithmetic**(*x*, *y*)[[source\]](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_arithmetic) - - Elementwise arithmetic right shift: x ≫ y. - - Parameters - - - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - Return type[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array) - -具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1748-L1760) - -```python -# Note: Bitwise operations only yield identical results on unsigned integers! 
-# pylint: disable=protected-access -def _shift_right_arithmetic_raw(x, y): - if x.dtype.is_unsigned: - assert x.dtype == y.dtype - orig_dtype = x.dtype - signed_dtype = _UNSIGNED_TO_SIGNED_TABLE[orig_dtype] - x = tf.cast(x, signed_dtype) - y = tf.cast(y, signed_dtype) - res = tf.bitwise.right_shift(x, y) - return tf.cast(res, orig_dtype) - else: - return tf.bitwise.right_shift(x, y) -``` - -> 在算术位移过程中,如果是有符号数,则直接进行算数位移;如果是无符号数,则先转换成对应的有符号数,再进行位移(因为算术右移时常常需要补符号位,只有有符号数才能保证结果正确性),最后转回无符号数。 - - - -逻辑位移 - -+ jax.lax.**shift_right_logical**(*x*, *y*) [source](https://jax.readthedocs.io/en/latest/_modules/jax/_src/lax/lax.html#shift_right_logical) - - Elementwise logical right shift: �≫�.Parameters - - - **x** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - **y** ([`Union`](https://docs.python.org/3/library/typing.html#typing.Union)[[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array), [`ndarray`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray), [`bool_`](https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_), [`number`](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.number.html#jax.numpy.number), [`bool`](https://docs.python.org/3/library/functions.html#bool), [`int`](https://docs.python.org/3/library/functions.html#int), [`float`](https://docs.python.org/3/library/functions.html#float), [`complex`](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.complex.html#jax.lax.complex)]) - - Return type[`Array`](https://jax.readthedocs.io/en/latest/_autosummary/jax.Array.html#jax.Array) - - - -具体[实现代码](https://github.com/google/jax/blob/2d068a1caa97ebde662604bb95298cff2abb0afa/jax/experimental/jax2tf/jax2tf.py#L1776-L1786) - -```python -def _shift_right_logical_raw(x, y): - if x.dtype.is_unsigned: - return tf.bitwise.right_shift(x, y) - else: - assert x.dtype == y.dtype - orig_dtype = x.dtype - unsigned_dtype = _SIGNED_TO_UNSIGNED_TABLE[orig_dtype] - x = tf.cast(x, unsigned_dtype) - y = tf.cast(y, unsigned_dtype) - res = tf.bitwise.right_shift(x, y) - return tf.cast(res, orig_dtype) -``` - -> 在逻辑位移过程中,如果是无符号数,则直接进行算数位移;如果是有符号数,则先转换成无符号数,再进行位移(因为逻辑右移时常常需要补0,只有无符号数才能保证结果正确性),最后转回有符号数。 - - - - - -# 四、对比分析 - -PyTorch是将算子注册到element wise系列中,Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。 - -同时,PyTorch与Numpy中都仅支持算术位移,不支持逻辑位移,而JAX中实现了算术位移和逻辑位移。 - - - -# 五、设计思路与实现方案 - -## 命名与参数设计 - -API的设计为`paddle.bitwise_right_shift(x, y, is_arithmetic=True)`,其余几个shift操作同理,其中 `x` 与 `y` 需要有相同的shape或者能够进行广播,且类型都必须为int;`is_arithmetic` 为bool类型,默认为 `True` 表示算术位移,当其为 `False` 时则为逻辑位移。 - -## API实现方案 - -参考`PyTorch`、`Numpy`、`JAX`中的设计,组合已有API实现功能 - -# 六、测试和验收的考量 - -测试考虑的case如下: - -1. 对 `x`、`y`的 shape 和 dtype 有限制,并给出合理提示 - -2. 
结果一致性,和 PyTorch、Numpy 结果的数值的一致性
-
-# 七、可行性分析及规划排期
-
-有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。
-
-# 八、影响面
-
-为独立新增API,对其他模块没有影响
-
-# 名词解释
-
-无
-
-# 附件及参考资料
-
-[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift)
-
-[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift)
\ No newline at end of file
diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
new file mode 100644
index 000000000..f1b483367
--- /dev/null
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -0,0 +1,294 @@
+# paddle.histogramdd设计文档
+
+| API 名称     | paddle.histogramdd                  |
+| ------------ | ----------------------------------- |
+| 提交作者     | coco                                |
+| 提交时间     | 2023-10-01                          |
+| 版本号       | V1.0                                |
+| 依赖飞桨版本 | develop                             |
+| 文件名       | 20231001_api_design_for_histogramdd |
+
+# 一、概述
+
+## 1、相关背景
+
+为了提升飞桨API丰富度,Paddle需要扩充API,调用路径为:
+
+- paddle.histogramdd
+
+## 2、功能目标
+
+实现多维数据的直方图(histogram)计算
+
+## 3、意义
+
+飞桨支持为多维tensor进行直方图计算
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## PyTorch
+
+PyTorch中有API `torch.histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[])`
+
+## 实现方法
+
+从实现方法上,PyTorch是通过c++实现的,[代码位置](https://github.com/pytorch/pytorch/blob/e4414716d55c14d86298d6434e23d47605532cca/aten/src/ATen/native/cpu/HistogramKernel.cpp#L207-L255)
+
+```cpp
+/* Some pre- and post- processing steps for the main algorithm.
+ * Initializes hist to 0, calls into the main algorithm, and normalizes output if necessary.
+ */
+template<BIN_SELECTION_ALGORITHM algorithm>
+void histogramdd_out_cpu_template(const Tensor& self, const c10::optional<Tensor>& weight, bool density,
+        Tensor& hist, const TensorList& bin_edges) {
+    hist.fill_(0);
+
+    const int64_t N = self.size(-1);
+    const int64_t M = std::accumulate(self.sizes().begin(), self.sizes().end() - 1,
+            (int64_t)1, std::multiplies<int64_t>());
+
+    const Tensor reshaped_input = self.reshape({M, N});
+
+    const auto reshaped_weight = weight.has_value()
+        ? c10::optional<Tensor>(weight.value().reshape({M}))
+        : c10::optional<Tensor>();
+
+    std::vector<Tensor> bin_edges_contig(bin_edges.size());
+    for (const auto dim : c10::irange(bin_edges_contig.size())) {
+        bin_edges_contig[dim] = bin_edges[dim].contiguous();
+    }
+
+    AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, self.scalar_type(), "histogram_cpu", [&]() {
+        histogramdd_cpu_contiguous<scalar_t, algorithm>(
+            hist, bin_edges_contig, reshaped_input, reshaped_weight);
+    });
+
+    /* Divides each bin's value by the total count/weight in all bins,
+     * and by the bin's volume.
+     */
+    if (density) {
+        const auto hist_sum = hist.sum().item<double>();
+        hist.div_(hist_sum);
+
+        /* For each dimension, divides each bin's value
+         * by the bin's length in that dimension.
+         */
+        for (const auto dim : c10::irange(N)) {
+            const auto bin_lengths = bin_edges[dim].diff();
+
+            // Used to reshape bin_lengths to align with the corresponding dimension of hist.
+            std::vector<int64_t> shape(N, 1);
+            shape[dim] = bin_lengths.numel();
+
+            hist.div_(bin_lengths.reshape(shape));
+        }
+    }
+}
+```
+
+
+
+
+
+## TensorFlow
+
+无`histogramdd`实现
+
+## Numpy
+
+numpy.**histogramdd**(*sample*, *bins=10*, *range=None*, *density=None*, *weights=None*)[[source\]](https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/histograms.py#L901-L1072)
+
+Compute the multidimensional histogram of some data.
+
+- Parameters:
+
+  **sample**: (N, D) array, or (N, D) array_like
+
+  The data to be histogrammed. Note the unusual interpretation of sample when an array_like:
+
+  - When an array, each row is a coordinate in a D-dimensional space - such as `histogramdd(np.array([p1, p2, p3]))`.
+  - When an array_like, each element is the list of values for single coordinate - such as `histogramdd((X, Y, Z))`.
+
+  The first form should be preferred.
+
+  **bins**: sequence or int, optional
+
+  The bin specification:
+
+  - A sequence of arrays describing the monotonically increasing bin edges along each dimension.
+  - The number of bins for each dimension (nx, ny, … = bins)
+  - The number of bins for all dimensions (nx = ny = … = bins).
+
+  **range**: sequence, optional
+
+  A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges are not given explicitly in *bins*. An entry of None in the sequence results in the minimum and maximum values being used for the corresponding dimension. The default, None, is equivalent to passing a tuple of D None values.
+
+  **density**: bool, optional
+
+  If False, the default, returns the number of samples in each bin. If True, returns the probability *density* function at the bin, `bin_count / sample_count / bin_volume`.
+
+  **weights**: (N,) array_like, optional
+
+  An array of values *w_i* weighing each sample *(x_i, y_i, z_i, …)*. Weights are normalized to 1 if density is True. If density is False, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin.
+
+- Returns:
+
+  **H**: ndarray
+
+  The multidimensional histogram of sample x. See density and weights for the different possible semantics.
+
+  **edges**: list
+
+  A list of D arrays describing the bin edges for each dimension.
+
+### 实现方法
+
+numpy 的 `histogramdd` 无需底层 c++ kernel,直接在 Python 层实现,[代码位置](https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/histograms.py#L901-L1072)
+
+```python
+@array_function_dispatch(_histogramdd_dispatcher)
+def histogramdd(sample, bins=10, range=None, density=None, weights=None):
+    try:
+        # Sample is an ND-array.
+        N, D = sample.shape
+    except (AttributeError, ValueError):
+        # Sample is a sequence of 1D arrays.
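+        # (注:本行注释为本文补充——对应 histogramdd((X, Y, Z)) 形式的输入,
+        # np.atleast_2d 先堆叠为 (D, N),转置后得到 (N, D)。)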
+        sample = np.atleast_2d(sample).T
+        N, D = sample.shape
+
+    nbin = np.empty(D, np.intp)
+    edges = D*[None]
+    dedges = D*[None]
+    if weights is not None:
+        weights = np.asarray(weights)
+
+    try:
+        M = len(bins)
+        if M != D:
+            raise ValueError(
+                'The dimension of bins must be equal to the dimension of the '
+                ' sample x.')
+    except TypeError:
+        # bins is an integer
+        bins = D*[bins]
+
+    # normalize the range argument
+    if range is None:
+        range = (None,) * D
+    elif len(range) != D:
+        raise ValueError('range argument must have one entry per dimension')
+
+    # Create edge arrays
+    for i in _range(D):
+        if np.ndim(bins[i]) == 0:
+            if bins[i] < 1:
+                raise ValueError(
+                    '`bins[{}]` must be positive, when an integer'.format(i))
+            smin, smax = _get_outer_edges(sample[:,i], range[i])
+            try:
+                n = operator.index(bins[i])
+
+            except TypeError as e:
+                raise TypeError(
+                    "`bins[{}]` must be an integer, when a scalar".format(i)
+                ) from e
+
+            edges[i] = np.linspace(smin, smax, n + 1)
+        elif np.ndim(bins[i]) == 1:
+            edges[i] = np.asarray(bins[i])
+            if np.any(edges[i][:-1] > edges[i][1:]):
+                raise ValueError(
+                    '`bins[{}]` must be monotonically increasing, when an array'
+                    .format(i))
+        else:
+            raise ValueError(
+                '`bins[{}]` must be a scalar or 1d array'.format(i))
+
+        nbin[i] = len(edges[i]) + 1  # includes an outlier on each end
+        dedges[i] = np.diff(edges[i])
+
+    # Compute the bin number each sample falls into.
+    Ncount = tuple(
+        # avoid np.digitize to work around gh-11022
+        np.searchsorted(edges[i], sample[:, i], side='right')
+        for i in _range(D)
+    )
+
+    # Using digitize, values that fall on an edge are put in the right bin.
+    # For the rightmost bin, we want values equal to the right edge to be
+    # counted in the last bin, and not as an outlier.
+    for i in _range(D):
+        # Find which points are on the rightmost edge.
+        on_edge = (sample[:, i] == edges[i][-1])
+        # Shift these points one bin to the left.
+        Ncount[i][on_edge] -= 1
+
+    # Compute the sample indices in the flattened histogram matrix.
+    # This raises an error if the array is too large.
+    xy = np.ravel_multi_index(Ncount, nbin)
+
+    # Compute the number of repetitions in xy and assign it to the
+    # flattened histmat.
+    hist = np.bincount(xy, weights, minlength=nbin.prod())
+
+    # Shape into a proper matrix
+    hist = hist.reshape(nbin)
+
+    # This preserves the (bad) behavior observed in gh-7845, for now.
+    hist = hist.astype(float, casting='safe')
+
+    # Remove outliers (indices 0 and -1 for each dimension).
+    core = D*(slice(1, -1),)
+    hist = hist[core]
+
+    if density:
+        # calculate the probability density function
+        s = hist.sum()
+        for i in _range(D):
+            shape = np.ones(D, int)
+            shape[i] = nbin[i] - 2
+            hist = hist / dedges[i].reshape(shape)
+        hist /= s
+
+    if (hist.shape != nbin - 2).any():
+        raise RuntimeError(
+            "Internal Shape Error")
+    return hist, edges
+```
+
+
+
+# 四、对比分析
+
+PyTorch底层用cpp实现kernel,Numpy通过API在Python层直接实现。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为:
+
+- paddle.histogramdd(sample, bins, range=None, density=None, weights=None, name=None)
+
+其中
+
++ sample(Tensor) - 输入的多维 tensor
++ bins(Tensor[], int[], int) 若为`Tensor[]`,则定义了bin的边缘序列;若为`int[]`,则每个值分别定义了每个维度的等宽bin的数量;若为`int`,则定义了所有维度的等宽bin的数量。
++ range(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
++ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入sample除去最内部维度的shape相同,例如当sample的shape为[M,N]时,weight的shape必须为[M]。
+
+## 底层OP设计
+
+参考PyTorch与Numpy中的设计,通过组合Python API实现功能。
+
+## API实现方案
+
+1. 在 Paddle repo 的 python/paddle/linalg.py 文件中实现Python。
+2. 编写文档
+3. 编写单测
+
+# 六、测试和验收的考量
+
+测试考虑的case如下:
+
++ 验证正确性,与PyTorch中的API计算结果进行对比。
++ 修改参数,对不同参数的功能以及边界值进行测试。
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考,工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API,对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.histogramdd.html?highlight=histogramdd#torch.histogramdd)
+
+[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.histogramdd.html#numpy-histogramdd)
\ No newline at end of file

From d3770937b6c3e833ed464e83255d57f8ce54ac75 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Wed, 4 Oct 2023 23:40:47 +0800
Subject: [PATCH 14/16] add some detail

---
 .../20231001_api_design_for_histogramdd.md    | 132 ++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
index f1b483367..05bf2b087 100644
--- a/rfcs/APIs/20231001_api_design_for_histogramdd.md
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -90,6 +90,138 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional
}
```

+
+
+```cpp
+template<typename input_t, BIN_SELECTION_ALGORITHM algorithm>
+void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges,
+    const Tensor& input, const c10::optional<Tensor>& weight) {
+  TORCH_INTERNAL_ASSERT(input.dim() == 2);
+
+  const int64_t N = input.size(0);
+  if (weight.has_value()) {
+    TORCH_INTERNAL_ASSERT(weight.value().dim() == 1 && weight.value().numel() == N);
+  }
+
+  const int64_t D = input.size(1);
+  TORCH_INTERNAL_ASSERT(int64_t(bin_edges.size()) == D);
+  for (const auto dim : c10::irange(D)) {
+    TORCH_INTERNAL_ASSERT(bin_edges[dim].is_contiguous());
+    TORCH_INTERNAL_ASSERT(hist.size(dim) + 1 == bin_edges[dim].numel());
+  }
+
+  if (D == 0) {
+    // hist is an empty tensor in this case; nothing to do here
+    return;
+  }
+
+  TensorAccessor<const input_t, 2> accessor_in = input.accessor<const input_t, 2>();
+
+  /* Constructs a c10::optional containing an accessor iff
+   * the optional weight tensor has a value.
+   */
+  const auto accessor_wt = weight.has_value()
+      ? c10::optional<TensorAccessor<const input_t, 1>>(weight.value().accessor<const input_t, 1>())
+      : c10::optional<TensorAccessor<const input_t, 1>>();
+
+  std::vector<input_t*> bin_seq(D);
+  std::vector<int64_t> num_bin_edges(D);
+  std::vector<input_t> leftmost_edge(D), rightmost_edge(D);
+
+  for (const auto dim : c10::irange(D)) {
+    bin_seq[dim] = bin_edges[dim].data_ptr<input_t>();
+    num_bin_edges[dim] = bin_edges[dim].numel();
+    leftmost_edge[dim] = bin_seq[dim][0];
+    rightmost_edge[dim] = bin_seq[dim][num_bin_edges[dim] - 1];
+  }
+
+  int64_t GRAIN_SIZE = std::max(int64_t(1), HISTOGRAM_GRAIN_SIZE / D);
+
+  /* Parallelizes processing of input using at::parallel_for.
+   * Each thread accumulates a local result into their own slice of
+   * thread_histograms which get summed together at the end.
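+   * (注:本行为本文补充说明——各线程只写入自己的局部直方图切片,避免对共享输出
+   * 的原子操作;所有线程结束后再由 at::sum_out 沿第 0 维求和归并。)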
+   */
+  const auto num_threads = at::get_num_threads();
+  const auto hist_sizes = hist.sizes();
+  DimVector thread_hist_sizes(hist_sizes.size() + 1);
+  thread_hist_sizes[0] = num_threads;
+  std::copy(hist_sizes.begin(), hist_sizes.end(),
+      thread_hist_sizes.begin() + 1);
+  Tensor thread_histograms = at::zeros(thread_hist_sizes, hist.dtype());
+  TORCH_INTERNAL_ASSERT(thread_histograms.is_contiguous());
+
+  at::parallel_for(0, N, GRAIN_SIZE, [&](int64_t start, int64_t end) {
+    const auto tid = at::get_thread_num();
+    auto hist_strides = thread_histograms.strides();
+    input_t *hist_local_data = thread_histograms.data_ptr<input_t>();
+
+    // View only this thread's local results
+    hist_local_data += hist_strides[0] * tid;
+    hist_strides = hist_strides.slice(1);
+
+    for (const auto i : c10::irange(start, end)) {
+      bool skip_elt = false;
+      int64_t hist_index = 0;
+
+      for (const auto dim : c10::irange(D)) {
+        const input_t elt = accessor_in[i][dim];
+
+        // Skips elements which fall outside the specified bins and NaN elements
+        if (!(elt >= leftmost_edge[dim] && elt <= rightmost_edge[dim])) {
+          skip_elt = true;
+          break;
+        }
+
+        int64_t pos = -1;
+
+        if (algorithm == BINARY_SEARCH) {
+          // Handles the general case via binary search on the bin edges.
+          pos = std::upper_bound(bin_seq[dim], bin_seq[dim] + num_bin_edges[dim], elt)
+              - bin_seq[dim] - 1;
+        } else if (algorithm == LINEAR_INTERPOLATION
+            || algorithm == LINEAR_INTERPOLATION_WITH_LOCAL_SEARCH) {
+          /* When bin_edges is known to be a linear progression, maps elt to
+           * the appropriate bin via simple division.
+           */
+          pos = static_cast<int64_t>((elt - leftmost_edge[dim])
+              * (num_bin_edges[dim] - 1)
+              / (rightmost_edge[dim] - leftmost_edge[dim]));
+
+          /* Ensures consistency with bin_edges by checking the bins to the left and right
+           * of the selected position. Necessary for cases in which an element very close
+           * to a bin edge may be misclassified by simple division.
+           */
+          if (algorithm == LINEAR_INTERPOLATION_WITH_LOCAL_SEARCH) {
+            int64_t pos_min = std::max(static_cast<int64_t>(0), pos - 1);
+            int64_t pos_max = std::min(pos + 2, num_bin_edges[dim]);
+            pos = std::upper_bound(bin_seq[dim] + pos_min, bin_seq[dim] + pos_max, elt)
+                - bin_seq[dim] - 1;
+          }
+        } else {
+          TORCH_INTERNAL_ASSERT(false);
+        }
+
+        // Unlike other bins, the rightmost bin includes its right boundary
+        if (pos == (num_bin_edges[dim] - 1)) {
+          pos -= 1;
+        }
+
+        hist_index += hist_strides[dim] * pos;
+      }
+
+      if (!skip_elt) {
+        // In the unweighted case, the default weight is 1
+        input_t wt = accessor_wt.has_value() ? accessor_wt.value()[i] : static_cast<input_t>(1);
+
+        hist_local_data[hist_index] += wt;
+      }
+    }
+  });
+
+  at::sum_out(hist, thread_histograms, /*dim=*/{0});
+}
+```

From 1e2562f3acd4acd360f0e69d696b7a14aa6b8c83 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Sat, 7 Oct 2023 18:49:49 +0800
Subject: [PATCH 15/16] add test path, fix args desc

---
 rfcs/APIs/20231001_api_design_for_histogramdd.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
index 05bf2b087..29507448e 100644
--- a/rfcs/APIs/20231001_api_design_for_histogramdd.md
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -381,14 +381,16 @@ PyTorch底层用cpp实现kernel,Numpy通过API在Python层直接实现。

API的设计为:

-- paddle.histogramdd(sample, bins, range=None, density=None, weights=None, name=None)
+- paddle.histogramdd(sample, bins, range=None, density=False, weights=None, name=None)

其中

+ sample(Tensor) - 输入的多维 tensor
+ bins(Tensor[], int[], int) 若为`Tensor[]`,则定义了bin的边缘序列;若为`int[]`,则每个值分别定义了每个维度的等宽bin的数量;若为`int`,则定义了所有维度的等宽bin的数量。
+ range(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
++ density (bool) – 默认为 False,结果将包含每个bin中的计数。如果设置为 True,则每个计数(权重)将除以总计数,然后除以其所在bin的范围宽度。
+ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入sample除去最内部维度的shape相同,例如当sample的shape为[M,N]时,weight的shape必须为[M]。
++ name(str, 可选)- 操作的名称(默认值为None)。

## 底层OP设计
@@ -402,6 +404,8 @@ API的设计为:

# 六、测试和验收的考量

+单测代码位置:Paddle repo 的 test/legacy_test/test_histogramdd_op 目录
+
测试考虑的case如下:

From 04ff24163b4bef0a13e4e24a5f6f6b79fda5c654 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Mon, 20 Nov 2023 16:21:24 +0800
Subject: [PATCH 16/16] modify params in api: sample to x, range to ranges.

---
 rfcs/APIs/20231001_api_design_for_histogramdd.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rfcs/APIs/20231001_api_design_for_histogramdd.md b/rfcs/APIs/20231001_api_design_for_histogramdd.md
index 29507448e..52775ab12 100644
--- a/rfcs/APIs/20231001_api_design_for_histogramdd.md
+++ b/rfcs/APIs/20231001_api_design_for_histogramdd.md
@@ -381,13 +381,13 @@ PyTorch底层用cpp实现kernel,Numpy通过API在Python层直接实现。

API的设计为:

-- paddle.histogramdd(sample, bins, range=None, density=False, weights=None, name=None)
+- paddle.histogramdd(x, bins, ranges=None, density=False, weights=None, name=None)

其中

-+ sample(Tensor) - 输入的多维 tensor
++ x(Tensor) - 输入的多维 tensor
+ bins(Tensor[], int[], int) 若为`Tensor[]`,则定义了bin的边缘序列;若为`int[]`,则每个值分别定义了每个维度的等宽bin的数量;若为`int`,则定义了所有维度的等宽bin的数量。
-+ range(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
++ ranges(sequence of float):规定了bin的最左端和最右端,也就是范围。若为None则以所有输入的最小值和最大值作为边界。
+ density (bool) – 默认为 False,结果将包含每个bin中的计数。如果设置为 True,则每个计数(权重)将除以总计数,然后除以其所在bin的范围宽度。
-+ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入sample除去最内部维度的shape相同,例如当sample的shape为[M,N]时,weight的shape必须为[M]。
++ weight(Tensor): 默认所有输入权重为1,它的shape必须与输入x除去最内部维度的shape相同,例如当x的shape为[M,N]时,weight的shape必须为[M]。
+ name(str, 可选)- 操作的名称(默认值为None)。
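+
+作为补充,下面给出一个基于上述参数设计的调用示意(注:本文补充的假设性示例,返回值形式参照 PyTorch 的 (hist, bin_edges) 约定,具体行为以最终实现为准):
+
+```python
+import paddle
+
+# 3 个二维样本点,统计到 2x2 的直方图网格
+x = paddle.to_tensor([[0., 1.], [1., 0.], [1., 1.]])
+
+# ranges 按维度依次给出 (low, high):[low_0, high_0, low_1, high_1]
+hist, edges = paddle.histogramdd(x, bins=[2, 2], ranges=[0., 2., 0., 2.])
+
+# 预期:hist 的 shape 为 [2, 2];edges 包含 2 个一维 Tensor,
+# 分别为对应维度的 3 个 bin 边界(等宽划分 [0, 1, 2])
+```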