Implement kernels for in-place pow, remainder, and bitwise op…

…erators (#1447) * Implements dedicated __ipow__ kernel * Implements in-place remainder * Implements in-place bitwise_and and bitwise_or * Implements in-place bitwise_xor * Implements in-place bitwise_left_shift and bitwise_right_shift * Adds tests for in-place bitwise elementwise funcs * Added tests for in-place remainder and pow Fixed in-place remainder for devices that do not support 64-bit floating point data types * Test commit splitting up elementwise functions * Added missing includes of common_inplace * Split elementwise functions into two more files and added them to the build * Fix more missing includes * Splits elementwise functions into separate source files * Corrected numbers of elementwise functions * Added missing vector include to elementwise function source files Removed utility include * Remove variable name in function declaration * No need to import init functions into namespace, since they are defined in it Removed "using dpctl::tensor::py_internal::init_abs`, since this imports `init_abs` into the current namespace from `dpctl::tensor::py_internal`, but this namespace is the current namespace and so the import is a no-op. Also added brief docstring for the common init module. * Changed use of "static inline" for utility functions Instead, moved common functions into anonymous namespace as inline, which is C++ way of expressing that multiple definitions of the same function may exist in different C++ translation units, which linker unifies. * Moved inline functions into separate translation units Instead of using inline keyword to allow multiple definitions of the same function in different translation units, introduced elementwise_functions_type_utils.cpp that defines these functions and a header file to use in other translatioon units. This should reduce the binary size of the produced object files and simplify the linker's job reducing the link-time. * Added license header for 2 new files --------- Co-authored-by: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
IntelPython · Oct 25, 2023 · 442e46f · 442e46f
1 parent 386bd8b
commit 442e46f
Show file tree

Hide file tree

Showing 163 changed files with 14,358 additions and 5,201 deletions.
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
@@ -30,6 +30,78 @@ if(WIN32)
     endif()
 endif()
 
+set(_elementwise_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp
+)
 set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
@@ -47,10 +119,12 @@ set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp
 )
+list(APPEND _tensor_impl_sources
+     ${_elementwise_sources}
+)
 
 set(python_module_name _tensor_impl)
 pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
@@ -63,9 +137,11 @@ endif()
 set(_no_fast_math_sources
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp
 )
+list(APPEND _no_fast_math_sources
+     ${_elementwise_sources}
+)
 foreach(_src_fn ${_no_fast_math_sources})
   get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS)
   set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math")
@@ -76,7 +152,8 @@ foreach(_src_fn ${_no_fast_math_sources})
 endforeach()
 if (UNIX)
   set_source_files_properties(
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
     PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
 endif()
 target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)

diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
@@ -297,6 +297,7 @@
     ti._bitwise_and_result_type,
     ti._bitwise_and,
     _bitwise_and_docstring_,
+    binary_inplace_fn=ti._bitwise_and_inplace,
 )
 
 # B04: ===== BITWISE_LEFT_SHIFT    (x1, x2)
@@ -330,6 +331,7 @@
     ti._bitwise_left_shift_result_type,
     ti._bitwise_left_shift,
     _bitwise_left_shift_docstring_,
+    binary_inplace_fn=ti._bitwise_left_shift_inplace,
 )
 
 
@@ -393,6 +395,7 @@
     ti._bitwise_or_result_type,
     ti._bitwise_or,
     _bitwise_or_docstring_,
+    binary_inplace_fn=ti._bitwise_or_inplace,
 )
 
 # B06: ===== BITWISE_RIGHT_SHIFT   (x1, x2)
@@ -425,6 +428,7 @@
     ti._bitwise_right_shift_result_type,
     ti._bitwise_right_shift,
     _bitwise_right_shift_docstring_,
+    binary_inplace_fn=ti._bitwise_right_shift_inplace,
 )
 
 
@@ -459,6 +463,7 @@
     ti._bitwise_xor_result_type,
     ti._bitwise_xor,
     _bitwise_xor_docstring_,
+    binary_inplace_fn=ti._bitwise_xor_inplace,
 )
 
 
@@ -1178,7 +1183,7 @@
     _logical_xor_docstring_,
 )
 
-# B??: ==== MAXIMUM    (x1, x2)
+# B26: ==== MAXIMUM    (x1, x2)
 _maximum_docstring_ = """
 maximum(x1, x2, out=None, order='K')
 
@@ -1208,7 +1213,7 @@
     _maximum_docstring_,
 )
 
-# B??: ==== MINIMUM    (x1, x2)
+# B27: ==== MINIMUM    (x1, x2)
 _minimum_docstring_ = """
 minimum(x1, x2, out=None, order='K')
 
@@ -1266,7 +1271,7 @@
     ti._multiply_result_type,
     ti._multiply,
     _multiply_docstring_,
-    ti._multiply_inplace,
+    binary_inplace_fn=ti._multiply_inplace,
 )
 
 # U25: ==== NEGATIVE    (x)
@@ -1361,10 +1366,14 @@
         the returned array is determined by the Type Promotion Rules.
 """
 pow = BinaryElementwiseFunc(
-    "pow", ti._pow_result_type, ti._pow, _pow_docstring_
+    "pow",
+    ti._pow_result_type,
+    ti._pow,
+    _pow_docstring_,
+    binary_inplace_fn=ti._pow_inplace,
 )
 
-# U??: ==== PROJ        (x)
+# U40: ==== PROJ        (x)
 _proj_docstring = """
 proj(x, out=None, order='K')
 
@@ -1443,7 +1452,11 @@
         the returned array is determined by the Type Promotion Rules.
 """
 remainder = BinaryElementwiseFunc(
-    "remainder", ti._remainder_result_type, ti._remainder, _remainder_docstring_
+    "remainder",
+    ti._remainder_result_type,
+    ti._remainder,
+    _remainder_docstring_,
+    binary_inplace_fn=ti._remainder_inplace,
 )
 
 # U28: ==== ROUND       (x)
@@ -1501,7 +1514,7 @@
     "sign", ti._sign_result_type, ti._sign, _sign_docstring
 )
 
-# ==== SIGNBIT        (x)
+# U41: ==== SIGNBIT        (x)
 _signbit_docstring = """
 signbit(x, out=None, order='K')
 
@@ -1654,7 +1667,7 @@
     ti._subtract_result_type,
     ti._subtract,
     _subtract_docstring_,
-    ti._subtract_inplace,
+    binary_inplace_fn=ti._subtract_inplace,
 )
 
 

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -33,6 +33,7 @@
 #include "utils/type_utils.hpp"
 
 #include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
 #include <pybind11/pybind11.h>
 
 namespace dpctl
@@ -257,6 +258,144 @@ struct BitwiseAndStridedFactory
     }
 };
 
+template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
+{
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    void operator()(resT &res, const argT &in) const
+    {
+        using tu_ns::convert_impl;
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            res = res && in;
+        }
+        else {
+            res &= in;
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (res && in);
+            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            res &= in;
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseAndInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        BitwiseAndInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs>;
+
+template <typename argT, typename resT, typename IndexerT>
+using BitwiseAndInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        BitwiseAndInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class bitwise_and_inplace_contig_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event
+bitwise_and_inplace_contig_impl(sycl::queue &exec_q,
+                                size_t nelems,
+                                const char *arg_p,
+                                py::ssize_t arg_offset,
+                                char *res_p,
+                                py::ssize_t res_offset,
+                                const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, BitwiseAndInplaceContigFunctor,
+        bitwise_and_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
+                                           res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseAndOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class bitwise_and_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_and_inplace_strided_impl(
+    sycl::queue &exec_q,
+    size_t nelems,
+    int nd,
+    const py::ssize_t *shape_and_strides,
+    const char *arg_p,
+    py::ssize_t arg_offset,
+    char *res_p,
+    py::ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, BitwiseAndInplaceStridedFunctor,
+        bitwise_and_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseAndOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
 } // namespace bitwise_and
 } // namespace kernels
 } // namespace tensor

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -35,6 +35,8 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
+#include "kernels/elementwise_functions/common.hpp"
+
 namespace dpctl
 {
 namespace tensor