Merge master into gold/2021 (#1335)
* Add dpnp.broadcast_to() function (#1333)

* Add support of bool type in bitwise operations (#1334)

* Add support of bool type in bitwise operations

* Update dpnp/dpnp_algo/dpnp_algo_bitwise.pyx
antonwolfy committed Mar 7, 2023
1 parent b675679 commit 759c6e7
Showing 17 changed files with 602 additions and 215 deletions.
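As context for the first change listed above, a minimal usage sketch of the new function, assuming dpnp is installed with a working SYCL device and that dpnp.broadcast_to() mirrors numpy.broadcast_to semantics:

import dpnp

a = dpnp.array([1, 2, 3])
b = dpnp.broadcast_to(a, (4, 3))   # expand the 1-D array to shape (4, 3)
print(b.shape)                     # (4, 3)
print(b)                           # four identical rows [1 2 3]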
4 changes: 2 additions & 2 deletions dpnp/backend/include/dpnp_gen_2arg_1type_tbl.hpp
@@ -1,5 +1,5 @@
//*****************************************************************************
// Copyright (c) 2016-2020, Intel Corporation
// Copyright (c) 2016-2023, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@@ -104,7 +104,7 @@

#endif

MACRO_2ARG_1TYPE_OP(dpnp_bitwise_and_c, input1_elem& input2_elem)
MACRO_2ARG_1TYPE_OP(dpnp_bitwise_and_c, input1_elem & input2_elem)
MACRO_2ARG_1TYPE_OP(dpnp_bitwise_or_c, input1_elem | input2_elem)
MACRO_2ARG_1TYPE_OP(dpnp_bitwise_xor_c, input1_elem ^ input2_elem)
MACRO_2ARG_1TYPE_OP(dpnp_left_shift_c, input1_elem << input2_elem)
110 changes: 98 additions & 12 deletions dpnp/backend/kernels/dpnp_krnl_bitwise.cpp
@@ -27,6 +27,7 @@

#include "dpnp_fptr.hpp"
#include "dpnp_iface.hpp"
#include "dpnp_iterator.hpp"
#include "dpnp_utils.hpp"
#include "dpnpc_memory_adapter.hpp"
#include "queue_sycl.hpp"
@@ -49,27 +49,66 @@ DPCTLSyclEventRef dpnp_invert_c(DPCTLSyclQueueRef q_ref,
sycl::queue q = *(reinterpret_cast<sycl::queue*>(q_ref));
sycl::event event;

DPNPC_ptr_adapter<_DataType> input1_ptr(q_ref, array1_in, size);
_DataType* array1 = input1_ptr.get_ptr();
_DataType* result = reinterpret_cast<_DataType*>(result1);
_DataType* input_data = static_cast<_DataType*>(array1_in);
_DataType* result = static_cast<_DataType*>(result1);

sycl::range<1> gws(size);
auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {
size_t i = global_id[0]; /*for (size_t i = 0; i < size; ++i)*/
constexpr size_t lws = 64;
constexpr unsigned int vec_sz = 8;

auto gws_range = sycl::range<1>(((size + lws * vec_sz - 1) / (lws * vec_sz)) * lws);
auto lws_range = sycl::range<1>(lws);

auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
auto sg = nd_it.get_sub_group();
const auto max_sg_size = sg.get_max_local_range()[0];
const size_t start =
vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + sg.get_group_id()[0] * max_sg_size);

if (start + static_cast<size_t>(vec_sz) * max_sg_size < size)
{
_DataType input_elem1 = array1[i];
result[i] = ~input_elem1;
using multi_ptrT = sycl::multi_ptr<_DataType, sycl::access::address_space::global_space>;

sycl::vec<_DataType, vec_sz> x = sg.load<vec_sz>(multi_ptrT(&input_data[start]));
sycl::vec<_DataType, vec_sz> res_vec;

if constexpr (std::is_same_v<_DataType, bool>)
{
#pragma unroll
for (size_t k = 0; k < vec_sz; ++k)
{
res_vec[k] = !(x[k]);
}
}
else
{
res_vec = ~x;
}

sg.store<vec_sz>(multi_ptrT(&result[start]), res_vec);
}
else
{
for (size_t k = start + sg.get_local_id()[0]; k < size; k += max_sg_size)
{
if constexpr (std::is_same_v<_DataType, bool>)
{
result[k] = !(input_data[k]);
}
else
{
result[k] = ~(input_data[k]);
}
}
}
};

auto kernel_func = [&](sycl::handler& cgh) {
cgh.parallel_for<class dpnp_invert_c_kernel<_DataType>>(gws, kernel_parallel_for_func);
cgh.parallel_for<class dpnp_invert_c_kernel<_DataType>>(sycl::nd_range<1>(gws_range, lws_range),
kernel_parallel_for_func);
};

event = q.submit(kernel_func);

event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);

return DPCTLEvent_Copy(event_ref);
}
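A quick sanity check of the launch-size arithmetic used in the reworked dpnp_invert_c kernel above, written as a Python sketch using the same integer math as the C++ code (lws = 64 work-items per group, vec_sz = 8 elements per work-item, so each work-group covers 512 elements):

def invert_launch_sizes(size, lws=64, vec_sz=8):
    elems_per_group = lws * vec_sz                                 # 512 elements per work-group
    num_groups = (size + elems_per_group - 1) // elems_per_group   # ceiling division, as in gws_range
    return num_groups * lws, lws                                   # (global work size, local work size)

print(invert_launch_sizes(512))    # (64, 64)  -> exactly one full work-group
print(invert_launch_sizes(1000))   # (128, 64) -> two work-groups; the ragged tail falls through to the scalar else-branch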

@@ -84,6 +124,7 @@ void dpnp_invert_c(void* array1_in, void* result1, size_t size)
size,
dep_event_vec_ref);
DPCTLEvent_WaitAndThrow(event_ref);
DPCTLEvent_Delete(event_ref);
}

template <typename _DataType>
@@ -98,9 +139,11 @@ DPCTLSyclEventRef (*dpnp_invert_ext_c)(DPCTLSyclQueueRef,

static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
{
fmap[DPNPFuncName::DPNP_FN_INVERT][eft_BLN][eft_BLN] = {eft_BLN, (void*)dpnp_invert_default_c<bool>};
fmap[DPNPFuncName::DPNP_FN_INVERT][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_invert_default_c<int32_t>};
fmap[DPNPFuncName::DPNP_FN_INVERT][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_invert_default_c<int64_t>};

fmap[DPNPFuncName::DPNP_FN_INVERT_EXT][eft_BLN][eft_BLN] = {eft_BLN, (void*)dpnp_invert_ext_c<bool>};
fmap[DPNPFuncName::DPNP_FN_INVERT_EXT][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_invert_ext_c<int32_t>};
fmap[DPNPFuncName::DPNP_FN_INVERT_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_invert_ext_c<int64_t>};

@@ -114,6 +157,9 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
template <typename _KernelNameSpecialization> \
class __name__##_strides_kernel; \
\
template <typename _KernelNameSpecialization> \
class __name__##_broadcast_kernel; \
\
template <typename _DataType> \
DPCTLSyclEventRef __name__(DPCTLSyclQueueRef q_ref, \
void* result_out, \
@@ -152,6 +198,8 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
_DataType* input2_data = static_cast<_DataType*>(const_cast<void*>(input2_in)); \
_DataType* result = static_cast<_DataType*>(result_out); \
\
bool use_broadcasting = !array_equal(input1_shape, input1_ndim, input2_shape, input2_ndim); \
\
shape_elem_type* input1_shape_offsets = new shape_elem_type[input1_ndim]; \
\
get_shape_offsets_inkernel(input1_shape, input1_ndim, input1_shape_offsets); \
@@ -167,7 +215,42 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
sycl::event event; \
sycl::range<1> gws(result_size); \
\
if (use_strides) \
if (use_broadcasting) \
{ \
DPNPC_id<_DataType>* input1_it; \
const size_t input1_it_size_in_bytes = sizeof(DPNPC_id<_DataType>); \
input1_it = reinterpret_cast<DPNPC_id<_DataType>*>(dpnp_memory_alloc_c(q_ref, input1_it_size_in_bytes)); \
new (input1_it) DPNPC_id<_DataType>(q_ref, input1_data, input1_shape, input1_strides, input1_ndim); \
\
input1_it->broadcast_to_shape(result_shape, result_ndim); \
\
DPNPC_id<_DataType>* input2_it; \
const size_t input2_it_size_in_bytes = sizeof(DPNPC_id<_DataType>); \
input2_it = reinterpret_cast<DPNPC_id<_DataType>*>(dpnp_memory_alloc_c(q_ref, input2_it_size_in_bytes)); \
new (input2_it) DPNPC_id<_DataType>(q_ref, input2_data, input2_shape, input2_strides, input2_ndim); \
\
input2_it->broadcast_to_shape(result_shape, result_ndim); \
\
auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \
const size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ \
{ \
const _DataType input1_elem = (*input1_it)[i]; \
const _DataType input2_elem = (*input2_it)[i]; \
result[i] = __operation__; \
} \
}; \
auto kernel_func = [&](sycl::handler& cgh) { \
cgh.parallel_for<class __name__##_broadcast_kernel<_DataType>>(gws, kernel_parallel_for_func); \
}; \
\
q.submit(kernel_func).wait(); \
\
input1_it->~DPNPC_id(); \
input2_it->~DPNPC_id(); \
\
return event_ref; \
} \
else if (use_strides) \
{ \
if ((result_ndim != input1_ndim) || (result_ndim != input2_ndim)) \
{ \
@@ -332,18 +415,21 @@ static void func_map_init_bitwise_2arg_1type(func_map_t& fmap)
fmap[DPNPFuncName::DPNP_FN_BITWISE_AND][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_bitwise_and_c_default<int32_t>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_AND][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_bitwise_and_c_default<int64_t>};

fmap[DPNPFuncName::DPNP_FN_BITWISE_AND_EXT][eft_BLN][eft_BLN] = {eft_BLN, (void*)dpnp_bitwise_and_c_ext<bool>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_AND_EXT][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_bitwise_and_c_ext<int32_t>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_AND_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_bitwise_and_c_ext<int64_t>};

fmap[DPNPFuncName::DPNP_FN_BITWISE_OR][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_bitwise_or_c_default<int32_t>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_OR][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_bitwise_or_c_default<int64_t>};

fmap[DPNPFuncName::DPNP_FN_BITWISE_OR_EXT][eft_BLN][eft_BLN] = {eft_BLN, (void*)dpnp_bitwise_or_c_ext<bool>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_OR_EXT][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_bitwise_or_c_ext<int32_t>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_OR_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_bitwise_or_c_ext<int64_t>};

fmap[DPNPFuncName::DPNP_FN_BITWISE_XOR][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_bitwise_xor_c_default<int32_t>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_XOR][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_bitwise_xor_c_default<int64_t>};

fmap[DPNPFuncName::DPNP_FN_BITWISE_XOR_EXT][eft_BLN][eft_BLN] = {eft_BLN, (void*)dpnp_bitwise_xor_c_ext<bool>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_XOR_EXT][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_bitwise_xor_c_ext<int32_t>};
fmap[DPNPFuncName::DPNP_FN_BITWISE_XOR_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_bitwise_xor_c_ext<int64_t>};

6 changes: 3 additions & 3 deletions dpnp/dpnp_algo/dpnp_algo_bitwise.pyx
@@ -1,7 +1,7 @@
# cython: language_level=3
# -*- coding: utf-8 -*-
# *****************************************************************************
# Copyright (c) 2016-2020, Intel Corporation
# Copyright (c) 2016-2023, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -68,8 +68,8 @@ cpdef utils.dpnp_descriptor dpnp_bitwise_xor(utils.dpnp_descriptor x1_obj,
return call_fptr_2in_1out_strides(DPNP_FN_BITWISE_XOR_EXT, x1_obj, x2_obj, dtype=dtype, out=out, where=where)


cpdef utils.dpnp_descriptor dpnp_invert(utils.dpnp_descriptor arr):
return call_fptr_1in_1out(DPNP_FN_INVERT_EXT, arr, arr.shape)
cpdef utils.dpnp_descriptor dpnp_invert(utils.dpnp_descriptor arr, utils.dpnp_descriptor out=None):
return call_fptr_1in_1out(DPNP_FN_INVERT_EXT, arr, arr.shape, out=out, func_name="invert")


cpdef utils.dpnp_descriptor dpnp_left_shift(utils.dpnp_descriptor x1_obj,
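With the extra out parameter on dpnp_invert, a call that writes into a preallocated array becomes possible; a sketch under the assumption that the public dpnp.invert forwards its out argument to this routine (the corresponding dpnp_iface_bitwise change is not shown in this excerpt):

import dpnp

a = dpnp.array([0, 1, 2], dtype=dpnp.int64)
out = dpnp.empty_like(a)
dpnp.invert(a, out=out)   # assumed to dispatch to dpnp_invert(arr, out) above
print(out)                # [-1 -2 -3]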
69 changes: 52 additions & 17 deletions dpnp/dpnp_array.py
@@ -125,7 +125,9 @@ def __abs__(self):
def __add__(self, other):
return dpnp.add(self, other)

# '__and__',
def __and__(self, other):
return dpnp.bitwise_and(self, other)

# '__array__',
# '__array_finalize__',
# '__array_function__',
@@ -193,9 +195,17 @@ def __gt__(self, other):

# '__hash__',
# '__iadd__',
# '__iand__',

def __iand__(self, other):
dpnp.bitwise_and(self, other, out=self)
return self

# '__ifloordiv__',
# '__ilshift__',

def __ilshift__(self, other):
dpnp.left_shift(self, other, out=self)
return self

# '__imatmul__',
# '__imod__',
# '__imul__',
@@ -209,18 +219,28 @@ def __index__(self):
def __int__(self):
return self._array_obj.__int__()

# '__invert__',
# '__ior__',
def __invert__(self):
return dpnp.invert(self)

def __ior__(self, other):
dpnp.bitwise_or(self, other, out=self)
return self

def __ipow__(self, other):
dpnp.power(self, other, out=self)
return self

# '__irshift__',
def __irshift__(self, other):
dpnp.right_shift(self, other, out=self)
return self

# '__isub__',
# '__iter__',
# '__itruediv__',
# '__ixor__',

def __ixor__(self, other):
dpnp.bitwise_xor(self, other, out=self)
return self

def __le__(self, other):
return dpnp.less_equal(self, other)
@@ -232,7 +252,8 @@ def __len__(self):

return self._array_obj.__len__()

# '__lshift__',
def __lshift__(self, other):
return dpnp.left_shift(self, other)

def __lt__(self, other):
return dpnp.less(self, other)
@@ -253,7 +274,10 @@ def __neg__(self):
return dpnp.negative(self)

# '__new__',
# '__or__',

def __or__(self, other):
return dpnp.bitwise_or(self, other)

# '__pos__',

def __pow__(self, other):
@@ -262,7 +286,9 @@ def __pow__(self, other):
def __radd__(self, other):
return dpnp.add(other, self)

# '__rand__',
def __rand__(self, other):
return dpnp.bitwise_and(other, self)

# '__rdivmod__',
# '__reduce__',
# '__reduce_ex__',
@@ -271,7 +297,9 @@ def __repr__(self):
return dpt.usm_ndarray_repr(self._array_obj, prefix="array")

# '__rfloordiv__',
# '__rlshift__',

def __rlshift__(self, other):
return dpnp.left_shift(other, self)

def __rmatmul__(self, other):
return dpnp.matmul(other, self)
@@ -282,21 +310,27 @@ def __rmod__(self, other):
def __rmul__(self, other):
return dpnp.multiply(other, self)

# '__ror__',

def __ror__(self, other):
return dpnp.bitwise_or(other, self)

def __rpow__(self, other):
return dpnp.power(other, self)

# '__rrshift__',
# '__rshift__',
def __rrshift__(self, other):
return dpnp.right_shift(other, self)

def __rshift__(self, other):
return dpnp.right_shift(self, other)

def __rsub__(self, other):
return dpnp.subtract(other, self)

def __rtruediv__(self, other):
return dpnp.true_divide(other, self)

# '__rxor__',
def __rxor__(self, other):
return dpnp.bitwise_xor(other, self)

# '__setattr__',

def __setitem__(self, key, val):
@@ -334,7 +368,8 @@ def __sub__(self, other):
def __truediv__(self, other):
return dpnp.true_divide(self, other)

# '__xor__',
def __xor__(self, other):
return dpnp.bitwise_xor(self, other)

@staticmethod
def _create_from_usm_ndarray(usm_ary : dpt.usm_ndarray):
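The newly wired dunder methods let dpnp_array participate in Python's bitwise operator protocol, including the reflected and in-place forms; a small sketch (assuming a working dpnp/SYCL setup):

import dpnp

a = dpnp.array([0b0011, 0b0101], dtype=dpnp.int32)
b = dpnp.array([0b0110, 0b0100], dtype=dpnp.int32)

print(a & b)     # __and__     -> dpnp.bitwise_and(a, b)
print(a | b)     # __or__      -> dpnp.bitwise_or(a, b)
print(a ^ b)     # __xor__     -> dpnp.bitwise_xor(a, b)
print(~a)        # __invert__  -> dpnp.invert(a)
print(a << 1)    # __lshift__  -> dpnp.left_shift(a, 1)
print(2 >> a)    # __rrshift__ -> dpnp.right_shift(2, a)

a &= b           # __iand__ writes the result back through out=a
a <<= 1          # __ilshift__
print(a)         # [4 8]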
3 changes: 2 additions & 1 deletion dpnp/dpnp_iface.py
@@ -251,6 +251,7 @@ def from_dlpack(obj, /):
def get_dpnp_descriptor(ext_obj,
copy_when_strides=True,
copy_when_nondefault_queue=True,
alloc_dtype=None,
alloc_usm_type=None,
alloc_queue=None):
"""
@@ -274,7 +275,7 @@ def get_dpnp_descriptor(ext_obj,
# If input object is a scalar, it means it was allocated on host memory.
# We need to copy it to USM memory according to compute follows data paradigm.
if isscalar(ext_obj):
ext_obj = array(ext_obj, usm_type=alloc_usm_type, sycl_queue=alloc_queue)
ext_obj = array(ext_obj, dtype=alloc_dtype, usm_type=alloc_usm_type, sycl_queue=alloc_queue)

# while dpnp functions have no implementation with strides support
# we need to create a non-strided copy
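The new alloc_dtype argument lets a host scalar be copied into USM memory with a caller-chosen dtype rather than whatever array() would infer, which matters when a scalar operand is mixed with a typed array under compute follows data; an illustrative sketch:

import dpnp

a = dpnp.array([1, 2, 4], dtype=dpnp.int32)
# The Python scalar operand is allocated on a's queue; with alloc_dtype the
# backend can request it as int32 up front instead of relying on default dtype
# inference.
print(dpnp.bitwise_and(a, 1))   # [1 0 0]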
