Merge pull request #1463 from IntelPython/optimize-small-size-tree-reduction

Improve reduction performance when the number of elements to reduce is small, for data types that require tree reduction
oleksandr-pavlyk authored Nov 4, 2023
2 parents 11ecba8 + d4d4992 commit 9018745
Showing 12 changed files with 921 additions and 188 deletions.
68 changes: 47 additions & 21 deletions dpctl/tensor/CMakeLists.txt
@@ -113,10 +113,13 @@ set(_reduction_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp
)
set(_boolean_reduction_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp
)
set(_tensor_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
@@ -128,19 +131,39 @@ set(_tensor_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
)
list(APPEND _tensor_impl_sources
${_elementwise_sources}
${_reduction_sources}
set(_tensor_elementwise_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
${_elementwise_sources}
)
set(_tensor_reductions_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
${_boolean_reduction_sources}
${_reduction_sources}
)

set(_py_trgts)

set(python_module_name _tensor_impl)
pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
list(APPEND _py_trgts ${python_module_name})

set(python_module_name _tensor_elementwise_impl)
pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources})
list(APPEND _py_trgts ${python_module_name})

set(python_module_name _tensor_reductions_impl)
pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources})
list(APPEND _py_trgts ${python_module_name})

set(_clang_prefix "")
if (WIN32)
set(_clang_prefix "/clang:")
@@ -170,19 +193,22 @@ if (UNIX)
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
endif()
target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
if(UNIX)
# this option is supported on Linux only
target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
endif()
target_include_directories(${python_module_name}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
)

set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
target_link_options(${python_module_name} PRIVATE ${_linker_options})
add_dependencies(${python_module_name} _dpctl4pybind11_deps)
install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
foreach(python_module_name ${_py_trgts})
target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
if(UNIX)
# this option is supported on Linux only
target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
endif()
target_include_directories(${python_module_name}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
)
target_link_options(${python_module_name} PRIVATE ${_linker_options})
add_dependencies(${python_module_name} _dpctl4pybind11_deps)
install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
endforeach()
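The net effect of this CMake change is that the former single `_tensor_impl` extension is split into three pybind11 modules, each installed into `dpctl/tensor` and configured by the `foreach` loop above. A minimal sketch of how the Python layer picks up the split modules, using the aliases introduced elsewhere in this diff (`ti`, `tei`, `tri`); only the module names come from the diff, the comments are illustrative:

# Sketch only: module names are taken from this diff, everything else is illustrative.
import dpctl.tensor._tensor_impl as ti                # constructors, copies, clip kernels, ...
import dpctl.tensor._tensor_elementwise_impl as tei   # elementwise kernels, e.g. tei._maximum
import dpctl.tensor._tensor_reductions_impl as tri    # reduction kernels, e.g. tri._sum_over_axis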
5 changes: 3 additions & 2 deletions dpctl/tensor/_clip.py
@@ -16,6 +16,7 @@

import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.tensor._tensor_impl as ti
from dpctl.tensor._copy_utils import (
_empty_like_orderK,
@@ -429,9 +430,9 @@ def clip(x, min=None, max=None, out=None, order="K"):
"only one of `min` and `max` is permitted to be `None`"
)
elif max is None:
return _clip_none(x, min, out, order, ti._maximum)
return _clip_none(x, min, out, order, tei._maximum)
elif min is None:
return _clip_none(x, max, out, order, ti._minimum)
return _clip_none(x, max, out, order, tei._minimum)
else:
q1, x_usm_type = x.sycl_queue, x.usm_type
q2, min_usm_type = _get_queue_usm_type(min)
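For context, a short usage sketch of the `clip` paths touched above (array values are illustrative); passing only one bound takes the `_clip_none` branch that now dispatches to `tei._maximum` or `tei._minimum`:

import dpctl.tensor as dpt

x = dpt.asarray([-2.0, 0.5, 3.0])
both = dpt.clip(x, min=0.0, max=1.0)  # clamp to [0, 1]
lo_only = dpt.clip(x, min=0.0)        # max is None -> _clip_none(..., tei._maximum)
hi_only = dpt.clip(x, max=1.0)        # min is None -> _clip_none(..., tei._minimum)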
150 changes: 150 additions & 0 deletions dpctl/tensor/_elementwise_common.py
@@ -39,6 +39,31 @@
class UnaryElementwiseFunc:
"""
Class that implements unary element-wise functions.
Args:
name (str):
Name of the unary function
result_type_resolver_fn (callable):
Function that takes the dtype of the input and
returns the dtype of the result if the
implementation function supports it, or
returns `None` otherwise.
unary_dp_impl_fn (callable):
Data-parallel implementation function with signature
`impl_fn(src: usm_ndarray, dst: usm_ndarray,
sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
where the `src` is the argument array, `dst` is the
array to be populated with function values, effectively
evaluating `dst = func(src)`.
The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
The first event corresponds to data-management host tasks,
including lifetime management of argument Python objects to ensure
that their associated USM allocation is not freed before offloaded
computational tasks complete execution, while the second event
corresponds to computational tasks associated with function
evaluation.
docs (str):
Documentation string for the unary function.
"""

def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs):
@@ -55,8 +80,31 @@ def __str__(self):
def __repr__(self):
return f"<{self.__name__} '{self.name_}'>"

def get_implementation_function(self):
"""Returns the implementation function for
this elementwise unary function.
"""
return self.unary_fn_

def get_type_result_resolver_function(self):
"""Returns the type resolver function for this
elementwise unary function.
"""
return self.result_type_resolver_fn_

@property
def types(self):
"""Returns information about types supported by
implementation function, using NumPy's character
encoding for data types, e.g.
:Example:
.. code-block:: python
dpctl.tensor.sin.types
# Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
"""
types = self.types_
if not types:
types = []
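To make the accessor and the documented signature concrete, here is a minimal sketch (not part of the diff) of driving a unary implementation function directly; it assumes the positional argument order `(src, dst, sycl_queue, depends)` given in the docstring and that both returned events may simply be waited on:

import dpctl.tensor as dpt

x = dpt.ones(1024, dtype="f4")
r = dpt.empty_like(x)
impl_fn = dpt.sin.get_implementation_function()
ht_ev, comp_ev = impl_fn(x, r, x.sycl_queue, [])  # (src, dst, sycl_queue, depends)
comp_ev.wait()  # computation has finished
ht_ev.wait()    # host tasks keeping x and r alive have completed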
@@ -363,6 +411,56 @@ def _get_shape(o):
class BinaryElementwiseFunc:
"""
Class that implements binary element-wise functions.
Args:
name (str):
Name of the binary function
result_type_resolver_fn (callable):
Function that takes the dtypes of the inputs and
returns the dtype of the result if the
implementation function supports it, or
returns `None` otherwise.
binary_dp_impl_fn (callable):
Data-parallel implementation function with signature
`impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
where the `src1` and `src2` are the argument arrays, `dst` is the
array to be populated with function values,
i.e. `dst=func(src1, src2)`.
The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
The first event corresponds to data-management host tasks,
including lifetime management of argument Python objects to ensure
that their associated USM allocation is not freed before offloaded
computational tasks complete execution, while the second event
corresponds to computational tasks associated with function
evaluation.
docs (str):
Documentation string for the binary function.
binary_inplace_fn (callable, optional):
Data-parallel implementation function with signature
`impl_fn(src: usm_ndarray, dst: usm_ndarray,
sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
where the `src` is the argument array, `dst` is the
array to be populated with function values,
i.e. `dst=func(dst, src)`.
The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
The first event corresponds to data-management host tasks,
including async lifetime management of Python arguments,
while the second event corresponds to computational tasks
associated with function evaluation.
acceptance_fn (callable, optional):
Function to influence type promotion behavior of this binary
function. The function takes 6 arguments:
arg1_dtype - Data type of the first argument
arg2_dtype - Data type of the second argument
ret_buf1_dtype - Data type the first argument would be cast to
ret_buf2_dtype - Data type the second argument would be cast to
res_dtype - Data type of the output array with function values
sycl_dev - The :class:`dpctl.SyclDevice` where the function
evaluation is carried out.
The function is only called when both arguments of the binary
function require casting, e.g. both arguments of
`dpctl.tensor.logaddexp` are arrays with integral data type.
"""

def __init__(
Expand Down Expand Up @@ -392,8 +490,60 @@ def __str__(self):
def __repr__(self):
return f"<{self.__name__} '{self.name_}'>"

def get_implementation_function(self):
"""Returns the out-of-place implementation
function for this elementwise binary function.
"""
return self.binary_fn_

def get_implementation_inplace_function(self):
"""Returns the in-place implementation
function for this elementwise binary function.
"""
return self.binary_inplace_fn_

def get_type_result_resolver_function(self):
"""Returns the type resolver function for this
elementwise binary function.
"""
return self.result_type_resolver_fn_

def get_type_promotion_path_acceptance_function(self):
"""Returns the acceptance function for this
elementwise binary function.
Acceptance function influences the type promotion
behavior of this binary function.
The function takes 6 arguments:
arg1_dtype - Data type of the first argument
arg2_dtype - Data type of the second argument
ret_buf1_dtype - Data type the first argument would be cast to
ret_buf2_dtype - Data type the second argument would be cast to
res_dtype - Data type of the output array with function values
sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation
is carried out.
The acceptance function is only invoked if both input arrays must be
cast to intermediary data types, as would happen during call of
`dpctl.tensor.hypot` with both arrays being of integral data type.
"""
return self.acceptance_fn_
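As an illustration of the 6-argument shape described above, a hypothetical acceptance function might look as follows; the boolean return convention (return `True` to accept the promotion path) is an assumption modeled on `_acceptance_fn_divide` imported elsewhere in this change set, not something stated in the docstring:

# Hypothetical acceptance function; the bool return convention is assumed.
def _acceptance_fn_example(
    arg1_dtype, arg2_dtype, ret_buf1_dtype, ret_buf2_dtype, res_dtype, sycl_dev
):
    # Accept the promotion path only when the result is floating-point or complex,
    # i.e. veto purely integral intermediate casts for this hypothetical function.
    return res_dtype.kind in "fc"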

@property
def types(self):
"""Returns information about types supported by
implementation function, using NumPy's character
encoding for data types, e.g.
:Example:
.. code-block:: python
dpctl.tensor.divide.types
# Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D',
# 'Ff->F', 'FF->F', 'Dd->D', 'DD->D']
"""
types = self.types_
if not types:
types = []
2 changes: 1 addition & 1 deletion dpctl/tensor/_elementwise_funcs.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import dpctl.tensor._tensor_impl as ti
import dpctl.tensor._tensor_elementwise_impl as ti

from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc
from ._type_utils import _acceptance_fn_divide
25 changes: 13 additions & 12 deletions dpctl/tensor/_reduction.py
@@ -19,6 +19,7 @@
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.tensor._tensor_reductions_impl as tri

from ._type_utils import _to_device_supported_dtype

@@ -220,8 +221,8 @@ def sum(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._sum_over_axis,
ti._sum_over_axis_dtype_supported,
tri._sum_over_axis,
tri._sum_over_axis_dtype_supported,
_default_reduction_dtype,
_identity=0,
)
@@ -281,8 +282,8 @@ def prod(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._prod_over_axis,
ti._prod_over_axis_dtype_supported,
tri._prod_over_axis,
tri._prod_over_axis_dtype_supported,
_default_reduction_dtype,
_identity=1,
)
@@ -335,8 +336,8 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._logsumexp_over_axis,
lambda inp_dt, res_dt, *_: ti._logsumexp_over_axis_dtype_supported(
tri._logsumexp_over_axis,
lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported(
inp_dt, res_dt
),
_default_reduction_dtype_fp_types,
@@ -391,8 +392,8 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._hypot_over_axis,
lambda inp_dt, res_dt, *_: ti._hypot_over_axis_dtype_supported(
tri._hypot_over_axis,
lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported(
inp_dt, res_dt
),
_default_reduction_dtype_fp_types,
@@ -468,7 +469,7 @@ def max(x, axis=None, keepdims=False):
entire array, a zero-dimensional array is returned. The returned
array has the same data type as `x`.
"""
return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis)
return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis)


def min(x, axis=None, keepdims=False):
@@ -496,7 +497,7 @@ def min(x, axis=None, keepdims=False):
entire array, a zero-dimensional array is returned. The returned
array has the same data type as `x`.
"""
return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis)
return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis)


def _search_over_axis(x, axis, keepdims, _reduction_fn):
@@ -577,7 +578,7 @@ def argmax(x, axis=None, keepdims=False):
zero-dimensional array is returned. The returned array has the
default array index data type for the device of `x`.
"""
return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis)
return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis)


def argmin(x, axis=None, keepdims=False):
@@ -609,4 +610,4 @@ def argmin(x, axis=None, keepdims=False):
zero-dimensional array is returned. The returned array has the
default array index data type for the device of `x`.
"""
return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis)
return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis)
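For reference, a short usage sketch of the public reduction entry points rerouted above to the new `_tensor_reductions_impl` module; these are the calls whose small-input behavior this PR's kernels optimize (array contents are illustrative):

import dpctl.tensor as dpt

x = dpt.reshape(dpt.arange(12, dtype="f4"), (3, 4))
s = dpt.sum(x, axis=1)                  # tri._sum_over_axis
p = dpt.prod(x, axis=0, keepdims=True)  # tri._prod_over_axis
lse = dpt.logsumexp(x)                  # tri._logsumexp_over_axis
h = dpt.reduce_hypot(x, axis=1)         # tri._hypot_over_axis
i = dpt.argmax(x, axis=1)               # tri._argmax_over_axis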