diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 9c02a325bc..d1de208805 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -113,10 +113,13 @@ set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) +set(_boolean_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp +) set(_tensor_impl_sources - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp @@ -128,19 +131,39 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) -list(APPEND _tensor_impl_sources - ${_elementwise_sources} - ${_reduction_sources} +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_boolean_reduction_sources} + ${_reduction_sources} ) +set(_py_trgts) + set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -170,19 +193,22 @@ if (UNIX) ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES") endif() -target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) -target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) -if(UNIX) - # this option is supported on Linux only - target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) -endif() -target_include_directories(${python_module_name} - PRIVATE - 
${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ -) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") -target_link_options(${python_module_name} PRIVATE ${_linker_options}) -add_dependencies(${python_module_name} _dpctl4pybind11_deps) -install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +foreach(python_module_name ${_py_trgts}) + target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) + if(UNIX) + # this option is supported on Linux only + target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) + endif() + target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + add_dependencies(${python_module_name} _dpctl4pybind11_deps) + install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +endforeach() diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py index 5a3a96933f..eeed87b404 100644 --- a/dpctl/tensor/_clip.py +++ b/dpctl/tensor/_clip.py @@ -16,6 +16,7 @@ import dpctl import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei import dpctl.tensor._tensor_impl as ti from dpctl.tensor._copy_utils import ( _empty_like_orderK, @@ -429,9 +430,9 @@ def clip(x, min=None, max=None, out=None, order="K"): "only one of `min` and `max` is permitted to be `None`" ) elif max is None: - return _clip_none(x, min, out, order, ti._maximum) + return _clip_none(x, min, out, order, tei._maximum) elif min is None: - return _clip_none(x, max, out, order, ti._minimum) + return _clip_none(x, max, out, order, tei._minimum) else: q1, x_usm_type = x.sycl_queue, x.usm_type q2, min_usm_type = _get_queue_usm_type(min) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index baaac078b5..4a0d1c451f 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -39,6 +39,31 @@ class UnaryElementwiseFunc: """ Class that implements unary element-wise functions. + + Args: + name (str): + Name of the unary function + result_type_resolver_fn (callable): + Function that takes dtype of the input and + returns the dtype of the result if the + implementation function supports it, or + returns `None` otherwise. + unary_dp_impl_fn (callable): + Data-parallel implementation function with signature + `impl_fn(src: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src` is the argument array, `dst` is the + array to be populated with function values, effectively + evaluating `dst = func(src)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including lifetime management of argument Python objects to ensure + that their associated USM allocation is not freed before offloaded + computational tasks complete execution, while the second event + corresponds to computational tasks associated with function + evaluation. + docs (str): + Documentation string for the unary function.
""" def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): @@ -55,8 +80,31 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + def get_implementation_function(self): + """Returns the implementation function for + this elementwise unary function. + + """ + return self.unary_fn_ + + def get_type_result_resolver_function(self): + """Returns the type resolver function for this + elementwise unary function. + """ + return self.result_type_resolver_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.sin.types + # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D'] + """ types = self.types_ if not types: types = [] @@ -363,6 +411,56 @@ def _get_shape(o): class BinaryElementwiseFunc: """ Class that implements binary element-wise functions. + + Args: + name (str): + Name of the binary function + result_type_resolver_fn (callable): + Function that takes dtypes of the inputs and + returns the dtype of the result if the + implementation function supports it, or + returns `None` otherwise. + binary_dp_impl_fn (callable): + Data-parallel implementation function with signature + `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src1` and `src2` are the argument arrays, `dst` is the + array to be populated with function values, + i.e. `dst=func(src1, src2)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including lifetime management of argument Python objects to ensure + that their associated USM allocation is not freed before offloaded + computational tasks complete execution, while the second event + corresponds to computational tasks associated with function + evaluation. + docs (str): + Documentation string for the binary function. + binary_inplace_fn (callable, optional): + Data-parallel implementation function with signature + `impl_fn(src: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src` is the argument array, `dst` is the + array to be populated with function values, + i.e. `dst=func(dst, src)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including async lifetime management of Python arguments, + while the second event corresponds to computational tasks + associated with function evaluation. + acceptance_fn (callable, optional): + Function to influence type promotion behavior of this binary + function. The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - The :class:`dpctl.SyclDevice` where the function + evaluation is carried out. + The function is only called when both arguments of the binary + function require casting, e.g. both arguments of + `dpctl.tensor.logaddexp` are arrays with integral data type.
""" def __init__( @@ -392,8 +490,60 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + def get_implementation_function(self): + """Returns the out-of-place implementation + function for this elementwise binary function. + + """ + return self.binary_fn_ + + def get_implementation_inplace_function(self): + """Returns the in-place implementation + function for this elementwise binary function. + + """ + return self.binary_inplace_fn_ + + def get_type_result_resolver_function(self): + """Returns the type resolver function for this + elementwise binary function. + """ + return self.result_type_resolver_fn_ + + def get_type_promotion_path_acceptance_function(self): + """Returns the acceptance function for this + elementwise binary function. + + Acceptance function influences the type promotion + behavior of this binary function. + The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation + is carried out. + + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ types = self.types_ if not types: types = [] diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index aa5ba04b19..9879960999 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_elementwise_impl as ti from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc from ._type_utils import _acceptance_fn_divide diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 0edc9ac12b..0cd302cccc 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -19,6 +19,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri from ._type_utils import _to_device_supported_dtype @@ -220,8 +221,8 @@ def sum(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._sum_over_axis, - ti._sum_over_axis_dtype_supported, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, _default_reduction_dtype, _identity=0, ) @@ -281,8 +282,8 @@ def prod(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._prod_over_axis, - ti._prod_over_axis_dtype_supported, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, _default_reduction_dtype, _identity=1, ) @@ -335,8 +336,8 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._logsumexp_over_axis, - lambda inp_dt, res_dt, *_: ti._logsumexp_over_axis_dtype_supported( + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( inp_dt, res_dt ), _default_reduction_dtype_fp_types, @@ -391,8 +392,8 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._hypot_over_axis, - lambda inp_dt, res_dt, *_: ti._hypot_over_axis_dtype_supported( + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( inp_dt, res_dt ), _default_reduction_dtype_fp_types, @@ -468,7 +469,7 @@ def max(x, axis=None, keepdims=False): entire array, a zero-dimensional array is returned. The returned array has the same data type as `x`. """ - return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) + return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis) def min(x, axis=None, keepdims=False): @@ -496,7 +497,7 @@ def min(x, axis=None, keepdims=False): entire array, a zero-dimensional array is returned. The returned array has the same data type as `x`. """ - return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) + return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis) def _search_over_axis(x, axis, keepdims, _reduction_fn): @@ -577,7 +578,7 @@ def argmax(x, axis=None, keepdims=False): zero-dimensional array is returned. The returned array has the default array index data type for the device of `x`. """ - return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) + return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis) def argmin(x, axis=None, keepdims=False): @@ -609,4 +610,4 @@ def argmin(x, axis=None, keepdims=False): zero-dimensional array is returned. The returned array has the default array index data type for the device of `x`. 
""" - return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) + return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis) diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py index 500c997e8f..69a1a200df 100644 --- a/dpctl/tensor/_utility_functions.py +++ b/dpctl/tensor/_utility_functions.py @@ -3,6 +3,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri def _boolean_reduction(x, axis, keepdims, func): @@ -94,7 +95,7 @@ def all(x, axis=None, keepdims=False): An array with a data type of `bool` containing the results of the logical AND reduction. """ - return _boolean_reduction(x, axis, keepdims, ti._all) + return _boolean_reduction(x, axis, keepdims, tri._all) def any(x, axis=None, keepdims=False): @@ -122,4 +123,4 @@ def any(x, axis=None, keepdims=False): An array with a data type of `bool` containing the results of the logical OR reduction. """ - return _boolean_reduction(x, axis, keepdims, ti._any) + return _boolean_reduction(x, axis, keepdims, tri._any) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 40e5bd282d..6651483c6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -477,11 +477,11 @@ sycl::event reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -619,11 +619,11 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( result_indexer}; ReductionIndexerT reduction_indexer{}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -696,7 +696,41 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - { + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { sycl::event res_init_ev = exec_q.fill( res_tp, resTy(identity_val), iter_nelems, depends); @@ -718,11 +752,11 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -1090,15 +1124,44 @@ sycl::event reduction_over_group_temps_strided_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1113,7 +1176,10 @@ sycl::event reduction_over_group_temps_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1164,13 +1230,13 @@ sycl::event reduction_over_group_temps_strided_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1227,7 +1293,7 @@ sycl::event reduction_over_group_temps_strided_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1244,7 +1310,7 @@ sycl::event reduction_over_group_temps_strided_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ 
-1255,11 +1321,10 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1302,7 +1367,7 @@ sycl::event reduction_over_group_temps_strided_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1319,7 +1384,7 @@ sycl::event reduction_over_group_temps_strided_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -1440,15 +1505,47 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1466,7 +1563,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1518,13 +1618,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi 
* wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1575,7 +1675,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1592,7 +1692,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1603,11 +1703,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1650,7 +1749,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1667,7 +1766,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -1784,15 +1883,51 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of 
resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1811,7 +1946,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1863,13 +2001,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1920,7 +2058,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1937,7 +2075,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1948,11 +2086,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1995,7 +2132,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -2012,7 +2149,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ 
-2110,11 +2247,10 @@ template struct TypePairSupportDataForCompReductionAtomic { - /* value if true a kernel for must be instantiated, false + /* value is true if a kernel for must be instantiated, false * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< // input int32 td_ns::TypePairDefinedEntry, // input uint32 @@ -2123,6 +2259,10 @@ struct TypePairSupportDataForCompReductionAtomic td_ns::TypePairDefinedEntry, // input uint64 td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, // fall-through td_ns::NotDefinedEntry>::is_defined; }; @@ -2131,19 +2271,17 @@ template struct TypePairSupportDataForCompReductionTemps { - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input bool td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, - // input uint8_t td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, - // input uint16_t td_ns::TypePairDefinedEntry, @@ -3263,6 +3401,129 @@ struct LogSumExpOverAxis0TempsContigFactory // Argmax and Argmin +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + 
std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + /* = Search reduction using reduce_over_group*/ template ) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { if (val < local_red_val || std::isnan(val)) { local_red_val = val; if constexpr (!First) { @@ -3576,7 +3839,9 @@ struct CustomSearchReduction } } } - else if constexpr (std::is_floating_point_v) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { if (val > local_red_val || std::isnan(val)) { local_red_val = val; if constexpr (!First) { @@ -3619,7 +3884,9 @@ struct CustomSearchReduction ? local_idx : idx_identity_; } - else if constexpr (std::is_floating_point_v) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { // equality does not hold for NaNs, so check here local_idx = (red_val_over_wg == local_red_val || std::isnan(local_red_val)) @@ -3661,6 +3928,14 @@ typedef sycl::event (*search_strided_impl_fn_ptr)( py::ssize_t, const std::vector &); +template +class search_seq_strided_krn; + template class custom_search_over_group_temps_strided_krn; +template +class search_seq_contig_krn; + template (); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 4; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -3904,7 +4218,10 @@ sycl::event 
search_over_group_temps_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -3956,13 +4273,13 @@ sycl::event search_over_group_temps_strided_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4031,7 +4348,7 @@ sycl::event search_over_group_temps_strided_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4050,7 +4367,7 @@ sycl::event search_over_group_temps_strided_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4065,11 +4382,10 @@ sycl::event search_over_group_temps_strided_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4114,7 +4430,7 @@ sycl::event search_over_group_temps_strided_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4135,7 +4451,7 @@ sycl::event search_over_group_temps_strided_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -4278,15 +4594,49 @@ sycl::event search_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = 
NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -4304,7 +4654,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -4356,13 +4709,13 @@ sycl::event search_axis1_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4425,7 +4778,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4444,7 +4797,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4459,11 +4812,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4508,7 +4860,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( ReductionOpT(), identity_val, 
IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4529,7 +4881,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -4657,15 +5009,53 @@ sycl::event search_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -4684,7 +5074,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -4736,13 +5129,13 @@ sycl::event search_axis0_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4806,7 +5199,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), 
idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4825,7 +5218,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4840,11 +5233,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4889,7 +5281,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4910,7 +5302,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp similarity index 97% rename from dpctl/tensor/libtensor/source/tensor_py.cpp rename to dpctl/tensor/libtensor/source/tensor_ctors.cpp index d07d5cf084..4720f6baa1 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -1,4 +1,5 @@ -//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===// +//===-- tensor_ctors.cpp - ---*-C++-*-/===// +// Implementation of _tensor_impl module // // Data Parallel Control (dpctl) // @@ -36,19 +37,16 @@ #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -#include "boolean_reductions.hpp" #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -#include "elementwise_functions/elementwise_common.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" -#include "reductions/reduction_common.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -454,8 +452,4 @@ PYBIND11_MODULE(_tensor_impl, m) "Returns a tuple of events: (hev, ev)", py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - - dpctl::tensor::py_internal::init_elementwise_functions(m); - dpctl::tensor::py_internal::init_boolean_reduction_functions(m); - dpctl::tensor::py_internal::init_reduction_functions(m); } diff --git a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp new file mode 100644 index 0000000000..1a86526893 --- 
/dev/null +++ b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp @@ -0,0 +1,34 @@ +//===-- tensor_elementwise.cpp ---*-C++-*-/===// +// Implementation of _tensor_elementwise_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl extensions +//===----------------------------------------------------------------------===// + +#include "elementwise_functions/elementwise_common.hpp" +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_elementwise_impl, m) +{ + dpctl::tensor::py_internal::init_elementwise_functions(m); +} diff --git a/dpctl/tensor/libtensor/source/tensor_reductions.cpp b/dpctl/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 0000000000..138c31f3eb --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,37 @@ +//===-- tensor_reductions.cpp - --*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "boolean_reductions.hpp" +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_reduction_functions(m); +} diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py new file mode 100644 index 0000000000..b7f1d26d6e --- /dev/null +++ b/dpctl/tests/elementwise/test_elementwise_classes.py @@ -0,0 +1,80 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dpctl.tensor as dpt + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r diff --git a/examples/python/sycl_timer.py b/examples/python/sycl_timer.py index f4b1416784..8ae49fd60d 100644 --- a/examples/python/sycl_timer.py +++ b/examples/python/sycl_timer.py @@ -15,14 +15,27 @@ # limitations under the License. -import dpnp import numpy as np import dpctl import dpctl.tensor as dpt from dpctl import SyclTimer -n = 4000 + +def matmul(m1, m2): + """Naive matrix multiplication implementation""" + assert m1.ndim == 2 + assert m2.ndim == 2 + assert m1.shape[1] == m2.shape[0] + m1 = m1[:, dpt.newaxis, :] + m2 = dpt.permute_dims(m2, (1, 0))[dpt.newaxis, :, :] + # form m_prod[i, j, k] = m1[i,k] * m2[k, j] + m_prods = m1 * m2 + # sum over k + return dpt.sum(m_prods, axis=-1) + + +n = 500 try: q = dpctl.SyclQueue(property="enable_profiling") @@ -33,32 +46,36 @@ ) exit(0) -a = dpt.reshape(dpt.arange(n * n, dtype=np.float32, sycl_queue=q), (n, n)) -b = dpt.reshape( - dpt.asarray(np.random.random(n * n), dtype=np.float32, sycl_queue=q), (n, n) -) +a_flat = dpt.arange(n * n, dtype=dpt.float32, sycl_queue=q) +a = dpt.reshape(a_flat, (n, n)) -timer = SyclTimer(time_scale=1) +b_rand = np.random.random(n * n).astype(np.float32) +b_flat = dpt.asarray(b_rand, dtype=dpt.float32, sycl_queue=q) +b = dpt.reshape(b_flat, (n, n)) wall_times = [] device_times = [] + print( - f"Performing matrix multiplication of two {n} by {n} matrices " + f"Computing naive matrix multiplication of two {n} by {n} matrices " f"on {q.sycl_device.name}, repeating 5 times." 
) +print() for _ in range(5): + timer = SyclTimer(time_scale=1) with timer(q): - a_matmul_b = dpnp.matmul(a, b) + a_matmul_b = matmul(a, b) host_time, device_time = timer.dt wall_times.append(host_time) device_times.append(device_time) -c = dpnp.asnumpy(a_matmul_b) -cc = np.dot(dpnp.asnumpy(a), dpnp.asnumpy(b)) +c = dpt.asnumpy(a_matmul_b) +cc = np.dot(dpt.asnumpy(a), dpt.asnumpy(b)) print("Wall time: ", wall_times, "\nDevice time: ", device_times) +print() print( "Accuracy test: passed." if np.allclose(c, cc) - else (f"Accuracy test: failed. Discrepancy {np.max(np.abs(c-cc))}") + else (f"Accuracy test: FAILED. \n Discrepancy = {np.max(np.abs(c-cc))}") )
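A minimal smoke-test sketch (not part of the diff) of the extension split introduced above. It assumes a build of this branch and relies only on symbols visible in the changes to `_elementwise_funcs.py`, `_reduction.py` and `_utility_functions.py`, plus the new introspection getters on `UnaryElementwiseFunc`/`BinaryElementwiseFunc`:

```python
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.tensor._tensor_reductions_impl as tri

# Elementwise functions now dispatch into the _tensor_elementwise_impl module
assert dpt.negative.get_implementation_function() is tei._negative
assert callable(dpt.divide.get_type_promotion_path_acceptance_function())
print(dpt.divide.types)  # NumPy-style signatures, e.g. ['ee->e', 'ff->f', ...]

# Reductions and boolean reductions now dispatch into _tensor_reductions_impl
assert callable(tri._sum_over_axis)
assert callable(tri._all)
```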
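The docstrings added in `_elementwise_common.py` spell out the `impl_fn(src, dst, sycl_queue, depends)` contract. The sketch below, likewise hypothetical, calls one implementation function directly and waits on the two returned events (host-task event first, computation event second); in normal use `dpctl.tensor.negative` performs this plumbing for you:

```python
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei

q = dpctl.SyclQueue()
x = dpt.arange(16, dtype="f4", sycl_queue=q)
y = dpt.empty_like(x)  # same dtype and queue as x

# First event: host tasks keeping Python arguments alive; second: the computation
ht_ev, comp_ev = tei._negative(src=x, dst=y, sycl_queue=q)
comp_ev.wait()
ht_ev.wait()
```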
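The `reductions.hpp` changes add a `SequentialReduction` fallback for reductions shorter than the work-group size. As a rough, hypothetical summary in pseudocode (names mirror the C++ variables), kernel selection in the `*_over_group_temps_*` paths now proceeds as follows:

```python
def choose_reduction_kernel(
    reduction_nelems, iter_nelems, wg, max_wg, preferred_reductions_per_wi=8
):
    # New in this diff: tiny reductions use one work-item per output element
    # running a serial loop (SequentialReduction), avoiding group-reduction overhead.
    if reduction_nelems < wg:
        return "sequential"
    # One work-group per output element, writing directly to res; wg is only
    # raised to max_wg when there is a single output (iter_nelems == 1).
    if reduction_nelems <= preferred_reductions_per_wi * max_wg:
        return "single_work_group"
    # Otherwise: multi-pass tree reduction through temporary buffers.
    return "tree_reduction_with_temporaries"
```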