Merge pull request #1463 from IntelPython/optimize-small-size-tree-reduction

Improve reduction performance when the number of elements to reduce is small, for data types that require tree reduction
oleksandr-pavlyk authored Nov 4, 2023
2 parents 11ecba8 + d4d4992 commit 9018745
Showing 12 changed files with 921 additions and 188 deletions.
68 changes: 47 additions & 21 deletions dpctl/tensor/CMakeLists.txt
@@ -113,10 +113,13 @@ set(_reduction_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp
)
set(_boolean_reduction_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp
)
set(_tensor_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
@@ -128,19 +131,39 @@ set(_tensor_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
)
list(APPEND _tensor_impl_sources
${_elementwise_sources}
${_reduction_sources}
set(_tensor_elementwise_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
${_elementwise_sources}
)
set(_tensor_reductions_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
${_boolean_reduction_sources}
${_reduction_sources}
)

set(_py_trgts)

set(python_module_name _tensor_impl)
pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
list(APPEND _py_trgts ${python_module_name})

set(python_module_name _tensor_elementwise_impl)
pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources})
list(APPEND _py_trgts ${python_module_name})

set(python_module_name _tensor_reductions_impl)
pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources})
list(APPEND _py_trgts ${python_module_name})

set(_clang_prefix "")
if (WIN32)
set(_clang_prefix "/clang:")
@@ -170,19 +193,22 @@ if (UNIX)
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
endif()
target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
if(UNIX)
# this option is supported on Linux only
target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
endif()
target_include_directories(${python_module_name}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
)

set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
target_link_options(${python_module_name} PRIVATE ${_linker_options})
add_dependencies(${python_module_name} _dpctl4pybind11_deps)
install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
foreach(python_module_name ${_py_trgts})
target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
if(UNIX)
# this option is supported on Linux only
target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
endif()
target_include_directories(${python_module_name}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
)
target_link_options(${python_module_name} PRIVATE ${_linker_options})
add_dependencies(${python_module_name} _dpctl4pybind11_deps)
install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
endforeach()
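The net effect of this CMake change is that the former single `_tensor_impl` extension is split into three pybind11 modules, each installed into `dpctl/tensor` and configured by the `foreach` loop above. A minimal sketch of how the Python layer picks up the split modules, using the aliases introduced elsewhere in this diff (`ti`, `tei`, `tri`); only the module names come from the diff, the comments are illustrative:

# Sketch only: module names are taken from this diff, everything else is illustrative.
import dpctl.tensor._tensor_impl as ti                # constructors, copies, clip kernels, ...
import dpctl.tensor._tensor_elementwise_impl as tei   # elementwise kernels, e.g. tei._maximum
import dpctl.tensor._tensor_reductions_impl as tri    # reduction kernels, e.g. tri._sum_over_axis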
5 changes: 3 additions & 2 deletions dpctl/tensor/_clip.py
@@ -16,6 +16,7 @@

import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.tensor._tensor_impl as ti
from dpctl.tensor._copy_utils import (
_empty_like_orderK,
@@ -429,9 +430,9 @@ def clip(x, min=None, max=None, out=None, order="K"):
"only one of `min` and `max` is permitted to be `None`"
)
elif max is None:
return _clip_none(x, min, out, order, ti._maximum)
return _clip_none(x, min, out, order, tei._maximum)
elif min is None:
return _clip_none(x, max, out, order, ti._minimum)
return _clip_none(x, max, out, order, tei._minimum)
else:
q1, x_usm_type = x.sycl_queue, x.usm_type
q2, min_usm_type = _get_queue_usm_type(min)
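For context, a short usage sketch of the `clip` paths touched above (array values are illustrative); passing only one bound takes the `_clip_none` branch that now dispatches to `tei._maximum` or `tei._minimum`:

import dpctl.tensor as dpt

x = dpt.asarray([-2.0, 0.5, 3.0])
both = dpt.clip(x, min=0.0, max=1.0)  # clamp to [0, 1]
lo_only = dpt.clip(x, min=0.0)        # max is None -> _clip_none(..., tei._maximum)
hi_only = dpt.clip(x, max=1.0)        # min is None -> _clip_none(..., tei._minimum)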
150 changes: 150 additions & 0 deletions dpctl/tensor/_elementwise_common.py
@@ -39,6 +39,31 @@
class UnaryElementwiseFunc:
"""
Class that implements unary element-wise functions.
Args:
name (str):
Name of the unary function
result_type_resolver_fn (callable):
Function that takes the dtype of the input and
returns the dtype of the result if the
implementation function supports it, or
returns `None` otherwise.
unary_dp_impl_fn (callable):
Data-parallel implementation function with signature
`impl_fn(src: usm_ndarray, dst: usm_ndarray,
sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
where the `src` is the argument array, `dst` is the
array to be populated with function values, effectively
evaluating `dst = func(src)`.
The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
The first event corresponds to data-management host tasks,
including lifetime management of argument Python objects to ensure
that their associated USM allocation is not freed before offloaded
computational tasks complete execution, while the second event
corresponds to computational tasks associated with function
evaluation.
docs (str):
Documentation string for the unary function.
"""

def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs):
@@ -55,8 +80,31 @@ def __str__(self):
def __repr__(self):
return f"<{self.__name__} '{self.name_}'>"

def get_implementation_function(self):
"""Returns the implementation function for
this elementwise unary function.
"""
return self.unary_fn_

def get_type_result_resolver_function(self):
"""Returns the type resolver function for this
elementwise unary function.
"""
return self.result_type_resolver_fn_

@property
def types(self):
"""Returns information about types supported by
implementation function, using NumPy's character
encoding for data types, e.g.
:Example:
.. code-block:: python
dpctl.tensor.sin.types
# Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
"""
types = self.types_
if not types:
types = []
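To make the accessor and the documented signature concrete, here is a minimal sketch (not part of the diff) of driving a unary implementation function directly; it assumes the positional argument order `(src, dst, sycl_queue, depends)` given in the docstring and that both returned events may simply be waited on:

import dpctl.tensor as dpt

x = dpt.ones(1024, dtype="f4")
r = dpt.empty_like(x)
impl_fn = dpt.sin.get_implementation_function()
ht_ev, comp_ev = impl_fn(x, r, x.sycl_queue, [])  # (src, dst, sycl_queue, depends)
comp_ev.wait()  # computation has finished
ht_ev.wait()    # host tasks keeping x and r alive have completed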
@@ -363,6 +411,56 @@ def _get_shape(o):
class BinaryElementwiseFunc:
"""
Class that implements binary element-wise functions.
Args:
name (str):
Name of the binary function
result_type_resolver_fn (callable):
Function that takes the dtypes of the inputs and
returns the dtype of the result if the
implementation function supports it, or
returns `None` otherwise.
binary_dp_impl_fn (callable):
Data-parallel implementation function with signature
`impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
where the `src1` and `src2` are the argument arrays, `dst` is the
array to be populated with function values,
i.e. `dst=func(src1, src2)`.
The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
The first event corresponds to data-management host tasks,
including lifetime management of argument Python objects to ensure
that their associated USM allocation is not freed before offloaded
computational tasks complete execution, while the second event
corresponds to computational tasks associated with function
evaluation.
docs (str):
Documentation string for the binary function.
binary_inplace_fn (callable, optional):
Data-parallel implementation function with signature
`impl_fn(src: usm_ndarray, dst: usm_ndarray,
sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
where the `src` is the argument array, `dst` is the
array to be populated with function values,
i.e. `dst=func(dst, src)`.
The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
The first event corresponds to data-management host tasks,
including async lifetime management of Python arguments,
while the second event corresponds to computational tasks
associated with function evaluation.
acceptance_fn (callable, optional):
Function to influence type promotion behavior of this binary
function. The function takes 6 arguments:
arg1_dtype - Data type of the first argument
arg2_dtype - Data type of the second argument
ret_buf1_dtype - Data type the first argument would be cast to
ret_buf2_dtype - Data type the second argument would be cast to
res_dtype - Data type of the output array with function values
sycl_dev - The :class:`dpctl.SyclDevice` where the function
evaluation is carried out.
The function is only called when both arguments of the binary
function require casting, e.g. both arguments of
`dpctl.tensor.logaddexp` are arrays with integral data type.
"""

def __init__(
Expand Down Expand Up @@ -392,8 +490,60 @@ def __str__(self):
def __repr__(self):
return f"<{self.__name__} '{self.name_}'>"

def get_implementation_function(self):
"""Returns the out-of-place implementation
function for this elementwise binary function.
"""
return self.binary_fn_

def get_implementation_inplace_function(self):
"""Returns the in-place implementation
function for this elementwise binary function.
"""
return self.binary_inplace_fn_

def get_type_result_resolver_function(self):
"""Returns the type resolver function for this
elementwise binary function.
"""
return self.result_type_resolver_fn_

def get_type_promotion_path_acceptance_function(self):
"""Returns the acceptance function for this
elementwise binary function.
Acceptance function influences the type promotion
behavior of this binary function.
The function takes 6 arguments:
arg1_dtype - Data type of the first argument
arg2_dtype - Data type of the second argument
ret_buf1_dtype - Data type the first argument would be cast to
ret_buf2_dtype - Data type the second argument would be cast to
res_dtype - Data type of the output array with function values
sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation
is carried out.
The acceptance function is only invoked if both input arrays must be
cast to intermediary data types, as would happen during call of
`dpctl.tensor.hypot` with both arrays being of integral data type.
"""
return self.acceptance_fn_
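As an illustration of the 6-argument shape described above, a hypothetical acceptance function might look as follows; the boolean return convention (return `True` to accept the promotion path) is an assumption modeled on `_acceptance_fn_divide` imported elsewhere in this change set, not something stated in the docstring:

# Hypothetical acceptance function; the bool return convention is assumed.
def _acceptance_fn_example(
    arg1_dtype, arg2_dtype, ret_buf1_dtype, ret_buf2_dtype, res_dtype, sycl_dev
):
    # Accept the promotion path only when the result is floating-point or complex,
    # i.e. veto purely integral intermediate casts for this hypothetical function.
    return res_dtype.kind in "fc"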

@property
def types(self):
"""Returns information about types supported by
implementation function, using NumPy's character
encoding for data types, e.g.
:Example:
.. code-block:: python
dpctl.tensor.divide.types
# Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D',
# 'Ff->F', 'FF->F', 'Dd->D', 'DD->D']
"""
types = self.types_
if not types:
types = []
2 changes: 1 addition & 1 deletion dpctl/tensor/_elementwise_funcs.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import dpctl.tensor._tensor_impl as ti
import dpctl.tensor._tensor_elementwise_impl as ti

from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc
from ._type_utils import _acceptance_fn_divide
25 changes: 13 additions & 12 deletions dpctl/tensor/_reduction.py
@@ -19,6 +19,7 @@
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.tensor._tensor_reductions_impl as tri

from ._type_utils import _to_device_supported_dtype

@@ -220,8 +221,8 @@ def sum(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._sum_over_axis,
ti._sum_over_axis_dtype_supported,
tri._sum_over_axis,
tri._sum_over_axis_dtype_supported,
_default_reduction_dtype,
_identity=0,
)
@@ -281,8 +282,8 @@ def prod(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._prod_over_axis,
ti._prod_over_axis_dtype_supported,
tri._prod_over_axis,
tri._prod_over_axis_dtype_supported,
_default_reduction_dtype,
_identity=1,
)
@@ -335,8 +336,8 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._logsumexp_over_axis,
lambda inp_dt, res_dt, *_: ti._logsumexp_over_axis_dtype_supported(
tri._logsumexp_over_axis,
lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported(
inp_dt, res_dt
),
_default_reduction_dtype_fp_types,
@@ -391,8 +392,8 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False):
axis,
dtype,
keepdims,
ti._hypot_over_axis,
lambda inp_dt, res_dt, *_: ti._hypot_over_axis_dtype_supported(
tri._hypot_over_axis,
lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported(
inp_dt, res_dt
),
_default_reduction_dtype_fp_types,
@@ -468,7 +469,7 @@ def max(x, axis=None, keepdims=False):
entire array, a zero-dimensional array is returned. The returned
array has the same data type as `x`.
"""
return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis)
return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis)


def min(x, axis=None, keepdims=False):
@@ -496,7 +497,7 @@ def min(x, axis=None, keepdims=False):
entire array, a zero-dimensional array is returned. The returned
array has the same data type as `x`.
"""
return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis)
return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis)


def _search_over_axis(x, axis, keepdims, _reduction_fn):
@@ -577,7 +578,7 @@ def argmax(x, axis=None, keepdims=False):
zero-dimensional array is returned. The returned array has the
default array index data type for the device of `x`.
"""
return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis)
return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis)


def argmin(x, axis=None, keepdims=False):
@@ -609,4 +610,4 @@ def argmin(x, axis=None, keepdims=False):
zero-dimensional array is returned. The returned array has the
default array index data type for the device of `x`.
"""
return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis)
return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis)
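For reference, a short usage sketch of the public reduction entry points rerouted above to the new `_tensor_reductions_impl` module; these are the calls whose small-input behavior this PR's kernels optimize (array contents are illustrative):

import dpctl.tensor as dpt

x = dpt.reshape(dpt.arange(12, dtype="f4"), (3, 4))
s = dpt.sum(x, axis=1)                  # tri._sum_over_axis
p = dpt.prod(x, axis=0, keepdims=True)  # tri._prod_over_axis
lse = dpt.logsumexp(x)                  # tri._logsumexp_over_axis
h = dpt.reduce_hypot(x, axis=1)         # tri._hypot_over_axis
i = dpt.argmax(x, axis=1)               # tri._argmax_over_axis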