Skip to content

Commit

Permalink
[PROTOTYPE] Add fallback to legacy scan implementation for CPU devices and devices that lack size 32 sub-groups (#1749)
Browse files Browse the repository at this point in the history


Signed-off-by: Matthew Michel <matthew.michel@intel.com>
Signed-off-by: Dan Hoeflinger <dan.hoeflinger@intel.com>
Co-authored-by: Adam Fidel <adam.fidel@intel.com>
Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com>
Co-authored-by: Adam Fidel <110841220+adamfidel@users.noreply.github.com>
Co-authored-by: Dan Hoeflinger <dan.hoeflinger@intel.com>
  • Loading branch information
5 people committed Aug 8, 2024
1 parent 38cffa7 commit a177af0
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 11 deletions.
51 changes: 40 additions & 11 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -882,13 +882,17 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
}
}
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
}
}
else
{
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
Expand All @@ -899,8 +903,13 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
_NoAssign __no_assign_op;
_NoOpFunctor __get_data_op;

// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of reduce-then-scan
using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Type>;
_TempStorage __dummy_result_and_scratch{__exec, 0};

return
__parallel_transform_scan_base(
__future(__parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __binary_op, __init,
// local scan
Expand All @@ -912,7 +921,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
_NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{
__binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op},
// global scan
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init});
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(),
__dummy_result_and_scratch);
}
}

Expand Down Expand Up @@ -1021,7 +1031,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,

constexpr ::std::uint16_t __single_group_upper_limit = 2048;

::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);

if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
__max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
Expand All @@ -1032,9 +1043,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng),
::std::forward<_OutRng>(__out_rng), __pred);
}
else
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
using _ReduceOp = ::std::plus<_Size>;
using _ReduceOp = std::plus<_Size>;

return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
Expand All @@ -1045,6 +1057,23 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
/*_Inclusive=*/std::true_type{});
}
else
{
using _ReduceOp = std::plus<_Size>;
using CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign,
/*inclusive*/ std::true_type, 1>;
// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of reduce-then-scan
using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Size>;
_TempStorage __dummy_result_and_scratch{__exec, 0};

return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
CreateOp{__pred}, CopyOp{})
.event(),
__dummy_result_and_scratch);
}
}

//------------------------------------------------------------------------
Expand Down
10 changes: 10 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <memory>
#include <type_traits>
#include <tuple>
#include <algorithm>

#include "../../iterator_impl.h"

Expand Down Expand Up @@ -772,6 +773,15 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
}
};

// Returns true if the device backing __exec's queue reports __target_size among
// its supported sub-group sizes (sycl::info::device::sub_group_sizes).
template <typename _ExecutionPolicy>
bool
__supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size)
{
    const auto __sg_sizes = __exec.queue().get_device().template get_info<sycl::info::device::sub_group_sizes>();
    return std::any_of(__sg_sizes.begin(), __sg_sizes.end(),
                       [__target_size](std::size_t __size) { return __size == __target_size; });
}

} // namespace __par_backend_hetero
} // namespace dpl
} // namespace oneapi
Expand Down

0 comments on commit a177af0

Please sign in to comment.