
[PROTOTYPE] Add fallback to legacy scan implementation for CPU devices and devices that lack size 32 sub-groups #1749

Merged

Changes from 25 commits (27 commits total):
1eaaf7d
Checkpoint for reduce then scan integration
mmichel11 Jul 8, 2024
e7a909c
Introduce a parallel_backend_sycl_reduce_then_scan.h file to contain …
mmichel11 Jul 9, 2024
b8c08c1
Port of kernels from two-pass scan KT branch
mmichel11 Jul 10, 2024
40b5f02
Move the single-element last element storage for exclusive_scan after…
mmichel11 Jul 11, 2024
4ae5144
Use init value type for init processing helper
adamfidel Jul 11, 2024
a3034de
Lower single work-group upper limit to 2048 elements (empirically found)
mmichel11 Jul 12, 2024
83ac2c8
[PROTOTYPE] Generalized two pass algorithm and copy_if (#1700)
danhoeflinger Jul 17, 2024
e16a5af
bug fix for global race on block carry-out
danhoeflinger Jul 18, 2024
d90a348
bugfix for elements to process in partial subgroup scan
danhoeflinger Jul 18, 2024
7f2dca1
[PROTOTYPE] Add unused temporary storage to single work-group scan to…
adamfidel Jul 18, 2024
61295cf
Add temporary work-group size cap for FPGA_EMU testing
mmichel11 Jul 19, 2024
0938b5c
[PROTOTYPE] Resolve conversion issues between internal tuple and std:…
mmichel11 Jul 19, 2024
6585460
Use __dpl_sycl::__local_accessor
adamfidel Jul 22, 2024
33d816d
bugfix for overruning input for small non multiples of subgroup size
danhoeflinger Jul 22, 2024
c155b58
Check if a subgroup is active before fetching its carry and grab the …
mmichel11 Jul 23, 2024
23f7886
Comment out std::complex tests in scan_by_segment tests
mmichel11 Jul 23, 2024
bd971db
renaming __out as it seems to be a keyword
danhoeflinger Jul 24, 2024
5337317
fixing device copyable for helpers
danhoeflinger Jul 29, 2024
e60aa75
Add legacy scan fallback case
mmichel11 Jul 31, 2024
f2037cc
Clang-format
mmichel11 Jul 31, 2024
ce98569
Merge branch 'dev/shared/reduce_then_scan_impl' into dev/mmichel11/le…
mmichel11 Jul 31, 2024
8a0a7a9
Remove unneeded #endif
mmichel11 Jul 31, 2024
e768892
Remove bad code after merge and pull single-wg invocation out of unne…
mmichel11 Jul 31, 2024
7b69c66
Cleanup
mmichel11 Jul 31, 2024
e11df04
Check __target_size instead of 32 in __supports_sub_group_size
mmichel11 Jul 31, 2024
ef96d5a
::std:: -> std::
mmichel11 Jul 31, 2024
5942d73
Remove unnecessary templated _SizeType
mmichel11 Jul 31, 2024
51 changes: 40 additions & 11 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -894,13 +894,17 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
}
}
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
}
}
else
{
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
@@ -911,8 +915,13 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_NoAssign __no_assign_op;
_NoOpFunctor __get_data_op;

// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of reduce-then-scan
using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Type>;
_TempStorage __dummy_result_and_scratch{__exec, 0};

return
__parallel_transform_scan_base(
__future(__parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __binary_op, __init,
// local scan
@@ -924,7 +933,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{
__binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op},
// global scan
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init});
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(),
__dummy_result_and_scratch);
}
}
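
Note on the new dispatch check: the reduce-then-scan path is taken only when the device is not a CPU and advertises a sub-group size of 32. The same query can be written standalone in plain SYCL; the sketch below is illustrative only (the name prefer_reduce_then_scan is not part of this patch):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

// Illustrative helper mirroring the patch's dispatch heuristic:
// reduce-then-scan is used only on non-CPU devices that support
// sub-groups of size 32.
bool prefer_reduce_then_scan(const sycl::queue& q)
{
    const sycl::device dev = q.get_device();
    const auto sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
    const bool has_sg32 =
        std::find(sg_sizes.begin(), sg_sizes.end(), std::size_t{32}) != sg_sizes.end();
    return !dev.is_cpu() && has_sg32;
}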

@@ -1028,7 +1038,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,

constexpr ::std::uint16_t __single_group_upper_limit = 2048;

::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);

if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
__max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
@@ -1039,9 +1050,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng),
::std::forward<_OutRng>(__out_rng), __pred);
}
else
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
using _ReduceOp = ::std::plus<_Size>;
using _ReduceOp = std::plus<_Size>;

return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
@@ -1052,6 +1064,23 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
/*_Inclusive=*/std::true_type{});
}
else
{
using _ReduceOp = ::std::plus<_Size>;
Review comment (Contributor): ::std -> std

Reply (Contributor Author): Done

using CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign,
/*inclusive*/ std::true_type, 1>;
// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of reduce-then-scan
using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Size>;
_TempStorage __dummy_result_and_scratch{__exec, 0};

return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
CreateOp{__pred}, CopyOp{})
.event(),
__dummy_result_and_scratch);
}
}
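
Both fallback branches construct a __result_and_scratch_storage they never use, purely so the legacy path returns the same __future type as reduce-then-scan. A compilable miniature of that pattern, with simplified stand-in types (event, result_and_scratch_storage, and future here are illustrative models, not the oneDPL classes):

#include <cstddef>
#include <utility>
#include <vector>

struct event { };

template <typename T>
struct result_and_scratch_storage
{
    std::vector<T> data;
    explicit result_and_scratch_storage(std::size_t scratch_elems) : data(scratch_elems) {}
};

// Both code paths must return this (event, storage) pair.
template <typename T>
struct future
{
    event ev;
    result_and_scratch_storage<T> storage;
};

template <typename T>
future<T> legacy_branch(event e)
{
    // Zero-sized placeholder: never read, exists only to unify return types.
    result_and_scratch_storage<T> dummy{0};
    return future<T>{e, std::move(dummy)};
}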

//------------------------------------------------------------------------
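For orientation: the legacy __parallel_scan_copy branch above implements copy_if as the classic mask/scan/scatter stream compaction, driven by __create_mask and __copy_by_mask. A serial sketch of that scheme (illustrative only; the actual implementation runs each phase as a SYCL kernel):

#include <cstddef>
#include <vector>

// Scan-based copy_if: build a 0/1 mask, inclusive-scan it to get
// output positions, then scatter the selected elements.
template <typename T, typename Pred>
std::size_t scan_copy_if(const std::vector<T>& in, std::vector<T>& out, Pred pred)
{
    const std::size_t n = in.size();
    std::vector<std::size_t> mask(n), pos(n);
    for (std::size_t i = 0; i < n; ++i)
        mask[i] = pred(in[i]) ? 1 : 0;
    // Inclusive scan of the mask; pos[i] - 1 is the output slot of element i.
    std::size_t sum = 0;
    for (std::size_t i = 0; i < n; ++i)
        pos[i] = (sum += mask[i]);
    out.resize(sum);
    for (std::size_t i = 0; i < n; ++i)
        if (mask[i])
            out[pos[i] - 1] = in[i];
    return sum; // number of selected elements
}
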
10 changes: 10 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -20,6 +20,7 @@
#include <memory>
#include <type_traits>
#include <tuple>
#include <algorithm>

#include "../../iterator_impl.h"

@@ -769,6 +770,15 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
}
};

template <typename _ExecutionPolicy, typename _SizeType>
bool
__supports_sub_group_size(const _ExecutionPolicy& __exec, _SizeType __target_size)
Review comment (Contributor): Since the only usage of __target_size in this function casts it to std::size_t, would it make sense to take the parameter by std::size_t directly instead of using a template?

Suggested change:
-    __supports_sub_group_size(const _ExecutionPolicy& __exec, _SizeType __target_size)
+    __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size)

Reply (Contributor Author): Good point. I've added this and removed the static_cast below.


{
const auto __subgroup_sizes = __exec.queue().get_device().template get_info<sycl::info::device::sub_group_sizes>();
return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast<std::size_t>(__target_size)) !=
__subgroup_sizes.end();
}

} // namespace __par_backend_hetero
} // namespace dpl
} // namespace oneapi
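
For reference, the new helper's call sites in parallel_backend_sycl.h, as added by this PR:

// In both __parallel_transform_scan and __parallel_copy_if:
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
    // take the reduce-then-scan path; otherwise fall back to the legacy scan
}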