From 0f1e01dd4d2cad7558a38e61f20cba0d9627cb1f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:03:29 -0400 Subject: [PATCH] [PROTOTYPE] partition, unique families and ranges API (#1708) Enabling partition and unique family of scan-like algorithms includes ranges API Making legacy scan user `__result_and_scratch_storage` to match future type for return to compile Refactoring of `__pattern` / `__parallel` structure for scan-like algorithms for consistency --------- Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 80 +++-- .../hetero/algorithm_ranges_impl_hetero.h | 92 ++---- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 287 ++++++++++++------ .../parallel_backend_sycl_reduce_then_scan.h | 88 ++++-- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 54 +++- .../pstl/hetero/numeric_ranges_impl_hetero.h | 34 +-- .../device_copyable.pass.cpp | 67 +++- test/support/utils_device_copyable.h | 61 ++++ 8 files changed, 497 insertions(+), 266 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 3bdc187ce7f..300681a76da 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -885,33 +885,6 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat // copy_if //------------------------------------------------------------------------ -template -::std::pair<_IteratorOrTuple, typename ::std::iterator_traits<_Iterator1>::difference_type> -__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 __last, - _IteratorOrTuple __output_first, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) -{ - using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - - if (__first == __last) - return ::std::make_pair(__output_first, _It1DifferenceType{0}); - - _It1DifferenceType __n = __last - __first; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); - auto __buf1 = __keep1(__first, __last); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _IteratorOrTuple>(); - auto __buf2 = __keep2(__output_first, __output_first + __n); - - auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __create_mask_op, - __copy_by_mask_op); - - ::std::size_t __num_copied = __res.get(); - return ::std::make_pair(__output_first + __n, __num_copied); -} - template _Iterator2 @@ -951,19 +924,24 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e return ::std::make_pair(__result1, __result2); using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - using _ReduceOp = ::std::plus<_It1DifferenceType>; - unseq_backend::__create_mask<_UnaryPredicate, _It1DifferenceType> __create_mask_op{__pred}; - unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __copy_by_mask_op{_ReduceOp{}}; + _It1DifferenceType __n = __last - __first; - auto __result = __pattern_scan_copy( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __par_backend_hetero::zip( - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), - 
__par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)), - __create_mask_op, __copy_by_mask_op); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + + auto __zipped_res = __par_backend_hetero::zip( + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)); - return ::std::make_pair(__result1 + __result.second, __result2 + (__last - __first - __result.second)); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); + auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); + + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + + return std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); } //------------------------------------------------------------------------ @@ -977,16 +955,28 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _Iterator2 __result_first, _BinaryPredicate __pred) { using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ ::std::true_type, 1> - __copy_by_mask_op; - __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ - __not_pred<_BinaryPredicate>{__pred}}; - auto __result = __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __result_first, __create_mask_op, __copy_by_mask_op); + _It1DifferenceType __n = __last - __first; + + if (__n == 0) + return __result_first; + if (__n == 1) + { + oneapi::dpl::__internal::__pattern_walk2_brick( + __hetero_tag<_BackendTag>{}, std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, + oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}); + return __result_first + 1; + } + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); + auto __buf2 = __keep2(__result_first, __result_first + __n); + + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); - return __result_first + __result.second; + return __result_first + __result.get(); } template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index dd4dd25c7e3..5c1c09c02d5 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -334,67 +334,21 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ // copy_if //------------------------------------------------------------------------ -template -oneapi::dpl::__internal::__difference_t<_Range1> -__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& 
__rng2, - _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) -{ - if (__rng1.size() == 0) - return __rng1.size(); - - using _SizeType = decltype(__rng1.size()); - using _ReduceOp = ::std::plus<_SizeType>; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _MaskAssigner = unseq_backend::__mask_assigner<1>; - using _InitType = unseq_backend::__no_init_value<_SizeType>; - using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _ReduceOp __reduce_op; - _DataAcc __get_data_op; - _MaskAssigner __add_mask_op; - - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __rng1.size()); - - auto __res = - __par_backend_hetero::__parallel_transform_scan_base( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::zip_view( - __rng1, oneapi::dpl::__ranges::all_view( - __mask_buf.get_buffer())), - __rng2, __reduce_op, _InitType{}, - // local scan - unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, - __add_mask_op, __create_mask_op}, - // scan between groups - unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, - __get_data_op}, - // global scan - __copy_by_mask_op) - .get(); - - return __res; -} - template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Predicate __pred, _Assign) + _Predicate __pred, _Assign&& __assign) { - using _SizeType = decltype(__rng1.size()); - using _ReduceOp = ::std::plus<_SizeType>; + auto __n = __rng1.size(); + if (__n == 0) + return 0; - unseq_backend::__create_mask<_Predicate, _SizeType> __create_mask_op{__pred}; - unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ ::std::true_type, 1> __copy_by_mask_op; + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __pred, std::forward<_Assign>(__assign)); - return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - __create_mask_op, __copy_by_mask_op); + return __res.get(); //is a blocking call } //------------------------------------------------------------------------ @@ -433,17 +387,27 @@ template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, - _BinaryPredicate __pred, _Assign) + _BinaryPredicate __pred, _Assign&& __assign) { - using _It1DifferenceType = oneapi::dpl::__internal::__difference_t<_Range1>; - unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, _Assign, /*inclusive*/ ::std::true_type, 1> - __copy_by_mask_op; - __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ - __not_pred<_BinaryPredicate>{__pred}}; - - return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng), ::std::forward<_Range2>(__result), - __create_mask_op, __copy_by_mask_op); + auto __n = __rng.size(); + if (__n == 0) + return 0; + if (__n == 1) + { + using CopyBrick = oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>; + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, 
std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), + std::forward<_Range2>(__result)) + .get(); + + return 1; + } + + return oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __pred, std::forward<_Assign>(__assign)) + .get(); } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index faa373d42ff..00fda4cc07f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -512,8 +512,8 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem } } - __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, - __lacc_ptr, __bin_op, __init); + __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, __lacc_ptr, + __bin_op, __init); if constexpr (__can_use_subgroup_load_store) { @@ -555,10 +555,10 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __internal::__optional_kernel_name<_ScanKernelName...>> { template + typename _UnaryOp, typename _Assign> auto operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, - _BinaryOperation __bin_op, _UnaryOp __unary_op) + _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) { using _ValueType = ::std::uint16_t; @@ -617,12 +617,13 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __scan_work_group<_ValueType, /* _Inclusive */ false>( __group, __lacc_ptr, __lacc_ptr + __elems_per_wg, __lacc_ptr + __elems_per_wg, __bin_op, - __init); + __init); for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize) { if (__lacc[__idx]) - __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); + __assign(static_cast<__tuple_type>(__in_rng[__idx]), + __out_rng[__lacc[__idx + __elems_per_wg]]); } const ::std::uint16_t __residual = __n % _WGSize; @@ -631,7 +632,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W { auto __idx = __residual_start + __item_id; if (__lacc[__idx]) - __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); + __assign(static_cast<__tuple_type>(__in_rng[__idx]), + __out_rng[__lacc[__idx + __elems_per_wg]]); } if (__item_id == 0) @@ -774,7 +776,8 @@ struct __gen_transform_input operator()(InRng&& __in_rng, std::size_t __idx) const { using _ValueType = oneapi::dpl::__internal::__value_t; - using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t::type>; + using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t< + typename std::invoke_result<_UnaryOp, _ValueType>::type>; return _OutValueType{__unary_op(__in_rng[__idx])}; } _UnaryOp __unary_op; @@ -796,19 +799,44 @@ struct __simple_write_to_idx }; template -struct __gen_count_pred +struct __gen_mask +{ + template + bool + operator()(_InRng&& __in_rng, std::size_t __idx) const + { + return __pred(__in_rng[__idx]); + } + _Predicate __pred; +}; + +template +struct __gen_unique_mask +{ + template + bool + operator()(_InRng&& __in_rng, std::size_t __idx) const + { + //starting index is offset to 
1 for "unique" patterns and 0th element copy is handled separately + return !__pred(__in_rng[__idx], __in_rng[__idx - 1]); + } + _BinaryPredicate __pred; +}; + +template +struct __gen_count_mask { template _SizeType operator()(_InRng&& __in_rng, _SizeType __idx) const { - return __pred(__in_rng[__idx]) ? _SizeType{1} : _SizeType{0}; + return __gen_mask(std::forward<_InRng>(__in_rng), __idx) ? _SizeType{1} : _SizeType{0}; } - _Predicate __pred; + _GenMask __gen_mask; }; -template -struct __gen_expand_count_pred +template +struct __gen_expand_count_mask { template auto @@ -820,10 +848,10 @@ struct __gen_expand_count_pred using _ElementType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t>; _ElementType ele = __in_rng[__idx]; - bool mask = __pred(ele); + bool mask = __gen_mask(__in_rng, __idx); return std::tuple(mask ? _SizeType{1} : _SizeType{0}, mask, ele); } - _Predicate __pred; + _GenMask __gen_mask; }; struct __get_zeroth_element @@ -835,7 +863,7 @@ struct __get_zeroth_element return std::get<0>(std::forward<_Tp>(__a)); } }; - +template struct __write_to_idx_if { template @@ -848,8 +876,27 @@ struct __write_to_idx_if typename oneapi::dpl::__internal::__get_tuple_type(__v))>, std::decay_t>::__type; if (std::get<1>(__v)) - __out_rng[std::get<0>(__v) - 1] = static_cast<_ConvertedTupleType>(std::get<2>(__v)); + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1 + __offset]); } + Assign __assign; +}; + +template +struct __write_to_idx_if_else +{ + template + void + operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const + { + using _ConvertedTupleType = + typename oneapi::dpl::__internal::__get_tuple_type(__v))>, + std::decay_t>::__type; + if (std::get<1>(__v)) + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<0>(__out[std::get<0>(__v) - 1])); + else + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<1>(__out[__idx - std::get<0>(__v)])); + } + Assign __assign; }; template (__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); - // Reduce-then-scan performs poorly on CPUs due to sub-group operations. 
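// A rough serial model (sketch only; the names are the functors introduced above, and the real work is done
// by the reduce-then-scan kernels) of how the new __gen_* / __write_to_idx_if* pieces compose for the
// copy_if / partition_copy / unique_copy family:
//   count_i = __gen_count_mask<_GenMask>{mask}(in, i);          // 1 if mask(in, i), else 0
//   pos     = inclusive scan of count_0..count_{n-1} with std::plus
//   tup_i   = __gen_expand_count_mask<_GenMask>{mask}(in, i);   // (count_i, mask_i, in[i])
//   __write_to_idx_if<offset, Assign>:  if mask_i, assign(in[i], out[pos_i - 1 + offset]);
//   __write_to_idx_if_else<Assign>:     if mask_i, assign(in[i], std::get<0>(out[pos_i - 1]));
//                                       else       assign(in[i], std::get<1>(out[i - pos_i]));
// copy_if uses offset 0; unique_copy uses offset 1 because the 0th element is written unconditionally.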
- if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { - oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; + using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; + using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_idx; + + _GenInput __gen_transform{__unary_op}; + return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}); + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, + /*_IsUniquePattern=*/std::false_type{}); } } { @@ -903,26 +954,19 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; - // Although we do not actually need result storage in this case, we need to construct - // a placeholder here to match the return type of reduce-then-scan - using _TempStorage = __result_and_scratch_storage, _Type>; - _TempStorage __dummy_result_and_scratch{__exec, 0}; - - return - __future(__parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, - _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, - __assign_op, __assign_op, __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(), - __dummy_result_and_scratch); + return __parallel_transform_scan_base( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, + _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, + __assign_op, __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } } @@ -932,9 +976,11 @@ struct __invoke_single_group_copy_if // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred> + template auto - operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred) + operator()(_ExecutionPolicy&& __exec, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred, + _Assign&& __assign) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = 
::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); @@ -946,26 +992,43 @@ struct __invoke_single_group_copy_if if (__is_full_group) return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::true_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __scan_copy_single_wg_kernel, + std::integral_constant, + /* _IsFullGroup= */ std::true_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred), std::forward<_Assign>(__assign)); else return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::false_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __scan_copy_single_wg_kernel, + std::integral_constant, + /* _IsFullGroup= */ std::false_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred), std::forward<_Assign>(__assign)); } }; +template +auto +__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, + _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) +{ + using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; + using _ReduceOp = std::plus<_Size>; + using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; + using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; + + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, + _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, + /*_Inclusive=*/std::true_type{}, __is_unique_pattern); +} + template auto @@ -973,7 +1036,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) { - using _ReduceOp = ::std::plus<_Size>; + using _ReduceOp = std::plus<_Size>; using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; using _MaskAssigner = unseq_backend::__mask_assigner<1>; @@ -990,7 +1053,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag 
__backend_tag return __parallel_transform_scan_base( __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view( + oneapi::dpl::__ranges::zip_view( ::std::forward<_InRng>(__in_rng), oneapi::dpl::__ranges::all_view( __mask_buf.get_buffer())), @@ -1007,10 +1070,72 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } -template +template +auto +__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred, + _Assign&& __assign = oneapi::dpl::__internal::__pstl_assign{}) +{ + + auto __n = __rng.size(); + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1, _Assign>; + + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, + /*_IsUniquePattern=*/std::true_type{}); + } + else + { + unseq_backend::__copy_by_mask, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ std::true_type, 1> + __copy_by_mask_op; + oneapi::dpl::__internal::__create_mask_unique_copy, + decltype(__n)> + __create_mask_op{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __copy_by_mask_op); + } +} + +template +auto +__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) +{ + auto __n = __rng.size(); + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; + using _WriteOp = + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else; + + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); + } + else + { + using _ReduceOp = std::plus; + + unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; + unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type> __partition_by_mask{_ReduceOp{}}; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); + } +} + +template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred) + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign&& __assign = _Assign{}) { using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; @@ -1027,7 +1152,6 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, constexpr ::std::uint16_t __single_group_upper_limit = 2048; std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - const bool __dev_has_sg32 = 
__par_backend_hetero::__supports_sub_group_size(__exec, 32); if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) @@ -1035,22 +1159,17 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), - ::std::forward<_OutRng>(__out_rng), __pred); + _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); } - // Reduce-then-scan performs poorly on CPUs due to sub-group operations. - else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + else if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { - using _ReduceOp = std::plus<_Size>; + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>; - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), oneapi::dpl::__par_backend_hetero::__gen_count_pred<_Pred>{__pred}, - _ReduceOp{}, oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred<_Pred>{__pred}, - oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if{}, - oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}); + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + _GenMask{__pred}, _WriteOp{}, /*Unique=*/std::false_type{}); } else { @@ -1058,16 +1177,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 1>; - // Although we do not actually need result storage in this case, we need to construct - // a placeholder here to match the return type of reduce-then-scan - using _TempStorage = __result_and_scratch_storage, _Size>; - _TempStorage __dummy_result_and_scratch{__exec, 0}; - - return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - CreateOp{__pred}, CopyOp{}) - .event(), - __dummy_result_and_scratch); + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + CreateOp{__pred}, CopyOp{}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d216e8e36ac..8ac40cfa91a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -266,13 +266,15 @@ template class __reduce_then_scan_scan_kernel; template + bool __is_unique_pattern_v, typename 
_GenReduceInput, typename _ReduceOp, typename _InitType, + typename _KernelName> struct __parallel_reduce_then_scan_reduce_submitter; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _InitType, + typename... _KernelName> struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - _GenReduceInput, _ReduceOp, _InitType, + __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory @@ -302,7 +304,11 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - + if constexpr (__is_unique_pattern_v) + { + // for unique patterns, the first element is always copied to the output, so we need to skip it + __group_start_idx += 1; + } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = @@ -400,16 +406,16 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu }; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, + typename _ScanInputTransform, typename _WriteOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_scan_submitter; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, + typename _ScanInputTransform, typename _WriteOp, typename _InitType, typename... 
_KernelName> struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> + __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, + _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { template @@ -456,6 +462,11 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + if constexpr (__is_unique_pattern_v) + { + // for unique patterns, the first element is always copied to the output, so we need to skip it + __group_start_idx += 1; + } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -609,8 +620,17 @@ struct __parallel_reduce_then_scan_scan_submitter< oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } - else + else // zeroth block, group and subgroup { + if constexpr (__is_unique_pattern_v) + { + if (__sub_group_local_id == 0) + { + // For unique patterns, always copy the 0th element to the output + __write_op.__assign(__in_rng[0], __out_rng[0]); + } + } + if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) { @@ -673,7 +693,14 @@ struct __parallel_reduce_then_scan_scan_submitter< { if (__block_num + 1 == __num_blocks) { - __res_ptr[0] = __sub_group_carry.__v; + if constexpr (__is_unique_pattern_v) + { + __res_ptr[0] = __sub_group_carry.__v + 1; + } + else + { + __res_ptr[0] = __sub_group_carry.__v; + } } else { @@ -681,7 +708,6 @@ struct __parallel_reduce_then_scan_scan_submitter< __set_block_carry_out(__block_num, __tmp_ptr, __sub_group_carry.__v); } } - __sub_group_carry.__destroy(); }); }); @@ -702,6 +728,16 @@ struct __parallel_reduce_then_scan_scan_submitter< _InitType __init; }; +// reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup +// operations. We do not want to run this can on CPU targets, as they are not performant with this algorithm. 
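// For reference, the call sites added in this patch gate on this helper roughly as follows (sketch only):
//   if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec))
//       /* single-pass __parallel_transform_reduce_then_scan path */
//   else
//       /* legacy multi-pass __parallel_scan_copy / __parallel_transform_scan_base path */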
+template +bool +__prefer_reduce_then_scan(const _ExecutionPolicy& __exec) +{ + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + return (!__exec.queue().get_device().is_cpu() && __dev_has_sg32); +} + // General scan-like algorithm helpers // _GenReduceInput - a function which accepts the input range and index to generate the data needed by the main output // used in the reduction operation (to calculate the global carries) @@ -714,13 +750,13 @@ struct __parallel_reduce_then_scan_scan_submitter< // and performs the final write to output operation template + typename _Inclusive, typename _IsUniquePattern> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, - _ScanInputTransform __scan_input_transform, _WriteOp __write_op, - _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) + _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, + _Inclusive, _IsUniquePattern) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -733,6 +769,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Empirically determined maximum. May be less for non-full blocks. constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; + constexpr bool __is_unique_pattern_v = _IsUniquePattern::value; // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough // that no check is needed? @@ -747,14 +784,22 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::size_t __n = __in_rng.size(); const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __num_remaining = __n; + if constexpr (__is_unique_pattern_v) + { + // skip scan of zeroth element in unique patterns + __num_remaining -= 1; + } + // reduce_then_scan kernel is not built to handle "empty" scans which includes `__n == 1` for unique patterns. + // These trivial end cases should be handled at a higher level. + assert(__num_remaining > 0); auto __inputs_per_sub_group = - __n >= __max_inputs_per_block + __num_remaining >= __max_inputs_per_block ? __max_inputs_per_block / __num_sub_groups_global : std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; - const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); + const auto __block_size = (__num_remaining < __max_inputs_per_block) ? __num_remaining : __max_inputs_per_block; + const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. 
We need two for the block carry-out to prevent a race condition between reading and writing @@ -764,10 +809,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Reduce and scan step implementations using _ReduceSubmitter = - __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = - __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, _ScanKernel>; // TODO: remove below before merging. used for convenience now @@ -797,6 +842,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); + if (__num_remaining > __block_size) { // Resize for the next block. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index a2bfef10fb0..9a935152446 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -237,10 +237,22 @@ template struct __gen_transform_input; template -struct __gen_count_pred; +struct __gen_mask; -template -struct __gen_expand_count_pred; +template +struct __gen_unique_mask; + +template +struct __gen_count_mask; + +template +struct __gen_expand_count_mask; + +template +struct __write_to_idx_if; + +template +struct __write_to_idx_if_else; template struct __early_exit_find_or; @@ -255,15 +267,41 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_pred, _Predicate)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)> : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred, - _Predicate)> - : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, + _BinaryPredicate)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPredicate> +{ +}; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_mask, _GenMask)> + : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask, + _GenMask)> + : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset, + Assign)> + : oneapi::dpl::__internal::__are_all_device_copyable +{ +}; + +template +struct 
sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else, + Assign)> + : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index 969b05ab914..55596ee6473 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -91,35 +91,15 @@ oneapi::dpl::__internal::__difference_t<_Range2> __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { - if (__rng1.empty()) + auto __n = __rng1.size(); + if (__n == 0) return 0; - oneapi::dpl::__internal::__difference_t<_Range2> __rng1_size = __rng1.size(); - - using _Type = typename _InitType::__value_type; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; - using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _NoAssign __no_assign_op; - _NoOpFunctor __get_data_op; - - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_base( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), - ::std::forward<_Range2>(__rng2), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, - __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}) + + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __unary_op, __init, __binary_op, _Inclusive{}) .wait(); - return __rng1_size; + return __n; } template >, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_count_pred - static_assert(sycl::is_device_copyable_v>, - "__gen_count_pred is not device copyable with device copyable types"); + //__gen_mask + static_assert(sycl::is_device_copyable_v>, + "__gen_mask is not device copyable with device copyable types"); - //__gen_expand_count_pred + //__gen_unique_mask static_assert( - sycl::is_device_copyable_v>, - "__gen_expand_count_pred is not device copyable with device copyable types"); + sycl::is_device_copyable_v>, + "__gen_unique_mask is not device copyable with device copyable types"); + + //__gen_count_mask + static_assert(sycl::is_device_copyable_v>>, + "__gen_count_mask is not device copyable with device copyable types"); + + //__gen_expand_count_mask + static_assert(sycl::is_device_copyable_v>>, + "__gen_expand_count_mask is not device copyable with device copyable types"); + + //__write_to_idx_if + static_assert( + sycl::is_device_copyable_v>, + "__write_to_idx_if is not device copyable with device copyable types"); + + //__write_to_idx_if_else + static_assert( + sycl::is_device_copyable_v>, + "__write_to_idx_if_else is not device copyable with device copyable 
types"); // __early_exit_find_or static_assert( @@ -357,20 +377,39 @@ test_non_device_copyable() oneapi::dpl::unseq_backend::__brick_reduce_idx>, "__brick_reduce_idx is device copyable with non device copyable types"); - // //__gen_transform_input + //__gen_transform_input static_assert( !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_count_pred - static_assert( - !sycl::is_device_copyable_v>, - "__gen_count_pred is device copyable with non device copyable types"); + //__gen_mask + static_assert(!sycl::is_device_copyable_v>, + "__gen_mask is device copyable with non device copyable types"); + + //__gen_unique_mask + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_unique_mask>, + "__gen_unique_mask is device copyable with non device copyable types"); + + //__gen_count_mask + static_assert(!sycl::is_device_copyable_v>>, + "__gen_count_mask is device copyable with non device copyable types"); + + //__gen_expand_count_mask + static_assert(!sycl::is_device_copyable_v>>, + "__gen_expand_count_mask is device copyable with non device copyable types"); + + //__write_to_idx_if + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_non_device_copyable>>, + "__write_to_idx_if is device copyable with non device copyable types"); - //__gen_expand_count_pred + //__write_to_idx_if_else static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred>, - "__gen_expand_count_pred is device copyable with non device copyable types"); + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else>, + "__write_to_idx_if_else is device copyable with non device copyable types"); // __early_exit_find_or static_assert( diff --git a/test/support/utils_device_copyable.h b/test/support/utils_device_copyable.h index 7b98de6501c..32e02991933 100644 --- a/test/support/utils_device_copyable.h +++ b/test/support/utils_device_copyable.h @@ -48,6 +48,57 @@ struct noop_non_device_copyable } }; +// Device copyable assignment callable. +// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not +// relying on trivial copyability +struct assign_non_device_copyable +{ + assign_non_device_copyable(const assign_non_device_copyable& other) { std::cout << "non trivial copy ctor\n"; } + template + void + operator()(const _Xp& __x, _Yp&& __y) const + { + std::forward<_Yp>(__y) = __x; + } +}; + +struct assign_device_copyable +{ + assign_device_copyable(const assign_device_copyable& other) { std::cout << "non trivial copy ctor\n"; } + template + void + operator()(const _Xp& __x, _Yp&& __y) const + { + std::forward<_Yp>(__y) = __x; + } +}; + +// Device copyable binary operator binary operators. +// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not +// relying on trivial copyability +struct binary_op_non_device_copyable +{ + binary_op_non_device_copyable(const binary_op_non_device_copyable& other) + { + std::cout << " non trivial copy ctor\n"; + } + int + operator()(int a, int b) const + { + return a; + } +}; + +struct binary_op_device_copyable +{ + binary_op_device_copyable(const binary_op_device_copyable& other) { std::cout << " non trivial copy ctor\n"; } + int + operator()(int a, int b) const + { + return a; + } +}; + // Device copyable int wrapper struct used in testing as surrogate for values, value types, etc. 
// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not // relying on trivial copyability @@ -160,6 +211,16 @@ struct sycl::is_device_copyable : std::true_typ { }; +template <> +struct sycl::is_device_copyable : std::true_type +{ +}; + +template <> +struct sycl::is_device_copyable : std::true_type +{ +}; + template <> struct sycl::is_device_copyable : std::true_type {