From 8dcf2e0a42b8e50b6c81564fce6fc15646c868e5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 17 Jul 2024 09:37:28 -0400 Subject: [PATCH 01/15] partition + unique patterns; ranges API Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 28 ++- .../hetero/algorithm_ranges_impl_hetero.h | 81 +++------ .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 172 ++++++++++-------- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 50 ++++- .../pstl/hetero/numeric_ranges_impl_hetero.h | 37 +--- .../device_copyable.pass.cpp | 68 +++++-- .../alg.modifying.operations/unique.pass.cpp | 4 +- test/support/utils_device_copyable.h | 58 ++++++ 8 files changed, 300 insertions(+), 198 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 3bdc187ce7f..b361405306b 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -886,10 +886,10 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat //------------------------------------------------------------------------ template + typename _GenMask, typename _WriteOp> ::std::pair<_IteratorOrTuple, typename ::std::iterator_traits<_Iterator1>::difference_type> __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 __last, - _IteratorOrTuple __output_first, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) + _IteratorOrTuple __output_first, _GenMask __gen_mask, _WriteOp __write_op) { using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; @@ -904,9 +904,9 @@ __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Itera oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _IteratorOrTuple>(); auto __buf2 = __keep2(__output_first, __output_first + __n); - auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __create_mask_op, - __copy_by_mask_op); + auto __res = + __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + __buf1.all_view(), __buf2.all_view(), __n, __gen_mask, __write_op); ::std::size_t __num_copied = __res.get(); return ::std::make_pair(__output_first + __n, __num_copied); @@ -951,17 +951,14 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e return ::std::make_pair(__result1, __result2); using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - using _ReduceOp = ::std::plus<_It1DifferenceType>; - - unseq_backend::__create_mask<_UnaryPredicate, _It1DifferenceType> __create_mask_op{__pred}; - unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __copy_by_mask_op{_ReduceOp{}}; auto __result = __pattern_scan_copy( __tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __par_backend_hetero::zip( __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)), - __create_mask_op, __copy_by_mask_op); + oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}); return ::std::make_pair(__result1 + __result.second, __result2 + (__last - __first - __result.second)); } @@ 
-977,14 +974,11 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _Iterator2 __result_first, _BinaryPredicate __pred) { using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ ::std::true_type, 1> - __copy_by_mask_op; - __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ - __not_pred<_BinaryPredicate>{__pred}}; - auto __result = __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __result_first, __create_mask_op, __copy_by_mask_op); + auto __result = + __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, + oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if{}); return __result_first + __result.second; } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index dd4dd25c7e3..c5f688441fa 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -334,67 +334,37 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ // copy_if //------------------------------------------------------------------------ -template +template oneapi::dpl::__internal::__difference_t<_Range1> __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) + _GenMask __gen_mask, _WriteOp __write_op) { - if (__rng1.size() == 0) - return __rng1.size(); - - using _SizeType = decltype(__rng1.size()); - using _ReduceOp = ::std::plus<_SizeType>; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _MaskAssigner = unseq_backend::__mask_assigner<1>; - using _InitType = unseq_backend::__no_init_value<_SizeType>; - using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _ReduceOp __reduce_op; - _DataAcc __get_data_op; - _MaskAssigner __add_mask_op; - - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __rng1.size()); - - auto __res = - __par_backend_hetero::__parallel_transform_scan_base( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::zip_view( - __rng1, oneapi::dpl::__ranges::all_view( - __mask_buf.get_buffer())), - __rng2, __reduce_op, _InitType{}, - // local scan - unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, - __add_mask_op, __create_mask_op}, - // scan between groups - unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, - __get_data_op}, - // global scan - __copy_by_mask_op) - .get(); + auto __n = __rng1.size(); + if (__n == 0) + return 0; - return __res; + auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __gen_mask, __write_op); + return __res.get(); } template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Predicate __pred, _Assign) + 
_Predicate __pred, _Assign&& __assign) { - using _SizeType = decltype(__rng1.size()); - using _ReduceOp = ::std::plus<_SizeType>; + auto __n = __rng1.size(); + if (__n == 0) + return 0; - unseq_backend::__create_mask<_Predicate, _SizeType> __create_mask_op{__pred}; - unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ ::std::true_type, 1> __copy_by_mask_op; + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __pred, std::forward<_Assign>(__assign)); - return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - __create_mask_op, __copy_by_mask_op); + return __res.get(); //is a blocking call } //------------------------------------------------------------------------ @@ -433,17 +403,12 @@ template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, - _BinaryPredicate __pred, _Assign) + _BinaryPredicate __pred, _Assign&& __assign) { - using _It1DifferenceType = oneapi::dpl::__internal::__difference_t<_Range1>; - unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, _Assign, /*inclusive*/ ::std::true_type, 1> - __copy_by_mask_op; - __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ - __not_pred<_BinaryPredicate>{__pred}}; - - return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng), ::std::forward<_Range2>(__result), - __create_mask_op, __copy_by_mask_op); + return __pattern_scan_copy(__tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), + oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if{std::forward<_Assign>(__assign)}); } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index fac34509872..c861afc852d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -566,10 +566,10 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __internal::__optional_kernel_name<_ScanKernelName...>> { template + typename _UnaryOp, typename _Assign> auto operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, - _BinaryOperation __bin_op, _UnaryOp __unary_op) + _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) { using _ValueType = ::std::uint16_t; @@ -629,12 +629,13 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __scan_work_group<_ValueType, /* _Inclusive */ false>( __group, __lacc_ptr, __lacc_ptr + __elems_per_wg, __lacc_ptr + __elems_per_wg, __bin_op, - __init); + __init); for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize) { if (__lacc[__idx]) - __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); + __assign(static_cast<__tuple_type>(__in_rng[__idx]), + __out_rng[__lacc[__idx + __elems_per_wg]]); } const ::std::uint16_t __residual = __n % _WGSize; @@ -643,7 +644,8 
@@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W { auto __idx = __residual_start + __item_id; if (__lacc[__idx]) - __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); + __assign(static_cast<__tuple_type>(__in_rng[__idx]), + __out_rng[__lacc[__idx + __elems_per_wg]]); } if (__item_id == 0) @@ -786,7 +788,8 @@ struct __gen_transform_input operator()(InRng&& __in_rng, std::size_t __idx) const { using _ValueType = oneapi::dpl::__internal::__value_t; - using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t::type>; + using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t< + typename std::invoke_result<_UnaryOp, _ValueType>::type>; return _OutValueType{__unary_op(__in_rng[__idx])}; } _UnaryOp __unary_op; @@ -808,19 +811,46 @@ struct __simple_write_to_idx }; template -struct __gen_count_pred +struct __gen_mask +{ + template + bool + operator()(_InRng&& __in_rng, std::size_t __idx) const + { + return __pred(__in_rng[__idx]); + } + _Predicate __pred; +}; + +template +struct __gen_unique_mask +{ + template + bool + operator()(_InRng&& __in_rng, std::size_t __idx) const + { + if (__idx == 0) + return true; + else + return !__pred(__in_rng[__idx], __in_rng[__idx - 1]); + } + _BinaryPredicate __pred; +}; + +template +struct __gen_count_mask { template _SizeType operator()(_InRng&& __in_rng, _SizeType __idx) const { - return __pred(__in_rng[__idx]) ? _SizeType{1} : _SizeType{0}; + return __gen_mask(std::forward<_InRng>(__in_rng), __idx) ? _SizeType{1} : _SizeType{0}; } - _Predicate __pred; + _GenMask __gen_mask; }; -template -struct __gen_expand_count_pred +template +struct __gen_expand_count_mask { template auto @@ -832,10 +862,10 @@ struct __gen_expand_count_pred using _ElementType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t>; _ElementType ele = __in_rng[__idx]; - bool mask = __pred(ele); + bool mask = __gen_mask(__in_rng, __idx); return std::tuple(mask ? 
_SizeType{1} : _SizeType{0}, mask, ele); } - _Predicate __pred; + _GenMask __gen_mask; }; struct __get_zeroth_element @@ -847,7 +877,7 @@ struct __get_zeroth_element return std::get<0>(std::forward<_Tp>(__a)); } }; - +template struct __write_to_idx_if { template @@ -860,8 +890,27 @@ struct __write_to_idx_if typename oneapi::dpl::__internal::__get_tuple_type(__v))>, std::decay_t>::__type; if (std::get<1>(__v)) - __out_rng[std::get<0>(__v) - 1] = static_cast<_ConvertedTupleType>(std::get<2>(__v)); + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1]); } + Assign __assign; +}; + +template +struct __write_to_idx_if_else +{ + template + void + operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const + { + using _ConvertedTupleType = + typename oneapi::dpl::__internal::__get_tuple_type(__v))>, + std::decay_t>::__type; + if (std::get<1>(__v)) + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<0>(__out[std::get<0>(__v) - 1])); + else + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<1>(__out[__idx - std::get<0>(__v)])); + } + Assign __assign; }; template + template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred, + typename _Assign = oneapi::dpl::__internal::__pstl_assign> auto - operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred) + operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred, + _Assign&& __assign) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); @@ -958,71 +1009,47 @@ struct __invoke_single_group_copy_if if (__is_full_group) return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::true_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, + /* _IsFullGroup= */ std::true_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred), std::forward<_Assign>(__assign)); else return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::false_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, + /* _IsFullGroup= */ std::false_type, _CustomName>>>()( + 
__exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred), std::forward<_Assign>(__assign)); } }; -template +template auto __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, - _CopyByMaskOp __copy_by_mask_op) + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op) { using _ReduceOp = ::std::plus<_Size>; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _MaskAssigner = unseq_backend::__mask_assigner<1>; - using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - using _InitType = unseq_backend::__no_init_value<_Size>; - - _Assigner __assign_op; - _ReduceOp __reduce_op; - _DataAcc __get_data_op; - _MaskAssigner __add_mask_op; - - // temporary buffer to store boolean mask - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n); - - return __parallel_transform_scan_base( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view( - ::std::forward<_InRng>(__in_rng), - oneapi::dpl::__ranges::all_view( - __mask_buf.get_buffer())), - ::std::forward<_OutRng>(__out_rng), __reduce_op, _InitType{}, - // local scan - unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, - __add_mask_op, __create_mask_op}, - // scan between groups - unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, - __get_data_op}, - // global scan - __copy_by_mask_op); + + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), + oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>{__generate_mask}, _ReduceOp{}, + oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>{__generate_mask}, + oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, __write_op, + oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, + /*_Inclusive=*/std::true_type{}); } -template +template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred) + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign&& __assign = _Assign{}) { using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; @@ -1047,20 +1074,23 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), - ::std::forward<_OutRng>(__out_rng), __pred); + _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); } // Reduce-then-scan performs poorly on CPUs due to sub-group operations. 
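A note on the composition (an editorial sketch, not part of the patch): the dispatch below routes large non-CPU inputs to __parallel_transform_reduce_then_scan, where __gen_mask evaluates the predicate, __gen_count_mask supplies 0/1 counts for the reduce phase, __gen_expand_count_mask produces (count, mask, element) tuples for the scan phase, and __write_to_idx_if scatters the kept elements. A minimal serial model of that single-pass copy_if, assuming a hypothetical serial_scan_copy_model driver in place of the two device kernels:

#include <cstddef>

// Serial stand-in for the reduce-then-scan copy_if composition. The carry
// plays the role of the value normally propagated across sub-groups/blocks.
template <typename _InRng, typename _OutRng, typename _Pred, typename _Assign>
std::size_t
serial_scan_copy_model(const _InRng& __in, _OutRng& __out, _Pred __pred, _Assign __assign)
{
    std::size_t __carry = 0;
    for (std::size_t __idx = 0; __idx < __in.size(); ++__idx)
    {
        const bool __mask = __pred(__in[__idx]);                // __gen_mask
        const std::size_t __count = __carry + (__mask ? 1 : 0); // inclusive scan over __gen_count_mask
        if (__mask)                                             // __write_to_idx_if on (count, mask, element)
            __assign(__in[__idx], __out[__count - 1]);
        __carry = __count;
    }
    return __carry; // number of elements written, as returned through the future
}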
else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) { using _ReduceOp = std::plus<_Size>; + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; + _GenMask __generate_mask{__pred}; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), oneapi::dpl::__par_backend_hetero::__gen_count_pred<_Pred>{__pred}, - _ReduceOp{}, oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred<_Pred>{__pred}, + std::forward<_OutRng>(__out_rng), + oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>{__generate_mask}, _ReduceOp{}, + oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>{__generate_mask}, oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if{}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<_Assign>{std::forward<_Assign>(__assign)}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index a2bfef10fb0..210845ec0b5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -237,10 +237,22 @@ template struct __gen_transform_input; template -struct __gen_count_pred; +struct __gen_mask; -template -struct __gen_expand_count_pred; +template +struct __gen_unique_mask; + +template +struct __gen_count_mask; + +template +struct __gen_expand_count_mask; + +template +struct __write_to_idx_if; + +template +struct __write_to_idx_if_else; template struct __early_exit_find_or; @@ -255,15 +267,37 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_pred, _Predicate)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)> : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred, - _Predicate)> - : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, _BinaryPredicate)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPredicate> +{ +}; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_mask, _GenMask)> + : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask, _GenMask)> + : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, Assign)> + : oneapi::dpl::__internal::__are_all_device_copyable +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else, Assign)> + : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index 969b05ab914..1c3ef965495 
100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -91,35 +91,16 @@ oneapi::dpl::__internal::__difference_t<_Range2> __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { - if (__rng1.empty()) + auto __n = __rng1.size(); + if (__n == 0) return 0; - oneapi::dpl::__internal::__difference_t<_Range2> __rng1_size = __rng1.size(); - - using _Type = typename _InitType::__value_type; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; - using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _NoAssign __no_assign_op; - _NoOpFunctor __get_data_op; - - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_base( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), - ::std::forward<_Range2>(__rng2), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, - __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}) - .wait(); - return __rng1_size; + + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(_BackendTag{}, + std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __unary_op, + __init, __binary_op, _Inclusive{}).wait(); + return __n; } template >, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_count_pred - static_assert(sycl::is_device_copyable_v>, - "__gen_count_pred is not device copyable with device copyable types"); - - //__gen_expand_count_pred + //__gen_mask static_assert( - sycl::is_device_copyable_v>, - "__gen_expand_count_pred is not device copyable with device copyable types"); + sycl::is_device_copyable_v>, + // "__gen_mask is not device copyable with device copyable types"); + + //__gen_unique_mask + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_unique_mask>, + "__gen_unique_mask is not device copyable with device copyable types"); + + //__gen_count_mask + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_count_mask>>, + "__gen_count_mask is not device copyable with device copyable types"); + + //__gen_expand_count_mask + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask>>, + "__gen_expand_count_mask is not device copyable with device copyable types"); + + //__write_to_idx_if + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_to_idx_if>, + "__write_to_idx_if is not device copyable with device copyable types"); + + //__write_to_idx_if_else + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else>, + "__write_to_idx_if_else is not device copyable with device copyable types"); // __early_exit_find_or 
static_assert( @@ -357,20 +378,39 @@ test_non_device_copyable() oneapi::dpl::unseq_backend::__brick_reduce_idx>, "__brick_reduce_idx is device copyable with non device copyable types"); - // //__gen_transform_input + //__gen_transform_input static_assert( !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_count_pred + //__gen_mask + static_assert(!sycl::is_device_copyable_v>, + "__gen_mask is device copyable with non device copyable types"); + + //__gen_unique_mask + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_unique_mask>, + "__gen_unique_mask is device copyable with non device copyable types"); + + //__gen_count_mask + static_assert(!sycl::is_device_copyable_v>>, + "__gen_count_mask is device copyable with non device copyable types"); + + //__gen_expand_count_mask + static_assert(!sycl::is_device_copyable_v>>, + "__gen_expand_count_mask is device copyable with non device copyable types"); + + //__write_to_idx_if static_assert( - !sycl::is_device_copyable_v>, - "__gen_count_pred is device copyable with non device copyable types"); + !sycl::is_device_copyable_v>, + "__write_to_idx_if is device copyable with non device copyable types"); - //__gen_expand_count_pred + //__write_to_idx_if_else static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred>, - "__gen_expand_count_pred is device copyable with non device copyable types"); + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else>, + "__write_to_idx_if_else is device copyable with non device copyable types"); // __early_exit_find_or static_assert( diff --git a/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp index 8d8d8e6c20c..f2388ecef8d 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp @@ -40,7 +40,7 @@ struct run_unique ForwardIt k = unique(exec, first2, last2); auto n = ::std::distance(first1, i); - EXPECT_TRUE(::std::distance(first2, k) == n, "wrong return value from unique without predicate"); + EXPECT_EQ(::std::distance(first2, k), n, "wrong return value from unique without predicate"); EXPECT_EQ_N(first1, first2, n, "wrong effect from unique without predicate"); } }; @@ -63,7 +63,7 @@ struct run_unique_predicate ForwardIt k = unique(exec, first2, last2, pred); auto n = ::std::distance(first1, i); - EXPECT_TRUE(::std::distance(first2, k) == n, "wrong return value from unique with predicate"); + EXPECT_EQ(::std::distance(first2, k), n, "wrong return value from unique with predicate"); EXPECT_EQ_N(first1, first2, n, "wrong effect from unique with predicate"); } }; diff --git a/test/support/utils_device_copyable.h b/test/support/utils_device_copyable.h index 7b98de6501c..2e83eadf3e6 100644 --- a/test/support/utils_device_copyable.h +++ b/test/support/utils_device_copyable.h @@ -48,6 +48,54 @@ struct noop_non_device_copyable } }; +// Device copyable assignment callable. 
+// Intentionally non-trivially copyable to test that the device_copyable specialization works and we are not
+// relying on trivial copyability
+struct assign_non_device_copyable
+{
+    assign_non_device_copyable(const assign_non_device_copyable& other) { std::cout << "non trivial copy ctor\n"; }
+    template <typename _Xp, typename _Yp>
+    void
+    operator()(const _Xp& __x, _Yp&& __y) const
+    {
+        ::std::forward<_Yp>(__y) = __x;
+    }
+};
+
+struct assign_device_copyable
+{
+    assign_device_copyable(const assign_device_copyable& other) { std::cout << "non trivial copy ctor\n"; }
+    template <typename _Xp, typename _Yp>
+    void
+    operator()(const _Xp& __x, _Yp&& __y) const
+    {
+        ::std::forward<_Yp>(__y) = __x;
+    }
+};
+
+// Device copyable binary operators.
+// Intentionally non-trivially copyable to test that the device_copyable specialization works and we are not
+// relying on trivial copyability
+struct binary_op_non_device_copyable
+{
+    binary_op_non_device_copyable(const binary_op_non_device_copyable& other) { std::cout << "non trivial copy ctor\n"; }
+    int
+    operator()(int a, int b) const
+    {
+        return a;
+    }
+};
+
+struct binary_op_device_copyable
+{
+    binary_op_device_copyable(const binary_op_device_copyable& other) { std::cout << "non trivial copy ctor\n"; }
+    int
+    operator()(int a, int b) const
+    {
+        return a;
+    }
+};
+
 // Device copyable int wrapper struct used in testing as surrogate for values, value types, etc.
 // Intentionally non-trivially copyable to test that the device_copyable specialization works and we are not
 // relying on trivial copyability
@@ -160,6 +208,16 @@ struct sycl::is_device_copyable : std::true_typ
 {
 };
 
+template <>
+struct sycl::is_device_copyable<assign_device_copyable> : std::true_type
+{
+};
+
+template <>
+struct sycl::is_device_copyable<binary_op_device_copyable> : std::true_type
+{
+};
+
 template <>
 struct sycl::is_device_copyable : std::true_type
 {

From eeeb3cd4f0986e9eca7bacd9a472866c52283257 Mon Sep 17 00:00:00 2001
From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com>
Date: Thu, 1 Aug 2024 11:09:26 -0400
Subject: [PATCH 02/15] [PROTOTYPE] Optimization for unique (#1743)

Unique Optimization and specializing n==1

---------

Signed-off-by: Dan Hoeflinger
---
 .../dpl/pstl/hetero/algorithm_impl_hetero.h   | 25 +++++--
 .../hetero/algorithm_ranges_impl_hetero.h     |  8 +-
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 23 +++---
 .../parallel_backend_sycl_reduce_then_scan.h  | 74 +++++++++++++------
 .../dpl/pstl/hetero/dpcpp/sycl_traits.h       |  6 +-
 .../device_copyable.pass.cpp                  |  6 +-
 6 files changed, 92 insertions(+), 50 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
index b361405306b..30e66f76202 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -886,10 +886,10 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat
 //------------------------------------------------------------------------
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _Iterator1, typename _IteratorOrTuple,
-          typename _GenMask, typename _WriteOp>
+          typename _GenMask, typename _WriteOp, typename _IsUniquePattern>
 ::std::pair<_IteratorOrTuple, typename ::std::iterator_traits<_Iterator1>::difference_type>
 __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 __last,
-                    _IteratorOrTuple __output_first, _GenMask __gen_mask, _WriteOp __write_op)
+                    _IteratorOrTuple __output_first, _GenMask __gen_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern)
 {
     using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type;
 
@@ -898,15 
+898,26 @@ __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Itera _It1DifferenceType __n = __last - __first; + if constexpr (_IsUniquePattern::value) + { + if (__n == 1) + { + oneapi::dpl::__internal::__pattern_walk2_brick( + __hetero_tag<_BackendTag>{}, std::forward<_ExecutionPolicy>(__exec), __first, __last, __output_first, + oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}); + return std::make_pair(__output_first + 1, 1); + } + } + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); auto __buf1 = __keep1(__first, __last); auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _IteratorOrTuple>(); auto __buf2 = __keep2(__output_first, __output_first + __n); - auto __res = - __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __gen_mask, __write_op); + auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + __buf1.all_view(), __buf2.all_view(), __n, __gen_mask, + __write_op, __is_unique_pattern); ::std::size_t __num_copied = __res.get(); return ::std::make_pair(__output_first + __n, __num_copied); @@ -958,7 +969,7 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)), oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}); + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}, /*_IsUniquePattern=*/std::false_type{}); return ::std::make_pair(__result1 + __result.second, __result2 + (__last - __first - __result.second)); } @@ -978,7 +989,7 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec auto __result = __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if{}); + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{}, /*_IsUniquePattern=*/std::true_type{}); return __result_first + __result.second; } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index c5f688441fa..d2fd5e4b60d 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -335,10 +335,10 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ //------------------------------------------------------------------------ template + typename _WriteOp, typename _IsUniquePattern> oneapi::dpl::__internal::__difference_t<_Range1> __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _GenMask __gen_mask, _WriteOp __write_op) + _GenMask __gen_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { auto __n = __rng1.size(); if (__n == 0) @@ -346,7 +346,7 @@ __pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), 
                                                               std::forward<_Range1>(__rng1),
-                                                              std::forward<_Range2>(__rng2), __n, __gen_mask, __write_op);
+                                                              std::forward<_Range2>(__rng2), __n, __gen_mask, __write_op, __is_unique_pattern);
     return __res.get();
 }
 
@@ -408,7 +408,7 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec
     return __pattern_scan_copy(__tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
                                std::forward<_Range2>(__result),
                                oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred},
-                               oneapi::dpl::__par_backend_hetero::__write_to_idx_if{std::forward<_Assign>(__assign)});
+                               oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{std::forward<_Assign>(__assign)}, /*_IsUniquePattern=*/std::true_type{});
 }
 
 //------------------------------------------------------------------------

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index c861afc852d..1f00739006f 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -829,10 +829,8 @@ struct __gen_unique_mask
     bool
     operator()(_InRng&& __in_rng, std::size_t __idx) const
     {
-        if (__idx == 0)
-            return true;
-        else
-            return !__pred(__in_rng[__idx], __in_rng[__idx - 1]);
+        // the starting index is offset to 1 for "unique" patterns; the copy of the 0th element is handled separately
+        return !__pred(__in_rng[__idx], __in_rng[__idx - 1]);
     }
     _BinaryPredicate __pred;
 };
@@ -877,7 +875,7 @@ struct __get_zeroth_element
         return std::get<0>(std::forward<_Tp>(__a));
     }
 };
-template <typename Assign>
+template <std::size_t __offset, typename Assign>
 struct __write_to_idx_if
 {
     template <typename _OutRng, typename _SizeType, typename ValueType>
@@ -890,7 +888,7 @@ struct __write_to_idx_if
             typename oneapi::dpl::__internal::__get_tuple_type<std::decay_t<decltype(std::get<2>(__v))>,
                                                                std::decay_t<decltype(__out_rng[__idx])>>::__type;
         if (std::get<1>(__v))
-            __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1]);
+            __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1 + __offset]);
     }
     Assign __assign;
 };
@@ -951,7 +949,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
         return __parallel_transform_reduce_then_scan(
             __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
             std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
-            oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
+            oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{});
 }
 }
 {
@@ -1028,10 +1026,11 @@ struct __invoke_single_group_copy_if
 };
 
 template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _GenMask,
-          typename _WriteOp>
+          typename _WriteOp, typename _IsUniquePattern>
 auto
 __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
-                     _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op)
+                     _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op,
+                     _IsUniquePattern __is_unique_pattern)
 {
     using _ReduceOp = ::std::plus<_Size>;
@@ -1042,7 +1041,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
         oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>{__generate_mask},
         oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, __write_op,
         oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
-        /*_Inclusive=*/std::true_type{});
+        /*_Inclusive=*/std::true_type{}, __is_unique_pattern);
 }
 
 template {__generate_mask}, _ReduceOp{}, 
oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>{__generate_mask}, oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<_Assign>{std::forward<_Assign>(__assign)}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>{std::forward<_Assign>(__assign)}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}); + /*_Inclusive=*/std::true_type{}, /*Unique=*/std::false_type{}); } else { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d216e8e36ac..814dc1e089e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -266,13 +266,15 @@ template class __reduce_then_scan_scan_kernel; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _InitType, + typename _KernelName> struct __parallel_reduce_then_scan_reduce_submitter; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _InitType, + typename... _KernelName> struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - _GenReduceInput, _ReduceOp, _InitType, + __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory @@ -302,7 +304,11 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - + if constexpr (__is_unique_pattern_v) + { + // for unique patterns, the first element is always copied to the output, so we need to skip it + __group_start_idx += 1; + } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = @@ -400,16 +406,16 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu }; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, + typename _ScanInputTransform, typename _WriteOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_scan_submitter; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, + typename _ScanInputTransform, typename _WriteOp, typename _InitType, typename... 
_KernelName> struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> + __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, + _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { template @@ -456,6 +462,11 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + if constexpr (__is_unique_pattern_v) + { + // for unique patterns, the first element is always copied to the output, so we need to skip it + __group_start_idx += 1; + } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -609,8 +620,17 @@ struct __parallel_reduce_then_scan_scan_submitter< oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } - else + else // zeroth block, group and subgroup { + if constexpr (__is_unique_pattern_v) + { + if (__sub_group_local_id == 0) + { + // For unique patterns, always copy the 0th element to the output + __write_op.__assign(__in_rng[0], __out_rng[0]); + } + } + if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) { @@ -673,7 +693,14 @@ struct __parallel_reduce_then_scan_scan_submitter< { if (__block_num + 1 == __num_blocks) { - __res_ptr[0] = __sub_group_carry.__v; + if constexpr (__is_unique_pattern_v) + { + __res_ptr[0] = __sub_group_carry.__v + 1; + } + else + { + __res_ptr[0] = __sub_group_carry.__v; + } } else { @@ -681,7 +708,6 @@ struct __parallel_reduce_then_scan_scan_submitter< __set_block_carry_out(__block_num, __tmp_ptr, __sub_group_carry.__v); } } - __sub_group_carry.__destroy(); }); }); @@ -714,13 +740,13 @@ struct __parallel_reduce_then_scan_scan_submitter< // and performs the final write to output operation template + typename _Inclusive, typename _IsUniquePattern> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, - _ScanInputTransform __scan_input_transform, _WriteOp __write_op, - _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) + _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, + _Inclusive, _IsUniquePattern) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -733,6 +759,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Empirically determined maximum. May be less for non-full blocks. constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; + constexpr bool __is_unique_pattern_v = _IsUniquePattern::value; // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough // that no check is needed? 
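The sizing hunk below peels one element off the input for unique patterns, since element 0 is copied unconditionally, before computing the launch shape. A standalone sketch of that arithmetic, with illustrative device constants (the patch queries the real values from the device; the __sketch_* names are hypothetical):

#include <algorithm>
#include <cstddef>

// Stand-in for oneapi::dpl::__internal::__dpl_bit_ceil: round up to a power of two.
inline std::size_t
__sketch_bit_ceil(std::size_t __n)
{
    std::size_t __r = 1;
    while (__r < __n)
        __r <<= 1;
    return __r;
}

struct __sketch_shape
{
    std::size_t __inputs_per_sub_group, __inputs_per_item, __block_size, __num_blocks;
};

// Mirrors the sizing logic in __parallel_transform_reduce_then_scan; callers
// guarantee at least one element remains, so the divisions below are safe.
inline __sketch_shape
__sketch_compute_shape(std::size_t __n, bool __is_unique_pattern)
{
    constexpr std::size_t __sub_group_size = 32, __work_group_size = 256, // assumed values
                          __num_work_groups = 64, __max_inputs_per_item = 128;
    const std::size_t __num_sub_groups_global = (__work_group_size / __sub_group_size) * __num_work_groups;
    const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups;
    const std::size_t __num_remaining = __n - (__is_unique_pattern ? 1 : 0); // skip the unconditional copy of element 0
    const std::size_t __inputs_per_sub_group =
        __num_remaining >= __max_inputs_per_block
            ? __max_inputs_per_block / __num_sub_groups_global
            : std::max(__sub_group_size, __sketch_bit_ceil(__num_remaining) / __num_sub_groups_global);
    const std::size_t __block_size = std::min(__num_remaining, __max_inputs_per_block);
    return {__inputs_per_sub_group, __inputs_per_sub_group / __sub_group_size, __block_size,
            __num_remaining / __block_size + (__num_remaining % __block_size != 0)};
}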
@@ -747,14 +774,19 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::size_t __n = __in_rng.size(); const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __num_remaining = __n; + if constexpr (__is_unique_pattern_v) + { + // skip scan of zeroth element in unique patterns + __num_remaining -= 1; + } auto __inputs_per_sub_group = - __n >= __max_inputs_per_block + __num_remaining >= __max_inputs_per_block ? __max_inputs_per_block / __num_sub_groups_global : std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; - const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); + const auto __block_size = (__num_remaining < __max_inputs_per_block) ? __num_remaining : __max_inputs_per_block; + const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing @@ -764,10 +796,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Reduce and scan step implementations using _ReduceSubmitter = - __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = - __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, _ScanKernel>; // TODO: remove below before merging. 
used for convenience now diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 210845ec0b5..8184b5d5369 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -248,7 +248,7 @@ struct __gen_count_mask; template struct __gen_expand_count_mask; -template +template struct __write_to_idx_if; template @@ -289,8 +289,8 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, Assign)> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset, Assign)> : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 9ac35baf5c8..1367df1151b 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -153,7 +153,7 @@ test_device_copyable() //__gen_mask static_assert( sycl::is_device_copyable_v>, - // "__gen_mask is not device copyable with device copyable types"); + "__gen_mask is not device copyable with device copyable types"); //__gen_unique_mask static_assert(sycl::is_device_copyable_v< @@ -172,7 +172,7 @@ test_device_copyable() //__write_to_idx_if static_assert(sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__write_to_idx_if>, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_device_copyable>>, "__write_to_idx_if is not device copyable with device copyable types"); //__write_to_idx_if_else @@ -404,7 +404,7 @@ test_non_device_copyable() //__write_to_idx_if static_assert( - !sycl::is_device_copyable_v>, + !sycl::is_device_copyable_v>, "__write_to_idx_if is device copyable with non device copyable types"); //__write_to_idx_if_else From 531f2a4809e7f90ac93062d7b30171b2507750d7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 11:57:44 -0400 Subject: [PATCH 03/15] fixes for refactor decide alg at __parallel level Signed-off-by: Dan Hoeflinger fixes for previous commit (squash) Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 84 ++++----- .../hetero/algorithm_ranges_impl_hetero.h | 36 ++-- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 178 ++++++++++++------ .../parallel_backend_sycl_reduce_then_scan.h | 5 + .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 22 ++- 5 files changed, 194 insertions(+), 131 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 30e66f76202..fa52c80886e 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -885,44 +885,6 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat // copy_if //------------------------------------------------------------------------ -template -::std::pair<_IteratorOrTuple, typename ::std::iterator_traits<_Iterator1>::difference_type> -__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 __last, - _IteratorOrTuple __output_first, _GenMask __gen_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) -{ - using _It1DifferenceType = typename 
::std::iterator_traits<_Iterator1>::difference_type; - - if (__first == __last) - return ::std::make_pair(__output_first, _It1DifferenceType{0}); - - _It1DifferenceType __n = __last - __first; - - if constexpr (_IsUniquePattern::value) - { - if (__n == 1) - { - oneapi::dpl::__internal::__pattern_walk2_brick( - __hetero_tag<_BackendTag>{}, std::forward<_ExecutionPolicy>(__exec), __first, __last, __output_first, - oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}); - return std::make_pair(__output_first + 1, 1); - } - } - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); - auto __buf1 = __keep1(__first, __last); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _IteratorOrTuple>(); - auto __buf2 = __keep2(__output_first, __output_first + __n); - - auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __gen_mask, - __write_op, __is_unique_pattern); - - ::std::size_t __num_copied = __res.get(); - return ::std::make_pair(__output_first + __n, __num_copied); -} - template _Iterator2 @@ -963,15 +925,22 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - auto __result = __pattern_scan_copy( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __par_backend_hetero::zip( - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)), - oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}, /*_IsUniquePattern=*/std::false_type{}); + _It1DifferenceType __n = __last - __first; - return ::std::make_pair(__result1 + __result.second, __result2 + (__last - __first - __result.second)); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + + auto __zipped_res = __par_backend_hetero::zip( + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)); + + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); + auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); + + auto __result = + oneapi::dpl::__par_backend_hetero::__parallel_partition_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + + return ::std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); } //------------------------------------------------------------------------ @@ -986,12 +955,27 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec { using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; + _It1DifferenceType __n = __last - __first; + + if (__n == 0) + return __result_first; + if (__n == 1) + { + oneapi::dpl::__internal::__pattern_walk2_brick( + __hetero_tag<_BackendTag>{}, std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, + 
oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}); + return __result_first + 1; + } + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); + auto __buf2 = __keep2(__result_first, __result_first + __n); + auto __result = - __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, - oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{}, /*_IsUniquePattern=*/std::true_type{}); + oneapi::dpl::__par_backend_hetero::__parallel_unique_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); - return __result_first + __result.second; + return __result_first + __result.get(); } template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index d2fd5e4b60d..05459d3d51e 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -334,22 +334,6 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ // copy_if //------------------------------------------------------------------------ -template -oneapi::dpl::__internal::__difference_t<_Range1> -__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _GenMask __gen_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) -{ - auto __n = __rng1.size(); - if (__n == 0) - return 0; - - auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __n, __gen_mask, __write_op, __is_unique_pattern); - return __res.get(); -} - template oneapi::dpl::__internal::__difference_t<_Range2> @@ -405,10 +389,22 @@ oneapi::dpl::__internal::__difference_t<_Range2> __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred, _Assign&& __assign) { - return __pattern_scan_copy(__tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), - oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{std::forward<_Assign>(__assign)}, /*_IsUniquePattern=*/std::true_type{}); + auto __n = __rng.size(); + if (__n == 0) + return 0; + if (__n == 1) + { + using CopyBrick = oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>; + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), + std::forward<_Range2>(__result)).get(); + + return 1; + } + + return oneapi::dpl::__par_backend_hetero::__parallel_unique_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __pred, std::forward<_Assign>(__assign)).get(); } //------------------------------------------------------------------------ diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1f00739006f..bbd7c0e6155 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -260,29 +260,6 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& //------------------------------------------------------------------------ // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -template -struct __global_scan_caller -{ - __global_scan_caller(const _GlobalScan& __global_scan, const _Range2& __rng2, const _Range1& __rng1, - const _Accessor& __wg_sums_acc, _Size __n, ::std::size_t __size_per_wg) - : __m_global_scan(__global_scan), __m_rng2(__rng2), __m_rng1(__rng1), __m_wg_sums_acc(__wg_sums_acc), - __m_n(__n), __m_size_per_wg(__size_per_wg) - { - } - - void operator()(sycl::item<1> __item) const - { - __m_global_scan(__item, __m_rng2, __m_rng1, __m_wg_sums_acc, __m_n, __m_size_per_wg); - } - - private: - _GlobalScan __m_global_scan; - _Range2 __m_rng2; - _Range1 __m_rng1; - _Accessor __m_wg_sums_acc; - _Size __m_n; - ::std::size_t __m_size_per_wg; -}; // Please see the comment for __parallel_for_submitter for optional kernel name explanation template @@ -327,14 +304,16 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name auto __size_per_wg = __iters_per_witem * __wgroup_size; auto __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_wg); // Storage for the results of scan for each workgroup - sycl::buffer<_Type> __wg_sums(__n_groups); + + using _TempStorage = __result_and_scratch_storage, _Type>; + _TempStorage __result_and_scratch{__exec, __n_groups + 1}; _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); // 1. 
Local scan on each workgroup auto __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer - auto __wg_sums_acc = __wg_sums.template get_access(__cgh); + auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh); __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh); #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle()); @@ -344,7 +323,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name __kernel_1, #endif sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) { - __local_scan(__item, __n, __local_acc, __rng1, __rng2, __wg_sums_acc, __size_per_wg, __wgroup_size, + auto __temp_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__temp_acc); + __local_scan(__item, __n, __local_acc, __rng1, __rng2, __temp_ptr, __size_per_wg, __wgroup_size, __iters_per_witem, __init); }); }); @@ -354,7 +334,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name auto __iters_per_single_wg = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __wgroup_size); __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) { __cgh.depends_on(__submit_event); - auto __wg_sums_acc = __wg_sums.template get_access(__cgh); + auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh); __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh); #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle()); @@ -365,8 +345,9 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name #endif // TODO: try to balance work between several workgroups instead of one sycl::nd_range<1>(__wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) { - __group_scan(__item, __n_groups, __local_acc, __wg_sums_acc, __wg_sums_acc, - /*dummy*/ __wg_sums_acc, __n_groups, __wgroup_size, __iters_per_single_wg); + auto __temp_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__temp_acc); + __group_scan(__item, __n_groups, __local_acc, __temp_ptr, __temp_ptr, + /*dummy*/ __temp_ptr, __n_groups, __wgroup_size, __iters_per_single_wg); }); }); } @@ -375,15 +356,17 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name auto __final_event = __exec.queue().submit([&](sycl::handler& __cgh) { __cgh.depends_on(__submit_event); oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer - auto __wg_sums_acc = __wg_sums.template get_access(__cgh); + auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh); + auto __res_acc = __result_and_scratch.__get_result_acc(__cgh); __cgh.parallel_for<_PropagateScanName...>( - sycl::range<1>(__n_groups * __size_per_wg), - __global_scan_caller<_GlobalScan, ::std::decay_t<_Range2>, ::std::decay_t<_Range1>, - decltype(__wg_sums_acc), decltype(__n)>(__global_scan, __rng2, __rng1, - __wg_sums_acc, __n, __size_per_wg)); + sycl::range<1>(__n_groups * __size_per_wg),[=](auto __item) { + auto __temp_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__temp_acc); + auto __res_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__res_acc, __n_groups + 1); + __global_scan(__item, __rng2, __rng1, __temp_ptr, __res_ptr, __n, __size_per_wg); + }); }); - return __future(__final_event, sycl::buffer(__wg_sums, sycl::id<1>(__n_groups - 1), 
sycl::range<1>(1))); + return __future(__final_event, __result_and_scratch); } }; @@ -949,7 +932,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{} /*_IsUniquePattern=*/std::false_type{}); + oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); } } { @@ -962,13 +945,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; - // Although we do not actually need result storage in this case, we need to construct - // a placeholder here to match the return type of reduce-then-scan - using _TempStorage = __result_and_scratch_storage, _Type>; - _TempStorage __dummy_result_and_scratch{__exec, 0}; - return - __future(__parallel_transform_scan_base( + __parallel_transform_scan_base( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __binary_op, __init, // local scan @@ -980,8 +958,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{ __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(), - __dummy_result_and_scratch); + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } } @@ -1028,12 +1005,11 @@ struct __invoke_single_group_copy_if template auto -__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { - using _ReduceOp = ::std::plus<_Size>; - + using _ReduceOp = std::plus<_Size>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), @@ -1044,6 +1020,104 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag /*_Inclusive=*/std::true_type{}, __is_unique_pattern); } +template +auto +__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, + _CopyByMaskOp __copy_by_mask_op) +{ + using _ReduceOp = std::plus<_Size>; + using _Assigner = unseq_backend::__scan_assigner; + using _NoAssign = unseq_backend::__scan_no_assign; + using _MaskAssigner = unseq_backend::__mask_assigner<1>; + using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; + using _InitType = unseq_backend::__no_init_value<_Size>; + + _Assigner __assign_op; + _ReduceOp __reduce_op; + _DataAcc __get_data_op; + _MaskAssigner __add_mask_op; + + // temporary buffer to store boolean mask + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n); + 
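+        // The scan-copy below is assembled from three functors sharing the zipped (input, mask) view: a local scan +        // that applies __create_mask_op per element and produces per-workgroup sums, an inter-group scan that +        // combines those sums, and a final global pass (__copy_by_mask_op) that scatters each selected element to +        // its scanned output index.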
+ return __parallel_transform_scan_base( + __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::zip_view( + ::std::forward<_InRng>(__in_rng), + oneapi::dpl::__ranges::all_view( + __mask_buf.get_buffer())), + ::std::forward<_OutRng>(__out_rng), __reduce_op, _InitType{}, + // local scan + unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, + __add_mask_op, __create_mask_op}, + // scan between groups + unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, + __get_data_op}, + // global scan + __copy_by_mask_op); + +} + +template +auto +__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, + _BinaryPredicate __pred, _Assign&& __assign = oneapi::dpl::__internal::__pstl_assign{}) +{ + + auto __n = __rng.size(); + // choice between legacy and reduce_then_scan + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + { + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, + oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{std::forward<_Assign>(__assign)}, /*_IsUniquePattern=*/std::true_type{}); + } + else + { + unseq_backend::__copy_by_mask<::std::plus, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ ::std::true_type, 1> + __copy_by_mask_op; + oneapi::dpl::__internal::__create_mask_unique_copy, decltype(__n)> __create_mask_op{ + oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; + + return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, __create_mask_op, __copy_by_mask_op); + } +} + + +template +auto +__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, + _UnaryPredicate __pred) +{ + auto __n = __rng.size(); + // choice between legacy and reduce_then_scan + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + { + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}, /*_IsUniquePattern=*/std::false_type{}); + } + else + { + using _ReduceOp = ::std::plus; + + unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; + unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __partition_by_mask{_ReduceOp{}}; + + return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); + } +} + template auto @@ -1099,16 +1173,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 1>; - // Although we do not actually need 
result storage in this case, we need to construct - // a placeholder here to match the return type of reduce-then-scan - using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Size>; - _TempStorage __dummy_result_and_scratch{__exec, 0}; - return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - CreateOp{__pred}, CopyOp{}) - .event(), - __dummy_result_and_scratch); + CreateOp{__pred}, CopyOp{}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 814dc1e089e..301ac32d527 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -824,11 +824,16 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. + std::cout <<"reduce\n"; __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); + __event.wait(); + std::cout <<"scan\n"; // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); + __event.wait(); + if (__num_remaining > __block_size) { // Resize for the next block. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 2a6145bf182..351712ae587 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -578,10 +578,10 @@ struct __copy_by_mask _BinaryOp __binary_op; _Assigner __assigner; - template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _Size, typename _SizePerWg> + template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _RetAcc, typename _Size, typename _SizePerWg> void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, const _WgSumsAcc& __wg_sums_acc, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, const _WgSumsAcc& __wg_sums_acc, const _RetAcc& __ret_acc, _Size __n, _SizePerWg __size_per_wg) const { using ::std::get; @@ -617,6 +617,11 @@ struct __copy_by_mask // is performed (i.e. __tuple_type is the same type as its operand).
__assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]); } + if (__item_idx == 0) + { + //copy final result to output + __ret_acc[0] = __wg_sums_acc[(__n-1) / __size_per_wg]; + } } }; @@ -625,10 +630,10 @@ struct __partition_by_mask { _BinaryOp __binary_op; - template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _Size, typename _SizePerWg> + template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _RetAcc, typename _Size, typename _SizePerWg> void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, const _WgSumsAcc& __wg_sums_acc, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, const _WgSumsAcc& __wg_sums_acc, const _RetAcc& __ret_acc, _Size __n, _SizePerWg __size_per_wg) const { auto __item_idx = __item.get_linear_id(); @@ -660,6 +665,11 @@ struct __partition_by_mask get<1>(__out_acc[__out_idx]) = static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])); } } + if (__item_idx == 0) + { + //copy final result to output + __ret_acc[0] = __wg_sums_acc[(__n-1) / __size_per_wg]; + } } }; @@ -669,10 +679,10 @@ struct __global_scan_functor _BinaryOp __binary_op; _InitType __init; - template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _Size, typename _SizePerWg> + template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _RetAcc, typename _Size, typename _SizePerWg> void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc&, const _WgSumsAcc& __wg_sums_acc, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc&, const _WgSumsAcc& __wg_sums_acc, const _RetAcc&, _Size __n, _SizePerWg __size_per_wg) const { constexpr auto __shift = _Inclusive{} ? 0 : 1; From 838d2af8151149e8b28db64e847245c20526a8ac Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 12:53:57 -0400 Subject: [PATCH 04/15] get rid of double dereferences (time this to see if the changes are worth it) Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 56 +++++++++++-------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 351712ae587..e78d854d05d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -529,12 +529,20 @@ struct __mask_assigner struct __scan_assigner { template <typename _OutAcc, typename _OutIdx, typename _InAcc, typename _InIdx> - void + std::enable_if_t<!std::is_pointer_v<_OutAcc>> operator()(_OutAcc& __out_acc, const _OutIdx __out_idx, const _InAcc& __in_acc, _InIdx __in_idx) const { __out_acc[__out_idx] = __in_acc[__in_idx]; } + template <typename _OutAcc, typename _OutIdx, typename _InAcc, typename _InIdx> + std::enable_if_t<std::is_pointer_v<_OutAcc>> + operator()(_OutAcc __out_acc, const _OutIdx __out_idx, const _InAcc& __in_acc, _InIdx __in_idx) const + { + __out_acc[__out_idx] = __in_acc[__in_idx]; + } + + + template <typename _Acc, typename _OutAcc, typename _OutIdx, typename _InAcc, typename _InIdx> void operator()(_Acc&, _OutAcc& __out_acc, const _OutIdx __out_idx, const _InAcc& __in_acc, _InIdx __in_idx) const @@ -578,10 +586,10 @@ struct __copy_by_mask _BinaryOp __binary_op; _Assigner __assigner; - template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsAcc, typename _RetAcc, typename _Size, typename _SizePerWg> + template <typename _Item, typename _OutAcc, typename _InAcc, typename _WgSumsPtr, typename _RetPtr, typename _Size, typename _SizePerWg> void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, const _WgSumsAcc& __wg_sums_acc, const _RetAcc& __ret_acc, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, _WgSumsPtr* __wg_sums_ptr, _RetPtr* __ret_ptr, _Size __n, _SizePerWg __size_per_wg) const { using ::std::get; @@ -598,7 +606,7 @@ struct __copy_by_mask if (__item_idx >= __size_per_wg) { auto __wg_sums_idx = __item_idx / __size_per_wg - 1; - __out_idx = __binary_op(__out_idx, __wg_sums_acc[__wg_sums_idx]); + __out_idx = __binary_op(__out_idx, __wg_sums_ptr[__wg_sums_idx]); } if (__item_idx % __size_per_wg == 0 || (get(__in_acc[__item_idx]) != get(__in_acc[__item_idx - 1]))) // If we work with tuples we might have a situation when internal tuple is assigned to ::std::tuple @@ -620,7 +628,7 @@ struct __copy_by_mask if (__item_idx == 0) { //copy final result
to output - __ret_acc[0] = __wg_sums_acc[(__n-1) / __size_per_wg]; + __ret_ptr[0] = __wg_sums_ptr[(__n-1) / __size_per_wg]; } } }; @@ -630,10 +638,10 @@ struct __partition_by_mask { _BinaryOp __binary_op; - template void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, const _WgSumsAcc& __wg_sums_acc, const _RetAcc& __ret_acc, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, _WgSumsPtr* __wg_sums_ptr, _RetPtr* __ret_ptr, _Size __n, _SizePerWg __size_per_wg) const { auto __item_idx = __item.get_linear_id(); @@ -651,7 +659,7 @@ struct __partition_by_mask __in_type, ::std::decay_t(__out_acc[__out_idx]))>>::__type; if (__not_first_wg) - __out_idx = __binary_op(__out_idx, __wg_sums_acc[__wg_sums_idx - 1]); + __out_idx = __binary_op(__out_idx, __wg_sums_ptr[__wg_sums_idx - 1]); get<0>(__out_acc[__out_idx]) = static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])); } else @@ -661,14 +669,14 @@ struct __partition_by_mask __in_type, ::std::decay_t(__out_acc[__out_idx]))>>::__type; if (__not_first_wg) - __out_idx -= __wg_sums_acc[__wg_sums_idx - 1]; + __out_idx -= __wg_sums_ptr[__wg_sums_idx - 1]; get<1>(__out_acc[__out_idx]) = static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])); } } if (__item_idx == 0) { //copy final result to output - __ret_acc[0] = __wg_sums_acc[(__n-1) / __size_per_wg]; + __ret_ptr[0] = __wg_sums_ptr[(__n-1) / __size_per_wg]; } } }; @@ -679,10 +687,10 @@ struct __global_scan_functor _BinaryOp __binary_op; _InitType __init; - template void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc&, const _WgSumsAcc& __wg_sums_acc, const _RetAcc&, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc&, _WgSumsPtr*__wg_sums_ptr, _RetPtr*, _Size __n, _SizePerWg __size_per_wg) const { constexpr auto __shift = _Inclusive{} ? 
0 : 1; @@ -693,7 +701,7 @@ struct __global_scan_functor auto __wg_sums_idx = __item_idx / __size_per_wg - 1; // an initial value precedes the first group for the exclusive scan __item_idx += __shift; - auto __bin_op_result = __binary_op(__wg_sums_acc[__wg_sums_idx], __out_acc[__item_idx]); + auto __bin_op_result = __binary_op(__wg_sums_ptr[__wg_sums_idx], __out_acc[__item_idx]); using __out_type = ::std::decay_t; using __in_type = ::std::decay_t; __out_acc[__item_idx] = @@ -722,10 +730,10 @@ struct __scan _DataAccessor __data_acc; template + typename _WGSumsPtr, typename _SizePerWG, typename _WGSize, typename _ItersPerWG> void scan_impl(_NDItemId __item, _Size __n, _AccLocal& __local_acc, const _InAcc& __acc, _OutAcc& __out_acc, - _WGSumsAcc& __wg_sums_acc, _SizePerWG __size_per_wg, _WGSize __wgroup_size, _ItersPerWG __iters_per_wg, + _WGSumsPtr* __wg_sums_ptr, _SizePerWG __size_per_wg, _WGSize __wgroup_size, _ItersPerWG __iters_per_wg, _InitType __init, std::false_type /*has_known_identity*/) const { ::std::size_t __group_id = __item.get_group(0); @@ -795,18 +803,18 @@ struct __scan __gl_assigner(__acc, __out_acc, __adjusted_global_id + __shift, __local_acc, __local_id); if (__adjusted_global_id == __n - 1) - __wg_assigner(__wg_sums_acc, __group_id, __local_acc, __local_id); + __wg_assigner(__wg_sums_ptr, __group_id, __local_acc, __local_id); } if (__local_id == __wgroup_size - 1 && __adjusted_global_id - __wgroup_size < __n) - __wg_assigner(__wg_sums_acc, __group_id, __local_acc, __local_id); + __wg_assigner(__wg_sums_ptr, __group_id, __local_acc, __local_id); } template + typename _WGSumsPtr, typename _SizePerWG, typename _WGSize, typename _ItersPerWG> void scan_impl(_NDItemId __item, _Size __n, _AccLocal& __local_acc, const _InAcc& __acc, _OutAcc& __out_acc, - _WGSumsAcc& __wg_sums_acc, _SizePerWG __size_per_wg, _WGSize __wgroup_size, _ItersPerWG __iters_per_wg, + _WGSumsPtr* __wg_sums_ptr, _SizePerWG __size_per_wg, _WGSize __wgroup_size, _ItersPerWG __iters_per_wg, _InitType __init, std::true_type /*has_known_identity*/) const { auto __group_id = __item.get_group(0); @@ -841,21 +849,21 @@ struct __scan __gl_assigner(__acc, __out_acc, __adjusted_global_id + __shift, __local_acc, __local_id); if (__adjusted_global_id == __n - 1) - __wg_assigner(__wg_sums_acc, __group_id, __local_acc, __local_id); + __wg_assigner(__wg_sums_ptr, __group_id, __local_acc, __local_id); } if (__local_id == __wgroup_size - 1 && __adjusted_global_id - __wgroup_size < __n) - __wg_assigner(__wg_sums_acc, __group_id, __local_acc, __local_id); + __wg_assigner(__wg_sums_ptr, __group_id, __local_acc, __local_id); } template + typename _WGSumsPtr, typename _SizePerWG, typename _WGSize, typename _ItersPerWG> void operator()(_NDItemId __item, _Size __n, _AccLocal& __local_acc, const _InAcc& __acc, _OutAcc& __out_acc, - _WGSumsAcc& __wg_sums_acc, _SizePerWG __size_per_wg, _WGSize __wgroup_size, + _WGSumsPtr* __wg_sums_ptr, _SizePerWG __size_per_wg, _WGSize __wgroup_size, _ItersPerWG __iters_per_wg, _InitType __init = __no_init_value{}) const { - scan_impl(__item, __n, __local_acc, __acc, __out_acc, __wg_sums_acc, __size_per_wg, __wgroup_size, + scan_impl(__item, __n, __local_acc, __acc, __out_acc, __wg_sums_ptr, __size_per_wg, __wgroup_size, __iters_per_wg, __init, __has_known_identity<_BinaryOperation, _Tp>{}); } }; From c11badbd282d00656e80930bb83cca62c3f4fa47 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 14:32:18 -0400 Subject: [PATCH 05/15] removing debugging code --- 
.../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 301ac32d527..f56d7704492 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -824,15 +824,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - std::cout <<"reduce\n"; __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); - __event.wait(); - std::cout <<"scan\n"; // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); - __event.wait(); if (__num_remaining > __block_size) { From d09758b576349e971543cd19b5625b38a533e07c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 15:07:35 -0400 Subject: [PATCH 06/15] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 11 +- .../hetero/algorithm_ranges_impl_hetero.h | 9 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 119 +++++++++--------- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 12 +- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 23 ++-- .../pstl/hetero/numeric_ranges_impl_hetero.h | 9 +- .../device_copyable.pass.cpp | 37 +++--- test/support/utils_device_copyable.h | 9 +- 8 files changed, 122 insertions(+), 107 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index fa52c80886e..ddfc30046a0 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -934,11 +934,12 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); - auto __result = - oneapi::dpl::__par_backend_hetero::__parallel_partition_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); return ::std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); } @@ -972,8 +973,8 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec auto __keep2 = 
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result_first, __result_first + __n); - auto __result = - oneapi::dpl::__par_backend_hetero::__parallel_unique_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); return __result_first + __result.get(); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 05459d3d51e..8ef423c7071 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -398,13 +398,16 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), - std::forward<_Range2>(__result)).get(); + std::forward<_Range2>(__result)) + .get(); return 1; } - return oneapi::dpl::__par_backend_hetero::__parallel_unique_copy(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __pred, std::forward<_Assign>(__assign)).get(); + return oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __pred, std::forward<_Assign>(__assign)) + .get(); } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index bbd7c0e6155..1ddfd360749 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -358,12 +358,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh); auto __res_acc = __result_and_scratch.__get_result_acc(__cgh); - __cgh.parallel_for<_PropagateScanName...>( - sycl::range<1>(__n_groups * __size_per_wg),[=](auto __item) { - auto __temp_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__temp_acc); - auto __res_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__res_acc, __n_groups + 1); - __global_scan(__item, __rng2, __rng1, __temp_ptr, __res_ptr, __n, __size_per_wg); - }); + __cgh.parallel_for<_PropagateScanName...>(sycl::range<1>(__n_groups * __size_per_wg), [=](auto __item) { + auto __temp_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__temp_acc); + auto __res_ptr = _TempStorage::__get_usm_or_buffer_accessor_ptr(__res_acc, __n_groups + 1); + __global_scan(__item, __rng2, __rng1, __temp_ptr, __res_ptr, __n, __size_per_wg); + }); }); return __future(__final_event, __result_and_scratch); @@ -506,8 +505,8 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem } } - __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, - __lacc_ptr, __bin_op, __init); + __scan_work_group<_ValueType, 
_Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, __lacc_ptr, + __bin_op, __init); if constexpr (__can_use_subgroup_load_store) { @@ -685,7 +684,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); else - __event = __parallel_transform_scan_static_single_group_submitter< + __event = __parallel_transform_scan_static_single_group_submitter< _Inclusive::value, __num_elems_per_item, __wg_size, /* _IsFullGroup= */ false, oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__scan_single_wg_kernel< @@ -725,9 +724,10 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend using _DynamicGroupScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __par_backend_hetero::__scan_single_wg_dynamic_kernel<_BinaryOperation, _CustomName>>; - auto __event = __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()( - ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), - __n, __init, __binary_op, __unary_op, __max_wg_size); + auto __event = + __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()( + ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size); return __future(__event, __dummy_result_and_scratch); } } @@ -932,7 +932,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}, + /*_IsUniquePattern=*/std::false_type{}); } } { @@ -945,20 +946,19 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; - return - __parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, - _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, - __assign_op, __assign_op, __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); + return __parallel_transform_scan_base( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, + _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, + __assign_op, __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, 
__assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } } @@ -1006,8 +1006,8 @@ template auto __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op, - _IsUniquePattern __is_unique_pattern) + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, + _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { using _ReduceOp = std::plus<_Size>; return __parallel_transform_reduce_then_scan( @@ -1051,22 +1051,22 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag ::std::forward<_OutRng>(__out_rng), __reduce_op, _InitType{}, // local scan unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, - __add_mask_op, __create_mask_op}, + _MaskAssigner, _CreateMaskOp, _InitType>{__reduce_op, __get_data_op, __assign_op, + __add_mask_op, __create_mask_op}, // scan between groups unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, - __get_data_op}, + _Assigner, _DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, + __get_data_op}, // global scan __copy_by_mask_op); - } -template +template auto -__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, - _BinaryPredicate __pred, _Assign&& __assign = oneapi::dpl::__internal::__pstl_assign{}) +__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred, + _Assign&& __assign = oneapi::dpl::__internal::__pstl_assign{}) { auto __n = __rng.size(); @@ -1074,37 +1074,43 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) { - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, - oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{std::forward<_Assign>(__assign)}, /*_IsUniquePattern=*/std::true_type{}); + return __parallel_reduce_then_scan_copy( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, + oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{std::forward<_Assign>(__assign)}, + /*_IsUniquePattern=*/std::true_type{}); } else { unseq_backend::__copy_by_mask<::std::plus, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ ::std::true_type, 1> + /*inclusive*/ ::std::true_type, 1> __copy_by_mask_op; - oneapi::dpl::__internal::__create_mask_unique_copy, decltype(__n)> __create_mask_op{ - oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; + oneapi::dpl::__internal::__create_mask_unique_copy, + decltype(__n)> + __create_mask_op{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; - return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, 
__create_mask_op, __copy_by_mask_op); + return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + __create_mask_op, __copy_by_mask_op); } } - template auto -__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, - _UnaryPredicate __pred) +__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { auto __n = __rng.size(); // choice between legacy and reduce_then_scan const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) { - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}, /*_IsUniquePattern=*/std::false_type{}); + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}, + /*_IsUniquePattern=*/std::false_type{}); } else { @@ -1113,8 +1119,9 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __partition_by_mask{_ReduceOp{}}; - return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); + return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + __create_mask_op, __partition_by_mask); } } @@ -1175,8 +1182,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, /*inclusive*/ std::true_type, 1>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - CreateOp{__pred}, CopyOp{}); + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + CreateOp{__pred}, CopyOp{}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 8184b5d5369..9a935152446 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -273,7 +273,8 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, _BinaryPredicate)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, + _BinaryPredicate)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPredicate> { }; @@ -284,19 +285,22 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct 
sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask, _GenMask)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask, + _GenMask)> : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> { }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset, Assign)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset, + Assign)> : oneapi::dpl::__internal::__are_all_device_copyable { }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else, Assign)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else, + Assign)> : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index e78d854d05d..264cf952856 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -542,7 +542,6 @@ struct __scan_assigner __out_acc[__out_idx] = __in_acc[__in_idx]; } - template void operator()(_Acc&, _OutAcc& __out_acc, const _OutIdx __out_idx, const _InAcc& __in_acc, _InIdx __in_idx) const @@ -589,8 +588,8 @@ struct __copy_by_mask template void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, _WgSumsPtr* __wg_sums_ptr, _RetPtr* __ret_ptr, _Size __n, - _SizePerWg __size_per_wg) const + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, _WgSumsPtr* __wg_sums_ptr, _RetPtr* __ret_ptr, + _Size __n, _SizePerWg __size_per_wg) const { using ::std::get; auto __item_idx = __item.get_linear_id(); @@ -628,7 +627,7 @@ struct __copy_by_mask if (__item_idx == 0) { //copy final result to output - __ret_ptr[0] = __wg_sums_ptr[(__n-1) / __size_per_wg]; + __ret_ptr[0] = __wg_sums_ptr[(__n - 1) / __size_per_wg]; } } }; @@ -641,8 +640,8 @@ struct __partition_by_mask template void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, _WgSumsPtr* __wg_sums_ptr, _RetPtr* __ret_ptr, _Size __n, - _SizePerWg __size_per_wg) const + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc& __in_acc, _WgSumsPtr* __wg_sums_ptr, _RetPtr* __ret_ptr, + _Size __n, _SizePerWg __size_per_wg) const { auto __item_idx = __item.get_linear_id(); if (__item_idx < __n) @@ -676,7 +675,7 @@ struct __partition_by_mask if (__item_idx == 0) { //copy final result to output - __ret_ptr[0] = __wg_sums_ptr[(__n-1) / __size_per_wg]; + __ret_ptr[0] = __wg_sums_ptr[(__n - 1) / __size_per_wg]; } } }; @@ -690,7 +689,7 @@ struct __global_scan_functor template void - operator()(_Item __item, _OutAcc& __out_acc, const _InAcc&, _WgSumsPtr*__wg_sums_ptr, _RetPtr*, _Size __n, + operator()(_Item __item, _OutAcc& __out_acc, const _InAcc&, _WgSumsPtr* __wg_sums_ptr, _RetPtr*, _Size __n, _SizePerWg __size_per_wg) const { constexpr auto __shift = _Inclusive{} ? 
0 : 1; @@ -858,10 +857,10 @@ struct __scan template - void operator()(_NDItemId __item, _Size __n, _AccLocal& __local_acc, const _InAcc& __acc, _OutAcc& __out_acc, - _WGSumsPtr* __wg_sums_ptr, _SizePerWG __size_per_wg, _WGSize __wgroup_size, - _ItersPerWG __iters_per_wg, - _InitType __init = __no_init_value{}) const + void + operator()(_NDItemId __item, _Size __n, _AccLocal& __local_acc, const _InAcc& __acc, _OutAcc& __out_acc, + _WGSumsPtr* __wg_sums_ptr, _SizePerWG __size_per_wg, _WGSize __wgroup_size, _ItersPerWG __iters_per_wg, + _InitType __init = __no_init_value{}) const { scan_impl(__item, __n, __local_acc, __acc, __out_acc, __wg_sums_ptr, __size_per_wg, __wgroup_size, __iters_per_wg, __init, __has_known_identity<_BinaryOperation, _Tp>{}); diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index 1c3ef965495..55596ee6473 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -95,11 +95,10 @@ __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __ex if (__n == 0) return 0; - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(_BackendTag{}, - std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __n, __unary_op, - __init, __binary_op, _Inclusive{}).wait(); + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __unary_op, __init, __binary_op, _Inclusive{}) + .wait(); return __n; } diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 1367df1151b..25f5fc2e608 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -151,34 +151,33 @@ test_device_copyable() "__gen_transform_input is not device copyable with device copyable types"); //__gen_mask - static_assert( - sycl::is_device_copyable_v>, - "__gen_mask is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v>, + "__gen_mask is not device copyable with device copyable types"); //__gen_unique_mask - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_unique_mask>, - "__gen_unique_mask is not device copyable with device copyable types"); + static_assert( + sycl::is_device_copyable_v>, + "__gen_unique_mask is not device copyable with device copyable types"); //__gen_count_mask - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_count_mask>>, + static_assert(sycl::is_device_copyable_v>>, "__gen_count_mask is not device copyable with device copyable types"); //__gen_expand_count_mask - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask>>, + static_assert(sycl::is_device_copyable_v>>, "__gen_expand_count_mask is not device copyable with device copyable types"); //__write_to_idx_if - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_device_copyable>>, - "__write_to_idx_if is not device copyable with device copyable types"); + static_assert( + sycl::is_device_copyable_v>, + "__write_to_idx_if is not device copyable with device copyable types"); //__write_to_idx_if_else - 
static_assert(sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else>, - "__write_to_idx_if_else is not device copyable with device copyable types"); + static_assert( + sycl::is_device_copyable_v>, + "__write_to_idx_if_else is not device copyable with device copyable types"); // __early_exit_find_or static_assert( @@ -403,9 +402,9 @@ test_non_device_copyable() "__gen_expand_count_mask is device copyable with non device copyable types"); //__write_to_idx_if - static_assert( - !sycl::is_device_copyable_v>, - "__write_to_idx_if is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_non_device_copyable>>, + "__write_to_idx_if is device copyable with non device copyable types"); //__write_to_idx_if_else static_assert(!sycl::is_device_copyable_v< diff --git a/test/support/utils_device_copyable.h b/test/support/utils_device_copyable.h index 2e83eadf3e6..40fe1cf61cc 100644 --- a/test/support/utils_device_copyable.h +++ b/test/support/utils_device_copyable.h @@ -78,8 +78,11 @@ struct assign_device_copyable // relying on trivial copyability struct binary_op_non_device_copyable { - binary_op_non_device_copyable(const binary_op_non_device_copyable& other) { std::cout << " non trivial copy ctor\n"; } - int + binary_op_non_device_copyable(const binary_op_non_device_copyable& other) + { + std::cout << " non trivial copy ctor\n"; + } + int operator()(int a, int b) const { return a; @@ -89,7 +92,7 @@ struct binary_op_non_device_copyable struct binary_op_device_copyable { binary_op_device_copyable(const binary_op_device_copyable& other) { std::cout << " non trivial copy ctor\n"; } - int + int operator()(int a, int b) const { return a; From 892859017898a3a2661e33fdb55d4dad765f4afc Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 15:07:42 -0400 Subject: [PATCH 07/15] removing test changes Signed-off-by: Dan Hoeflinger --- .../algorithm/alg.modifying.operations/unique.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp index f2388ecef8d..8d8d8e6c20c 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/unique.pass.cpp @@ -40,7 +40,7 @@ struct run_unique ForwardIt k = unique(exec, first2, last2); auto n = ::std::distance(first1, i); - EXPECT_EQ(::std::distance(first2, k), n, "wrong return value from unique without predicate"); + EXPECT_TRUE(::std::distance(first2, k) == n, "wrong return value from unique without predicate"); EXPECT_EQ_N(first1, first2, n, "wrong effect from unique without predicate"); } }; @@ -63,7 +63,7 @@ struct run_unique_predicate ForwardIt k = unique(exec, first2, last2, pred); auto n = ::std::distance(first1, i); - EXPECT_EQ(::std::distance(first2, k), n, "wrong return value from unique with predicate"); + EXPECT_TRUE(::std::distance(first2, k) == n, "wrong return value from unique with predicate"); EXPECT_EQ_N(first1, first2, n, "wrong effect from unique with predicate"); } }; From 2d63d475d3d9ac3c7fa92be58be953c958ce4cc2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 15:25:48 -0400 Subject: [PATCH 08/15] consolidate helper for selecting reduce_then_scan Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 ++++---------- 
.../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 10 ++++++++++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1ddfd360749..2e714f77250 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -924,9 +924,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); - // Reduce-then-scan performs poorly on CPUs due to sub-group operations. - if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( @@ -1071,8 +1069,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t auto __n = __rng.size(); // choice between legacy and reduce_then_scan - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); - if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { return __parallel_reduce_then_scan_copy( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), @@ -1103,8 +1100,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen { auto __n = __rng.size(); // choice between legacy and reduce_then_scan - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); - if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, @@ -1146,7 +1142,6 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, constexpr ::std::uint16_t __single_group_upper_limit = 2048; std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) @@ -1157,8 +1152,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); } - // Reduce-then-scan performs poorly on CPUs due to sub-group operations. 
- else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + else if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { using _ReduceOp = std::plus<_Size>; using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f56d7704492..f6fef5f8434 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -728,6 +728,16 @@ struct __parallel_reduce_then_scan_scan_submitter< _InitType __init; }; +// reduce_then_scan requires a sub-group size of 32 and performs well only on devices with fast coordinated sub-group +// operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm. +template <typename _ExecutionPolicy> +bool +__is_best_alg_reduce_then_scan(_ExecutionPolicy&& __exec) +{ + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + return (!__exec.queue().get_device().is_cpu() && __dev_has_sg32); +} + // General scan-like algorithm helpers // _GenReduceInput - a function which accepts the input range and index to generate the data needed by the main output // used in the reduction operation (to calculate the global carries) From cb9ff3053269334c6e6fb6d2ffc84be2497e36b4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 15:40:54 -0400 Subject: [PATCH 09/15] adding assert for empty scans Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f6fef5f8434..b086d9f081f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -789,6 +789,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // skip scan of zeroth element in unique patterns __num_remaining -= 1; } + // The reduce_then_scan kernel is not built to handle "empty" scans, which include `__n == 1` for unique patterns. + // These trivial end cases should be handled at a higher level. + assert(__num_remaining > 0); auto __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block ?
__max_inputs_per_block / __num_sub_groups_global From 129215cfc0d2656fb0eded485ff84897891cd7e7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 15:47:22 -0400 Subject: [PATCH 10/15] ::std:: -> std:: Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 6 ++--- .../hetero/algorithm_ranges_impl_hetero.h | 2 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 24 +++++++++---------- test/support/utils_device_copyable.h | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index ddfc30046a0..300681a76da 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -939,9 +939,9 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); - return ::std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); + return std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); } //------------------------------------------------------------------------ @@ -974,7 +974,7 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec auto __buf2 = __keep2(__result_first, __result_first + __n); auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); return __result_first + __result.get(); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 8ef423c7071..5c1c09c02d5 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -396,7 +396,7 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec { using CopyBrick = oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>; oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), std::forward<_Range2>(__result)) .get(); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2e714f77250..928ea137c92 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -966,10 +966,10 @@ struct __invoke_single_group_copy_if // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred, - typename _Assign> + template <std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred, + typename _Assign> auto - operator()(_ExecutionPolicy&& __exec, ::std::size_t __n,
_InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred, + operator()(_ExecutionPolicy&& __exec, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred, _Assign&& __assign) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); @@ -983,8 +983,8 @@ struct __invoke_single_group_copy_if return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, + __scan_copy_single_wg_kernel<std::integral_constant<std::uint16_t, __wg_size>, + std::integral_constant<std::uint16_t, __num_elems_per_item>, /* _IsFullGroup= */ std::true_type, _CustomName>>>()( __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred), std::forward<_Assign>(__assign)); @@ -992,8 +992,8 @@ struct __invoke_single_group_copy_if return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, + __scan_copy_single_wg_kernel<std::integral_constant<std::uint16_t, __wg_size>, + std::integral_constant<std::uint16_t, __num_elems_per_item>, /* _IsFullGroup= */ std::false_type, _CustomName>>>()( __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred), std::forward<_Assign>(__assign)); @@ -1080,14 +1080,14 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t } else { - unseq_backend::__copy_by_mask<::std::plus<decltype(__n)>, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ ::std::true_type, 1> + unseq_backend::__copy_by_mask<std::plus<decltype(__n)>, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ std::true_type, 1> __copy_by_mask_op; oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>, decltype(__n)> __create_mask_op{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; - return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, __create_mask_op, __copy_by_mask_op); } @@ -1110,12 +1110,12 @@ - using _ReduceOp = ::std::plus<decltype(__n)>; + using _ReduceOp = std::plus<decltype(__n)>; unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; - unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __partition_by_mask{_ReduceOp{}}; + unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type> __partition_by_mask{_ReduceOp{}}; - return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); } diff --git a/test/support/utils_device_copyable.h b/test/support/utils_device_copyable.h index 40fe1cf61cc..32e02991933 100644 --- a/test/support/utils_device_copyable.h +++ b/test/support/utils_device_copyable.h @@ -58,7 +58,7 @@ struct assign_non_device_copyable void operator()(const _Xp& __x, _Yp&& __y) const { - ::std::forward<_Yp>(__y)
= __x; + std::forward<_Yp>(__y) = __x; } }; @@ -69,7 +69,7 @@ struct assign_device_copyable void operator()(const _Xp& __x, _Yp&& __y) const { - ::std::forward<_Yp>(__y) = __x; + std::forward<_Yp>(__y) = __x; } }; From e510c0d3c78e8c8b3cec73910250d2475098219c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 15:50:51 -0400 Subject: [PATCH 11/15] address minor feedback Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 928ea137c92..afce181f8ce 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -726,7 +726,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend auto __event = __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()( - ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size); return __future(__event, __dummy_result_and_scratch); } @@ -858,7 +858,7 @@ struct __get_zeroth_element return std::get<0>(std::forward<_Tp>(__a)); } }; -template +template struct __write_to_idx_if { template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index b086d9f081f..574a38e2ca5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -732,7 +732,7 @@ struct __parallel_reduce_then_scan_scan_submitter< // operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm.
template <typename _ExecutionPolicy> bool -__is_best_alg_reduce_then_scan(_ExecutionPolicy&& __exec) +__is_best_alg_reduce_then_scan(const _ExecutionPolicy& __exec) { const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); return (!__exec.queue().get_device().is_cpu() && __dev_has_sg32); From b0ada7a61159249fc5a442dc536688ff8ac3a79f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 16:09:08 -0400 Subject: [PATCH 12/15] removing unnecessary comment Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index afce181f8ce..76417ba208f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1068,7 +1068,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t { auto __n = __rng.size(); - // choice between legacy and reduce_then_scan if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { return __parallel_reduce_then_scan_copy( @@ -1099,7 +1098,6 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { auto __n = __rng.size(); - // choice between legacy and reduce_then_scan if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, From 13d31a122086fd17e1a0ba33f95d2cb09f3daade Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 16:34:27 -0400 Subject: [PATCH 13/15] switching copy_if to use common __parallel_reduce_then_scan_copy, and cleanup Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 76417ba208f..0139fbe4cff 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -148,8 +148,8 @@ struct iter_mode // for zip_iterator template <typename... Iterators> auto - operator()(const oneapi::dpl::zip_iterator<Iterators...>& it) - -> decltype(oneapi::dpl::__internal::map_zip(*this, it.base())) + operator()(const oneapi::dpl::zip_iterator<Iterators...>& it) -> decltype(oneapi::dpl::__internal::map_zip(*this, + it.base())) { return oneapi::dpl::__internal::map_zip(*this, it.base()); } @@ -924,11 +926,16 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { - oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; + using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; + using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_idx; + + _GenInput __gen_transform{__unary_op}; + return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init,
_Inclusive{}, + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); } } @@ -1007,14 +1012,15 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _ _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { + using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; using _ReduceOp = std::plus<_Size>; + using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; + using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; + return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), - oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>{__generate_mask}, _ReduceOp{}, - oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>{__generate_mask}, - oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, __write_op, - oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, + std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, + _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}, __is_unique_pattern); } @@ -1070,12 +1076,13 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t auto __n = __rng.size(); if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { - return __parallel_reduce_then_scan_copy( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, - oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1>{std::forward<_Assign>(__assign)}, - /*_IsUniquePattern=*/std::true_type{}); + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1, _Assign>; + + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, + /*_IsUniquePattern=*/std::true_type{}); } else { @@ -1086,9 +1093,8 @@ decltype(__n)> __create_mask_op{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - __create_mask_op, __copy_by_mask_op); + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __copy_by_mask_op); } } @@ -1100,11 +1106,13 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { auto __n = __rng.size(); if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; + using _WriteOp = + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else<oneapi::dpl::__internal::__pstl_assign>; + return
__parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>{__pred}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else{}, - /*_IsUniquePattern=*/std::false_type{}); + _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); } else { @@ -1113,9 +1121,8 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type> __partition_by_mask{_ReduceOp{}}; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - __create_mask_op, __partition_by_mask); + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); } } @@ -1152,19 +1159,12 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); } else if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) { - using _ReduceOp = std::plus<_Size>; using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; - _GenMask __generate_mask{__pred}; - - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), - oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>{__generate_mask}, _ReduceOp{}, - oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>{__generate_mask}, - oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>{std::forward<_Assign>(__assign)}, - oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}, /*Unique=*/std::false_type{}); + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>; + + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, + /*Unique=*/std::false_type{}); } else { @@ -1743,8 +1743,8 @@ struct __is_radix_sort_usable_for_type static constexpr bool value = #if _USE_RADIX_SORT (::std::is_arithmetic_v<_T> || ::std::is_same_v<sycl::half, _T>) && - (__internal::__is_comp_ascending<::std::decay_t<_Compare>>::value || - __internal::__is_comp_descending<::std::decay_t<_Compare>>::value); + (__internal::__is_comp_ascending<::std::decay_t<_Compare>>::value || + __internal::__is_comp_descending<::std::decay_t<_Compare>>::value); #else false; #endif From 85fb61c05be819416b9546dd652c6eaf90fb07e8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 16:38:51 -0400 Subject: [PATCH 14/15] reverting formatting only changes Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 0139fbe4cff..bb3828d2b52 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -148,8 +148,8 @@
struct iter_mode // for zip_iterator template <typename... Iterators> auto - operator()(const oneapi::dpl::zip_iterator<Iterators...>& it) -> decltype(oneapi::dpl::__internal::map_zip(*this, - it.base())) + operator()(const oneapi::dpl::zip_iterator<Iterators...>& it) + -> decltype(oneapi::dpl::__internal::map_zip(*this, it.base())) { return oneapi::dpl::__internal::map_zip(*this, it.base()); } @@ -1743,8 +1743,8 @@ struct __is_radix_sort_usable_for_type static constexpr bool value = #if _USE_RADIX_SORT (::std::is_arithmetic_v<_T> || ::std::is_same_v<sycl::half, _T>) && - (__internal::__is_comp_ascending<::std::decay_t<_Compare>>::value || - __internal::__is_comp_descending<::std::decay_t<_Compare>>::value); + (__internal::__is_comp_ascending<::std::decay_t<_Compare>>::value || + __internal::__is_comp_descending<::std::decay_t<_Compare>>::value); #else false; #endif From 7f2108c8681a2b17b732743fe99aefb050ac2621 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 2 Aug 2024 17:01:13 -0400 Subject: [PATCH 15/15] function name change Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 ++++---- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index bb3828d2b52..2231a80fcb7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -924,7 +924,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; using _ScanInputTransform = oneapi::dpl::__internal::__no_op; @@ -1074,7 +1074,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t { auto __n = __rng.size(); - if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1, _Assign>; @@ -1104,7 +1104,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { auto __n = __rng.size(); - if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; using _WriteOp = @@ -1157,7 +1157,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); } - else if (oneapi::dpl::__par_backend_hetero::__is_best_alg_reduce_then_scan(__exec)) + else if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>; diff --git
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 574a38e2ca5..8ac40cfa91a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -732,7 +732,7 @@ struct __parallel_reduce_then_scan_scan_submitter< // operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm. template <typename _ExecutionPolicy> bool -__is_best_alg_reduce_then_scan(const _ExecutionPolicy& __exec) +__prefer_reduce_then_scan(const _ExecutionPolicy& __exec) { const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); return (!__exec.queue().get_device().is_cpu() && __dev_has_sg32);
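
[Editor's note, appended after the series for context; not part of the patches.] The dispatch heuristic the later patches converge on (__is_best_alg_reduce_then_scan, renamed to __prefer_reduce_then_scan in PATCH 15/15) reduces to two device queries: the device must not be a CPU, and it must support a sub-group size of 32. A minimal standalone sketch of the same check, written against the public SYCL 2020 device-info API instead of oneDPL's internal __supports_sub_group_size helper, might look like the following; the free function name prefer_reduce_then_scan is illustrative and is not part of the library.

    #include <algorithm>
    #include <cstddef>
    #include <vector>
    #include <sycl/sycl.hpp>

    // Illustrative sketch only: mirrors the criterion used by the internal
    // helper in this patch series, expressed via standard SYCL 2020 queries.
    bool prefer_reduce_then_scan(const sycl::queue& q)
    {
        const sycl::device dev = q.get_device();
        // Enumerate the sub-group sizes the device supports and look for 32.
        const std::vector<std::size_t> sg_sizes =
            dev.get_info<sycl::info::device::sub_group_sizes>();
        const bool has_sg32 =
            std::find(sg_sizes.begin(), sg_sizes.end(), std::size_t{32}) != sg_sizes.end();
        // Reduce-then-scan relies on fast coordinated sub-group operations,
        // which CPU targets emulate poorly, so prefer the legacy scan there.
        return !dev.is_cpu() && has_sg32;
    }

A caller would gate its algorithm choice the same way the patched __parallel_* entry points do: if prefer_reduce_then_scan(q) is true, take the reduce-then-scan path; otherwise fall back to the legacy scan-copy path.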