
[PROTOTYPE] Add fallback to legacy scan implementation for CPU devices and devices that lack size 32 sub-groups #1749

Merged

Changes from 25 commits (27 commits total):
1eaaf7d
Checkpoint for reduce then scan integration
mmichel11 Jul 8, 2024
e7a909c
Introduce a parallel_backend_sycl_reduce_then_scan.h file to contain …
mmichel11 Jul 9, 2024
b8c08c1
Port of kernels from two-pass scan KT branch
mmichel11 Jul 10, 2024
40b5f02
Move the single-element last element storage for exclusive_scan after…
mmichel11 Jul 11, 2024
4ae5144
Use init value type for init processing helper
adamfidel Jul 11, 2024
a3034de
Lower single work-group upper limit to 2048 elements (empirically found)
mmichel11 Jul 12, 2024
83ac2c8
[PROTOTYPE] Generalized two pass algorithm and copy_if (#1700)
danhoeflinger Jul 17, 2024
e16a5af
bug fix for global race on block carry-out
danhoeflinger Jul 18, 2024
d90a348
bugfix for elements to process in partial subgroup scan
danhoeflinger Jul 18, 2024
7f2dca1
[PROTOTYPE] Add unused temporary storage to single work-group scan to…
adamfidel Jul 18, 2024
61295cf
Add temporary work-group size cap for FPGA_EMU testing
mmichel11 Jul 19, 2024
0938b5c
[PROTOTYPE] Resolve conversion issues between internal tuple and std:…
mmichel11 Jul 19, 2024
6585460
Use __dpl_sycl::__local_accessor
adamfidel Jul 22, 2024
33d816d
bugfix for overruning input for small non multiples of subgroup size
danhoeflinger Jul 22, 2024
c155b58
Check if a subgroup is active before fetching its carry and grab the …
mmichel11 Jul 23, 2024
23f7886
Comment out std::complex tests in scan_by_segment tests
mmichel11 Jul 23, 2024
bd971db
renaming __out as it seems to be a keyword
danhoeflinger Jul 24, 2024
5337317
fixing device copyable for helpers
danhoeflinger Jul 29, 2024
e60aa75
Add legacy scan fallback case
mmichel11 Jul 31, 2024
f2037cc
Clang-format
mmichel11 Jul 31, 2024
ce98569
Merge branch 'dev/shared/reduce_then_scan_impl' into dev/mmichel11/le…
mmichel11 Jul 31, 2024
8a0a7a9
Remove unneeded #endif
mmichel11 Jul 31, 2024
e768892
Remove bad code after merge and pull single-wg invocation out of unne…
mmichel11 Jul 31, 2024
7b69c66
Cleanup
mmichel11 Jul 31, 2024
e11df04
Check __target_size instead of 32 in __supports_sub_group_size
mmichel11 Jul 31, 2024
ef96d5a
::std:: -> std::
mmichel11 Jul 31, 2024
5942d73
Remove unnecessary templated _SizeType
mmichel11 Jul 31, 2024
51 changes: 40 additions & 11 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -894,13 +894,17 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
}
}
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{});
}
}
else
{
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
@@ -911,8 +915,13 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_NoAssign __no_assign_op;
_NoOpFunctor __get_data_op;

// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of reduce-then-scan
using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Type>;
_TempStorage __dummy_result_and_scratch{__exec, 0};

return
__parallel_transform_scan_base(
__future(__parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __binary_op, __init,
// local scan
@@ -924,7 +933,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{
__binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op},
// global scan
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init});
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(),
__dummy_result_and_scratch);
}
}
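
Note on the new dispatch check: the reduce-then-scan path is taken only when the device is not a CPU and advertises a sub-group size of 32. The same query can be written standalone in plain SYCL; the sketch below is illustrative only (the name prefer_reduce_then_scan is not part of this patch):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

// Illustrative helper mirroring the patch's dispatch heuristic:
// reduce-then-scan is used only on non-CPU devices that support
// sub-groups of size 32.
bool prefer_reduce_then_scan(const sycl::queue& q)
{
    const sycl::device dev = q.get_device();
    const auto sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
    const bool has_sg32 =
        std::find(sg_sizes.begin(), sg_sizes.end(), std::size_t{32}) != sg_sizes.end();
    return !dev.is_cpu() && has_sg32;
}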

@@ -1028,7 +1038,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,

constexpr ::std::uint16_t __single_group_upper_limit = 2048;

::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);

if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
__max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
@@ -1039,9 +1050,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng),
::std::forward<_OutRng>(__out_rng), __pred);
}
else
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
using _ReduceOp = ::std::plus<_Size>;
using _ReduceOp = std::plus<_Size>;

return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
@@ -1052,6 +1064,23 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
/*_Inclusive=*/std::true_type{});
}
else
{
using _ReduceOp = ::std::plus<_Size>;
Review comment (Contributor): ::std -> std

Reply (Contributor Author): Done

using CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign,
/*inclusive*/ std::true_type, 1>;
// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of reduce-then-scan
using _TempStorage = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Size>;
_TempStorage __dummy_result_and_scratch{__exec, 0};

return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
CreateOp{__pred}, CopyOp{})
.event(),
__dummy_result_and_scratch);
}
}
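
Both fallback branches construct a __result_and_scratch_storage they never use, purely so the legacy path returns the same __future type as reduce-then-scan. A compilable miniature of that pattern, with simplified stand-in types (event, result_and_scratch_storage, and future here are illustrative models, not the oneDPL classes):

#include <cstddef>
#include <utility>
#include <vector>

struct event { };

template <typename T>
struct result_and_scratch_storage
{
    std::vector<T> data;
    explicit result_and_scratch_storage(std::size_t scratch_elems) : data(scratch_elems) {}
};

// Both code paths must return this (event, storage) pair.
template <typename T>
struct future
{
    event ev;
    result_and_scratch_storage<T> storage;
};

template <typename T>
future<T> legacy_branch(event e)
{
    // Zero-sized placeholder: never read, exists only to unify return types.
    result_and_scratch_storage<T> dummy{0};
    return future<T>{e, std::move(dummy)};
}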

//------------------------------------------------------------------------
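For orientation: the legacy __parallel_scan_copy branch above implements copy_if as the classic mask/scan/scatter stream compaction, driven by __create_mask and __copy_by_mask. A serial sketch of that scheme (illustrative only; the actual implementation runs each phase as a SYCL kernel):

#include <cstddef>
#include <vector>

// Scan-based copy_if: build a 0/1 mask, inclusive-scan it to get
// output positions, then scatter the selected elements.
template <typename T, typename Pred>
std::size_t scan_copy_if(const std::vector<T>& in, std::vector<T>& out, Pred pred)
{
    const std::size_t n = in.size();
    std::vector<std::size_t> mask(n), pos(n);
    for (std::size_t i = 0; i < n; ++i)
        mask[i] = pred(in[i]) ? 1 : 0;
    // Inclusive scan of the mask; pos[i] - 1 is the output slot of element i.
    std::size_t sum = 0;
    for (std::size_t i = 0; i < n; ++i)
        pos[i] = (sum += mask[i]);
    out.resize(sum);
    for (std::size_t i = 0; i < n; ++i)
        if (mask[i])
            out[pos[i] - 1] = in[i];
    return sum; // number of selected elements
}
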
10 changes: 10 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -20,6 +20,7 @@
#include <memory>
#include <type_traits>
#include <tuple>
#include <algorithm>

#include "../../iterator_impl.h"

@@ -769,6 +770,15 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
}
};

template <typename _ExecutionPolicy, typename _SizeType>
bool
__supports_sub_group_size(const _ExecutionPolicy& __exec, _SizeType __target_size)
Review comment (Contributor): Since the only usage of __target_size in this function casts it to std::size_t, would it make sense to take the parameter by std::size_t directly instead of using a template?

Suggested change:
-    __supports_sub_group_size(const _ExecutionPolicy& __exec, _SizeType __target_size)
+    __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size)

Reply (Contributor Author): Good point. I've added this and removed the static_cast below.


{
const auto __subgroup_sizes = __exec.queue().get_device().template get_info<sycl::info::device::sub_group_sizes>();
return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast<std::size_t>(__target_size)) !=
__subgroup_sizes.end();
}

} // namespace __par_backend_hetero
} // namespace dpl
} // namespace oneapi
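
For reference, the new helper's call sites in parallel_backend_sycl.h, as added by this PR:

// In both __parallel_transform_scan and __parallel_copy_if:
const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32);
// Reduce-then-scan performs poorly on CPUs due to sub-group operations.
if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32)
{
    // take the reduce-then-scan path; otherwise fall back to the legacy scan
}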