From d07ada28bb045b8830cdfe380c1f0c4654f7f2d3 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 8 Jul 2024 16:46:53 -0500 Subject: [PATCH 01/88] Checkpoint for reduce then scan integration Test is currently compiling. None of the real device code has been integrated, but the preliminary host code and general structure has been defined. Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 193 +++++++++++++++--- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 57 ++++++ 2 files changed, 227 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 76c6abb1b34..2b037ee1ab9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -213,6 +213,12 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; +template +class __reduce_then_scan_reduce_kernel; + +template +class __reduce_then_scan_scan_kernel; + //------------------------------------------------------------------------ // parallel_for - async pattern //------------------------------------------------------------------------ @@ -375,6 +381,60 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name } }; +template +struct __parallel_reduce_then_scan_reduce_submitter; + +template +struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, + __internal::__optional_kernel_name<_KernelName...>> +{ + // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory + // input buffer + template + auto + operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _TmpStorageAcc __tmp_storage_acc, + const _SubGroupReduce& __sub_group_reduce, const sycl::event& __prior_event, + const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::size_t __block_num, const bool __is_full_block) const + { + return __exec.queue().submit([&](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { + __cgh.depends_on(__prior_event); + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + __cgh.parallel_for<_KernelName...>(__nd_range, [=](sycl::nd_item<1> __ndi) { + __sub_group_reduce(__ndi, __rng, __tmp_storage_acc, __inputs_per_sub_group, __inputs_per_item, + __block_num, __is_full_block); + }); + }); + } + const sycl::nd_range<1> __nd_range; +}; + +template +struct __parallel_reduce_then_scan_scan_submitter; + +template +struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __internal::__optional_kernel_name<_KernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _TmpStorageAcc __tmp_storage_acc, + const _SubGroupCarryAndScan& __sub_group_carry_and_scan, const sycl::event& __prior_event, + const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::size_t __block_num, const bool __is_full_block) const + { + return __exec.queue().submit([&](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { + __cgh.depends_on(__prior_event); + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + __cgh.parallel_for<_KernelName...>(__nd_range, [=](sycl::nd_item<1> __ndi) { + __sub_group_carry_and_scan(__ndi, __rng1, __rng2, __tmp_storage_acc, __inputs_per_sub_group, + __inputs_per_item, __block_num, __is_full_block); + }); + }); + } + const sycl::nd_range<1> __nd_range; +}; + 
template void @@ -751,6 +811,100 @@ __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _E __binary_op, __init, __local_scan, __group_scan, __global_scan); } +template +auto +__parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, + _UnaryOperation __unary_op, + _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __reduce_then_scan_reduce_kernel<_CustomName>>; + using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __reduce_then_scan_scan_kernel<_CustomName>>; + using _ValueType = typename _InitType::__value_type; + + constexpr std::size_t __sub_group_size = 32; + // Empirically determined maximum. May be less for non-full blocks. + constexpr std::uint8_t __max_inputs_per_item = 128; + constexpr bool __inclusive = _Inclusive::value; + + // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough + // that no check is needed? + const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // TODO: base on max compute units. Recall disconnect in vendor definitions (# SMs vs. # XVEs) + const std::size_t __num_work_groups = 128; + const std::size_t __num_work_items = __num_work_groups * __work_group_size; + const std::size_t __num_sub_groups_local = __work_group_size / __sub_group_size; + const std::size_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; + const std::size_t __n = __in_rng.size(); + const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; + std::size_t __num_remaining = __n; + auto __inputs_per_sub_group = + __n >= __max_inputs_per_block + ? __max_inputs_per_block / __num_sub_groups_global + : std::max(__sub_group_size, + oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; + const auto __global_range = sycl::range<1>(__num_work_items); + const auto __local_range = sycl::range<1>(__work_group_size); + const auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); + const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; + const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); + + // TODO: Use the trick in reduce to wrap in a shared_ptr with custom deleter to support asynchronous frees. 
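// A note on the layout of this temporary buffer (inferred from how the kernels later in
// this patch series use it, so treat the details as an assumption at this checkpoint):
//   [0, __num_sub_groups_global)  - one scanned carry value per sub-group, written by the
//                                   reduce kernel and consumed by the scan kernel
//   [__num_sub_groups_global]     - one extra slot used by the exclusive-scan path to cache
//                                   the last input element of the current block, so the next
//                                   block can turn the previously stored exclusive result
//                                   back into an inclusive carry-in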
+ _ValueType* __tmp_storage = sycl::malloc_device<_ValueType>(__num_sub_groups_global + 1, __exec.queue()); + + // Kernel submitters to build event dependency chain + __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, _ReduceKernel> __reduce_submitter{__kernel_nd_range}; + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, _ScanKernel> __scan_submitter{__kernel_nd_range}; + + // Reduce and scan step implementations + using _ReduceImpl = unseq_backend::__sub_group_reduce<__sub_group_size, __max_inputs_per_item, __inclusive, + _BinaryOperation, _UnaryOperation, _InitType>; + using _ScanImpl = unseq_backend::__sub_group_carry_and_scan<__sub_group_size, __max_inputs_per_item, __inclusive, + _BinaryOperation, _UnaryOperation, _InitType>; + + _ReduceImpl __reduce_step{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, __num_work_items, + __n, __binary_op, __unary_op, __init}; + _ScanImpl __scan_step{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, __num_work_items, + __n, __binary_op, __unary_op, __init}; + + sycl::event __event; + // Data is processed in 2-kernel blocks to allow contiguous input segment to persist in LLC between the first and second kernel for accelerators + // with sufficiently large L2 / L3 caches. + for (std::size_t __b = 0; __b < __num_blocks; ++__b) + { + bool __is_full_block = __inputs_per_item == __max_inputs_per_item; + // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. + __event = __reduce_submitter(__exec, __in_rng, __tmp_storage, __reduce_step, __event, __inputs_per_sub_group, + __inputs_per_item, __b, __is_full_block); + // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. + __event = __scan_submitter(__exec, __in_rng, __out_rng, __tmp_storage, __scan_step, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __is_full_block); + if (__num_remaining > __block_size) + { + // Resize for the next block. + __num_remaining -= __block_size; + // TODO: This recalculation really only matters for the second to last iteration + // of the loop since the last iteration is the only non-full block. + __inputs_per_sub_group = + __num_remaining >= __max_inputs_per_block + ? __max_inputs_per_block / __num_sub_groups_global + : std::max(__sub_group_size, + oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + __inputs_per_item = __inputs_per_sub_group / __sub_group_size; + } + } + // TODO: Remove to make asynchronous. Depends on completing async USM free TODO. + __event.wait(); + sycl::free(__tmp_storage, __exec.queue()); + return __future(__event); +} + template bool __group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::size_t __n_uniform) @@ -779,6 +933,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen if ((__n_uniform & (__n_uniform - 1)) != 0) __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1; + // TODO: can we reimplement this with support fort non-identities as well? 
We can then use in reduce-then-scan + // for the last block if it is sufficiently small constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; if constexpr (__can_use_group_scan) { @@ -790,29 +946,20 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen } } - // Either we can't use group scan or this input is too big for one workgroup - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; - using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _NoAssign __no_assign_op; - _NoOpFunctor __get_data_op; - - return __parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, - __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); + // TODO: Reintegrate once support has been added + //// Either we can't use group scan or this input is too big for one workgroup + //using _Assigner = unseq_backend::__scan_assigner; + //using _NoAssign = unseq_backend::__scan_no_assign; + //using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; + //using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; + + //_Assigner __assign_op; + //_NoAssign __no_assign_op; + //_NoOpFunctor __get_data_op; + return __future(__parallel_transform_reduce_then_scan(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + ::std::forward<_Range1>(__in_rng), ::std::forward<_Range2>(__out_rng), + __binary_op, __unary_op, __init, _Inclusive{}) + .event()); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 5bb2735890c..93f51710300 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -1105,6 +1105,63 @@ struct __brick_reduce_idx _Size __n; }; +// Kernel 1 body of reduce-then-scan +// Reduces the input per work-group and stores sub-group carry outs +template +struct __sub_group_reduce +{ + // TODO: Implement this operator. 
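    // What this body is expected to do once implemented (a sketch based on the struct-level
    // comment above and on the device code added later in this series): each sub-group scans
    // its assigned chunk of the input block and publishes the chunk's reduction as a
    // sub-group carry; the first sub-group of the work-group then scans those per-sub-group
    // carries and writes them to __global_carry_out for the second kernel to consume.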
+ template + void + operator()(const _ItemId __ndi, _InAcc __input, _CarryOutAcc __global_carry_out, + const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::size_t __block_num, const bool __is_full_block) const + { + } + + const std::size_t __block_size; + const std::size_t __num_sub_groups_local; + const std::size_t __num_sub_groups_global; + const std::size_t __num_work_items; + const std::size_t __n; + + const _BinaryOp __binary_op; + const _UnaryOp __unary_op; + const _WrappedInitType __wrapped_init; + + // TODO: Add the mask functors here to generalize for scan-based algorithms +}; + +// Kernel 2 body of reduce-then-scan +// Determines inter and intra work-group carry ins and stores the full scan results to output buffer +// for an input block +template +struct __sub_group_carry_and_scan +{ + // TODO: Implement this operator + template + void + operator()(const _ItemId __ndi, _InAcc __input, _OutAcc __output, _CarryInAcc __global_carry_in, + const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::size_t __block_num, const bool __is_full_block) const + { + } + + const std::size_t __block_size; + const std::size_t __num_sub_groups_local; + const std::size_t __num_sub_groups_global; + const std::size_t __num_work_items; + const std::size_t __n; + + const _BinaryOp __binary_op; + const _UnaryOp __unary_op; + const _WrappedInitType __wrapped_init; + + // TODO: Add the mask functors here to generalize for scan-based algorithms +}; + } // namespace unseq_backend } // namespace dpl } // namespace oneapi From 62442662ab91cb7c5961b684935cf9766d8e6b58 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 9 Jul 2024 17:22:03 -0500 Subject: [PATCH 02/88] Introduce a parallel_backend_sycl_reduce_then_scan.h file to contain implementation Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 155 +----------- .../parallel_backend_sycl_reduce_then_scan.h | 225 ++++++++++++++++++ .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 57 ----- 3 files changed, 226 insertions(+), 211 deletions(-) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2b037ee1ab9..b40a147bbb4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -38,6 +38,7 @@ #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" +#include "parallel_backend_sycl_reduce_then_scan.h" #include "execution_sycl_defs.h" #include "sycl_iterator.h" #include "unseq_backend_sycl.h" @@ -213,12 +214,6 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; -template -class __reduce_then_scan_reduce_kernel; - -template -class __reduce_then_scan_scan_kernel; - //------------------------------------------------------------------------ // parallel_for - async pattern //------------------------------------------------------------------------ @@ -381,60 +376,6 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name } }; -template -struct __parallel_reduce_then_scan_reduce_submitter; - -template -struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, - __internal::__optional_kernel_name<_KernelName...>> -{ - // Step 1 - SubGroupReduce is expected 
to perform sub-group reductions to global memory - // input buffer - template - auto - operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _TmpStorageAcc __tmp_storage_acc, - const _SubGroupReduce& __sub_group_reduce, const sycl::event& __prior_event, - const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, - const std::size_t __block_num, const bool __is_full_block) const - { - return __exec.queue().submit([&](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { - __cgh.depends_on(__prior_event); - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - __cgh.parallel_for<_KernelName...>(__nd_range, [=](sycl::nd_item<1> __ndi) { - __sub_group_reduce(__ndi, __rng, __tmp_storage_acc, __inputs_per_sub_group, __inputs_per_item, - __block_num, __is_full_block); - }); - }); - } - const sycl::nd_range<1> __nd_range; -}; - -template -struct __parallel_reduce_then_scan_scan_submitter; - -template -struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __internal::__optional_kernel_name<_KernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _TmpStorageAcc __tmp_storage_acc, - const _SubGroupCarryAndScan& __sub_group_carry_and_scan, const sycl::event& __prior_event, - const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, - const std::size_t __block_num, const bool __is_full_block) const - { - return __exec.queue().submit([&](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { - __cgh.depends_on(__prior_event); - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - __cgh.parallel_for<_KernelName...>(__nd_range, [=](sycl::nd_item<1> __ndi) { - __sub_group_carry_and_scan(__ndi, __rng1, __rng2, __tmp_storage_acc, __inputs_per_sub_group, - __inputs_per_item, __block_num, __is_full_block); - }); - }); - } - const sycl::nd_range<1> __nd_range; -}; - template void @@ -811,100 +752,6 @@ __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _E __binary_op, __init, __local_scan, __group_scan, __global_scan); } -template -auto -__parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, - _UnaryOperation __unary_op, - _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __reduce_then_scan_reduce_kernel<_CustomName>>; - using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __reduce_then_scan_scan_kernel<_CustomName>>; - using _ValueType = typename _InitType::__value_type; - - constexpr std::size_t __sub_group_size = 32; - // Empirically determined maximum. May be less for non-full blocks. - constexpr std::uint8_t __max_inputs_per_item = 128; - constexpr bool __inclusive = _Inclusive::value; - - // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough - // that no check is needed? - const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - - // TODO: base on max compute units. Recall disconnect in vendor definitions (# SMs vs. 
# XVEs) - const std::size_t __num_work_groups = 128; - const std::size_t __num_work_items = __num_work_groups * __work_group_size; - const std::size_t __num_sub_groups_local = __work_group_size / __sub_group_size; - const std::size_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - const std::size_t __n = __in_rng.size(); - const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; - std::size_t __num_remaining = __n; - auto __inputs_per_sub_group = - __n >= __max_inputs_per_block - ? __max_inputs_per_block / __num_sub_groups_global - : std::max(__sub_group_size, - oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); - auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __global_range = sycl::range<1>(__num_work_items); - const auto __local_range = sycl::range<1>(__work_group_size); - const auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); - const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; - const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); - - // TODO: Use the trick in reduce to wrap in a shared_ptr with custom deleter to support asynchronous frees. - _ValueType* __tmp_storage = sycl::malloc_device<_ValueType>(__num_sub_groups_global + 1, __exec.queue()); - - // Kernel submitters to build event dependency chain - __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, _ReduceKernel> __reduce_submitter{__kernel_nd_range}; - __parallel_reduce_then_scan_scan_submitter<__sub_group_size, _ScanKernel> __scan_submitter{__kernel_nd_range}; - - // Reduce and scan step implementations - using _ReduceImpl = unseq_backend::__sub_group_reduce<__sub_group_size, __max_inputs_per_item, __inclusive, - _BinaryOperation, _UnaryOperation, _InitType>; - using _ScanImpl = unseq_backend::__sub_group_carry_and_scan<__sub_group_size, __max_inputs_per_item, __inclusive, - _BinaryOperation, _UnaryOperation, _InitType>; - - _ReduceImpl __reduce_step{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, __num_work_items, - __n, __binary_op, __unary_op, __init}; - _ScanImpl __scan_step{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, __num_work_items, - __n, __binary_op, __unary_op, __init}; - - sycl::event __event; - // Data is processed in 2-kernel blocks to allow contiguous input segment to persist in LLC between the first and second kernel for accelerators - // with sufficiently large L2 / L3 caches. - for (std::size_t __b = 0; __b < __num_blocks; ++__b) - { - bool __is_full_block = __inputs_per_item == __max_inputs_per_item; - // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __in_rng, __tmp_storage, __reduce_step, __event, __inputs_per_sub_group, - __inputs_per_item, __b, __is_full_block); - // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __in_rng, __out_rng, __tmp_storage, __scan_step, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __is_full_block); - if (__num_remaining > __block_size) - { - // Resize for the next block. - __num_remaining -= __block_size; - // TODO: This recalculation really only matters for the second to last iteration - // of the loop since the last iteration is the only non-full block. 
- __inputs_per_sub_group = - __num_remaining >= __max_inputs_per_block - ? __max_inputs_per_block / __num_sub_groups_global - : std::max(__sub_group_size, - oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); - __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - } - } - // TODO: Remove to make asynchronous. Depends on completing async USM free TODO. - __event.wait(); - sycl::free(__tmp_storage, __exec.queue()); - return __future(__event); -} - template bool __group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::size_t __n_uniform) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h new file mode 100644 index 00000000000..9dda2410ac2 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -0,0 +1,225 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_reduce_then_scan.h ---------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H + +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "execution_sycl_defs.h" +#include "unseq_backend_sycl.h" +#include "utils_ranges_sycl.h" + +#include "../../utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +// TODO: Scan related specific utilities will be placed here + +template +class __reduce_then_scan_reduce_kernel; + +template +class __reduce_then_scan_scan_kernel; + +template +struct __parallel_reduce_then_scan_reduce_submitter; + +template +struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + _BinaryOperation, _UnaryOperation, _WrappedInitType, + __internal::__optional_kernel_name<_KernelName...>> +{ + // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory + // input buffer + template + auto + operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _TmpStorageAcc __tmp_storage_acc, + const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, + const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const + { + return __exec.queue().submit([&, this](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { + __cgh.depends_on(__prior_event); + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + __cgh.parallel_for<_KernelName...>(__nd_range, [=, *this](sycl::nd_item<1> __ndi) { + // TODO: Add kernel body + }); + }); + } + const sycl::nd_range<1> __nd_range; + + const std::size_t __max_block_size; + const std::size_t __num_sub_groups_local; + const std::size_t __num_sub_groups_global; + const std::size_t __num_work_items; + const std::size_t __n; + + const _BinaryOperation __binary_op; + const _UnaryOperation __unary_op; + const _WrappedInitType __wrapped_init; + + // TODO: Add the mask functors here to generalize for scan-based algorithms +}; + +template 
+struct __parallel_reduce_then_scan_scan_submitter; + +template +struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + _BinaryOperation, _UnaryOperation, _WrappedInitType, + __internal::__optional_kernel_name<_KernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _TmpStorageAcc __tmp_storage_acc, + const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, + const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const + { + return __exec.queue().submit([&, this](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { + __cgh.depends_on(__prior_event); + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + __cgh.parallel_for<_KernelName...>(__nd_range, [=, *this](sycl::nd_item<1> __ndi) { + // TODO: Add kernel body + }); + }); + } + + const sycl::nd_range<1> __nd_range; + + const std::size_t __max_block_size; + const std::size_t __num_sub_groups_local; + const std::size_t __num_sub_groups_global; + const std::size_t __num_work_items; + const std::size_t __n; + + const _BinaryOperation __binary_op; + const _UnaryOperation __unary_op; + const _WrappedInitType __wrapped_init; + + // TODO: Add the mask functors here to generalize for scan-based algorithms +}; + +template +auto +__parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, + _UnaryOperation __unary_op, + _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __reduce_then_scan_reduce_kernel<_CustomName>>; + using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __reduce_then_scan_scan_kernel<_CustomName>>; + using _ValueType = typename _InitType::__value_type; + + constexpr std::size_t __sub_group_size = 32; + // Empirically determined maximum. May be less for non-full blocks. + constexpr std::uint8_t __max_inputs_per_item = 128; + constexpr bool __inclusive = _Inclusive::value; + + // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough + // that no check is needed? + const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // TODO: base on max compute units. Recall disconnect in vendor definitions (# SMs vs. # XVEs) + const std::size_t __num_work_groups = 128; + const std::size_t __num_work_items = __num_work_groups * __work_group_size; + const std::size_t __num_sub_groups_local = __work_group_size / __sub_group_size; + const std::size_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; + const std::size_t __n = __in_rng.size(); + const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; + std::size_t __num_remaining = __n; + auto __inputs_per_sub_group = + __n >= __max_inputs_per_block + ? 
__max_inputs_per_block / __num_sub_groups_global + : std::max(__sub_group_size, + oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; + const auto __global_range = sycl::range<1>(__num_work_items); + const auto __local_range = sycl::range<1>(__work_group_size); + const auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); + const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; + const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); + + // TODO: Use the trick in reduce to wrap in a shared_ptr with custom deleter to support asynchronous frees. + _ValueType* __tmp_storage = sycl::malloc_device<_ValueType>(__num_sub_groups_global + 1, __exec.queue()); + + // Reduce and scan step implementations + using _ReduceSubmitter = + __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + _BinaryOperation, _UnaryOperation, _InitType, _ReduceKernel>; + using _ScanSubmitter = + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + _BinaryOperation, _UnaryOperation, _InitType, _ScanKernel>; + // TODO: remove below before merging. used for convenience now + // clang-format off + _ReduceSubmitter __reduce_submitter{__kernel_nd_range, __max_inputs_per_block, __num_sub_groups_local, + __num_sub_groups_global, __num_work_items, __n, __binary_op, __unary_op, __init}; + _ScanSubmitter __scan_submitter{__kernel_nd_range, __max_inputs_per_block, __num_sub_groups_local, + __num_sub_groups_global, __num_work_items, __n, __binary_op, __unary_op, __init}; + // clang-format on + + sycl::event __event; + // Data is processed in 2-kernel blocks to allow contiguous input segment to persist in LLC between the first and second kernel for accelerators + // with sufficiently large L2 / L3 caches. + for (std::size_t __b = 0; __b < __num_blocks; ++__b) + { + bool __is_full_block = __inputs_per_item == __max_inputs_per_item; + // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. + __event = __reduce_submitter(__exec, __in_rng, __tmp_storage, __event, __inputs_per_sub_group, + __inputs_per_item, __b, __is_full_block); + // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. + __event = __scan_submitter(__exec, __in_rng, __out_rng, __tmp_storage, __event, __inputs_per_sub_group, + __inputs_per_item, __b, __is_full_block); + if (__num_remaining > __block_size) + { + // Resize for the next block. + __num_remaining -= __block_size; + // TODO: This recalculation really only matters for the second to last iteration + // of the loop since the last iteration is the only non-full block. + __inputs_per_sub_group = + __num_remaining >= __max_inputs_per_block + ? __max_inputs_per_block / __num_sub_groups_global + : std::max(__sub_group_size, + oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + __inputs_per_item = __inputs_per_sub_group / __sub_group_size; + } + } + // TODO: Remove to make asynchronous. Depends on completing async USM free TODO. 
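    // One possible shape for the asynchronous cleanup mentioned in the TODOs above (a sketch,
    // not the oneDPL reduce "trick" referenced there): chain a host_task onto the final event
    // so the USM allocation is released only after the last kernel finishes, and return the
    // future without blocking here.
    //
    //     __event = __exec.queue().submit([&](sycl::handler& __cgh) {
    //         __cgh.depends_on(__event);
    //         __cgh.host_task([__q = __exec.queue(), __tmp_storage] { sycl::free(__tmp_storage, __q); });
    //     });
    //     return __future(__event);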
+ __event.wait(); + sycl::free(__tmp_storage, __exec.queue()); + return __future(__event); +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H \ No newline at end of file diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 93f51710300..5bb2735890c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -1105,63 +1105,6 @@ struct __brick_reduce_idx _Size __n; }; -// Kernel 1 body of reduce-then-scan -// Reduces the input per work-group and stores sub-group carry outs -template -struct __sub_group_reduce -{ - // TODO: Implement this operator. - template - void - operator()(const _ItemId __ndi, _InAcc __input, _CarryOutAcc __global_carry_out, - const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, - const std::size_t __block_num, const bool __is_full_block) const - { - } - - const std::size_t __block_size; - const std::size_t __num_sub_groups_local; - const std::size_t __num_sub_groups_global; - const std::size_t __num_work_items; - const std::size_t __n; - - const _BinaryOp __binary_op; - const _UnaryOp __unary_op; - const _WrappedInitType __wrapped_init; - - // TODO: Add the mask functors here to generalize for scan-based algorithms -}; - -// Kernel 2 body of reduce-then-scan -// Determines inter and intra work-group carry ins and stores the full scan results to output buffer -// for an input block -template -struct __sub_group_carry_and_scan -{ - // TODO: Implement this operator - template - void - operator()(const _ItemId __ndi, _InAcc __input, _OutAcc __output, _CarryInAcc __global_carry_in, - const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, - const std::size_t __block_num, const bool __is_full_block) const - { - } - - const std::size_t __block_size; - const std::size_t __num_sub_groups_local; - const std::size_t __num_sub_groups_global; - const std::size_t __num_work_items; - const std::size_t __n; - - const _BinaryOp __binary_op; - const _UnaryOp __unary_op; - const _WrappedInitType __wrapped_init; - - // TODO: Add the mask functors here to generalize for scan-based algorithms -}; - } // namespace unseq_backend } // namespace dpl } // namespace oneapi From ccdb3b0a5c2e5926f03d368a742009e616d73d0e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 10 Jul 2024 17:44:02 -0500 Subject: [PATCH 03/88] Port of kernels from two-pass scan KT branch Signed-off-by: Matthew Michel --- .../parallel_backend_sycl_reduce_then_scan.h | 635 +++++++++++++++++- 1 file changed, 606 insertions(+), 29 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 9dda2410ac2..f0406e9e5e8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -35,7 +35,226 @@ namespace dpl namespace __par_backend_hetero { -// TODO: Scan related specific utilities will be placed here +template +void +__exclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, + _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) +{ + std::uint8_t __sub_group_local_id = 
__sub_group.get_local_linear_id(); + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __shift = 1; __shift <= __sub_group_size / 2; __shift <<= 1) + { + auto __partial_carry_in = sycl::shift_group_right(__sub_group, __value, __shift); + if (__mask_fn(__sub_group_local_id, __shift)) + { + __value = __binary_op(__partial_carry_in, __value); + } + } + _LazyValueType __old_init; + if constexpr (__init_present) + { + __value = __binary_op(__init_and_carry.__v, __value); + if (__sub_group_local_id == 0) + __old_init.__setup(__init_and_carry.__v); + __init_and_carry.__v = sycl::group_broadcast(__sub_group, __value, __init_broadcast_id); + } + else + { + __init_and_carry.__setup(sycl::group_broadcast(__sub_group, __value, __init_broadcast_id)); + } + + __value = sycl::shift_group_right(__sub_group, __value, 1); + if constexpr (__init_present) + { + if (__sub_group_local_id == 0) + { + __value = __old_init.__v; + __old_init.__destroy(); + } + } + //return by reference __value and __init_and_carry +} + +template +void +__inclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, + _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) +{ + std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __shift = 1; __shift <= __sub_group_size / 2; __shift <<= 1) + { + auto __partial_carry_in = sycl::shift_group_right(__sub_group, __value, __shift); + if (__mask_fn(__sub_group_local_id, __shift)) + { + __value = __binary_op(__partial_carry_in, __value); + } + } + if constexpr (__init_present) + { + __value = __binary_op(__init_and_carry.__v, __value); + __init_and_carry.__v = sycl::group_broadcast(__sub_group, __value, __init_broadcast_id); + } + else + { + __init_and_carry.__setup(sycl::group_broadcast(__sub_group, __value, __init_broadcast_id)); + } + + //return by reference __value and __init_and_carry +} + +template +void +__sub_group_scan(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __binary_op, + _LazyValueType& __init_and_carry) +{ + auto __mask_fn = [](auto __sub_group_local_id, auto __offset) { return __sub_group_local_id >= __offset; }; + constexpr auto __init_broadcast_id = __sub_group_size - 1; + if constexpr (__is_inclusive) + { + __inclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, __mask_fn, __init_broadcast_id, + __value, __binary_op, __init_and_carry); + } + else + { + __exclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, __mask_fn, __init_broadcast_id, + __value, __binary_op, __init_and_carry); + } +} + +template +void +__sub_group_scan_partial(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __binary_op, + _LazyValueType& __init_and_carry, _SizeType __elements_to_process) +{ + auto __mask_fn = [__elements_to_process](auto __sub_group_local_id, auto __offset) { + return __sub_group_local_id >= __offset && __sub_group_local_id < __elements_to_process; + }; + auto __init_broadcast_id = __elements_to_process - 1; + if constexpr (__is_inclusive) + { + __inclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, __mask_fn, __init_broadcast_id, + __value, __binary_op, __init_and_carry); + } + else + { + __exclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, __mask_fn, __init_broadcast_id, + __value, __binary_op, __init_and_carry); + } +} + +template +void +__scan_through_elements_helper(const _SubGroup& __sub_group, _UnaryOp __unary_op, 
_BinaryOp __binary_op, + _LazyValueType& __sub_group_carry, _InRng __in_rng, _OutRng __out_rng, + std::size_t __start_idx, std::size_t __n, std::uint32_t __iters_per_item, + std::size_t __subgroup_start_idx, std::uint32_t __sub_group_id, + std::uint32_t __active_subgroups) +{ + bool __is_full_block = (__iters_per_item == __max_inputs_per_item); + bool __is_full_thread = __subgroup_start_idx + __iters_per_item * __sub_group_size <= __n; + if (__is_full_thread && __is_full_block) + { + auto __v = __unary_op(__in_rng[__start_idx]); + __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __v, __binary_op, + __sub_group_carry); + if constexpr (__capture_output) + { + __out_rng[__start_idx] = __v; + } + + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) + { + __v = __unary_op(__in_rng[__start_idx + __j * __sub_group_size]); + __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>(__sub_group, __v, __binary_op, + __sub_group_carry); + if constexpr (__capture_output) + { + __out_rng[__start_idx + __j * __sub_group_size] = __v; + } + } + } + else if (__is_full_thread) + { + auto __v = __unary_op(__in_rng[__start_idx]); + __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __v, __binary_op, + __sub_group_carry); + if constexpr (__capture_output) + { + __out_rng[__start_idx] = __v; + } + for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) + { + __v = __unary_op(__in_rng[__start_idx + __j * __sub_group_size]); + __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>(__sub_group, __v, __binary_op, + __sub_group_carry); + if constexpr (__capture_output) + { + __out_rng[__start_idx + __j * __sub_group_size] = __v; + } + } + } + else + { + if (__sub_group_id < __active_subgroups) + { + auto __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_idx, __sub_group_size); + + if (__iters == 1) + { + auto __v = __unary_op(__in_rng[__start_idx]); + __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( + __sub_group, __v, __binary_op, __sub_group_carry, __n - __subgroup_start_idx); + if constexpr (__capture_output) + { + if (__start_idx < __n) + __out_rng[__start_idx] = __v; + } + } + else + { + auto __v = __unary_op(__in_rng[__start_idx]); + __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __v, __binary_op, + __sub_group_carry); + if constexpr (__capture_output) + { + __out_rng[__start_idx] = __v; + } + + for (int __j = 1; __j < __iters - 1; __j++) + { + auto __local_idx = __start_idx + __j * __sub_group_size; + __v = __unary_op(__in_rng[__local_idx]); + __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( + __sub_group, __v, __binary_op, __sub_group_carry); + if constexpr (__capture_output) + { + __out_rng[__local_idx] = __v; + } + } + + auto __offset = __start_idx + (__iters - 1) * __sub_group_size; + auto __local_idx = (__offset < __n) ? 
__offset : __n - 1; + __v = __unary_op(__in_rng[__local_idx]); + __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( + __sub_group, __v, __binary_op, __sub_group_carry, + __n - (__subgroup_start_idx + (__iters - 1) * __sub_group_size)); + if constexpr (__capture_output) + { + if (__offset < __n) + __out_rng[__offset] = __v; + } + } + } + } +} template class __reduce_then_scan_reduce_kernel; @@ -43,32 +262,128 @@ class __reduce_then_scan_reduce_kernel; template class __reduce_then_scan_scan_kernel; -template +template struct __parallel_reduce_then_scan_reduce_submitter; -template -struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _BinaryOperation, _UnaryOperation, _WrappedInitType, +template +struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, + _BinaryOperation, _UnaryOperation, _InitType, __internal::__optional_kernel_name<_KernelName...>> { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template auto - operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _TmpStorageAcc __tmp_storage_acc, + operator()(_ExecutionPolicy&& __exec, _InRng&& __in_rng, _TmpStorageAcc __tmp_storage, const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const { - return __exec.queue().submit([&, this](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { + using _InValueType = oneapi::dpl::__internal::__value_t<_InRng>; + return __exec.queue().submit([&, this](sycl::handler& __cgh) { + sycl::local_accessor<_InValueType> __sub_group_partials(__num_sub_groups_local, __cgh); __cgh.depends_on(__prior_event); - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - __cgh.parallel_for<_KernelName...>(__nd_range, [=, *this](sycl::nd_item<1> __ndi) { - // TODO: Add kernel body + oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); + __cgh.parallel_for<_KernelName...>(__nd_range, [=, + *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( + __sub_group_size)]] { + auto __id = __ndi.get_global_id(0); + auto __lid = __ndi.get_local_id(0); + auto __g = __ndi.get_group(0); + auto __sub_group = __ndi.get_sub_group(); + auto __sub_group_id = __sub_group.get_group_linear_id(); + auto __sub_group_local_id = __sub_group.get_local_linear_id(); + + oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __sub_group_carry; + std::size_t __group_start_idx = + (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + if (__n <= __group_start_idx) + return; // exit early for empty groups (TODO: avoid launching these?) 
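                // Rough sizing intuition for the indexing below (an illustrative example only,
                // since __work_group_size is device-dependent): with a 1024-item work-group, a
                // 32-lane sub-group and 128 work-groups there are 32 sub-groups per group and
                // 4096 sub-groups in total, so a full block assigns __inputs_per_sub_group = 4096
                // contiguous elements to each sub-group (__inputs_per_item = 128).
                // __active_subgroups below simply counts how many of this group's sub-groups
                // still have input when the block tail is smaller than that.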
+ + std::size_t __elements_in_group = + std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); + std::uint32_t __active_subgroups = + oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); + std::size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); + + std::size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; + + if (__sub_group_id < __active_subgroups) + { + // adjust for lane-id + // compute sub-group local pfix on T0..63, K samples/T, send to accumulator kernel + __scan_through_elements_helper<__sub_group_size, __is_inclusive, + /*__init_present=*/false, + /*__capture_output=*/false, __max_inputs_per_item>( + __sub_group, __unary_op, __binary_op, __sub_group_carry, __in_rng, nullptr, __start_idx, __n, + __inputs_per_item, __subgroup_start_idx, __sub_group_id, __active_subgroups); + if (__sub_group_local_id == 0) + __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; + __sub_group_carry.__destroy(); + } + // TODO: This is slower then ndi.barrier which was removed in SYCL2020. Can we do anything about it? + //sycl::group_barrier(ndi.get_group()); + __ndi.barrier(sycl::access::fence_space::local_space); + + // compute sub-group local prefix sums on (T0..63) carries + // and store to scratch space at the end of dst; next + // accumulator kernel takes M thread carries from scratch + // to compute a prefix sum on global carries + if (__sub_group_id == 0) + { + __start_idx = (__g * __num_sub_groups_local); + std::uint8_t __iters = + oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); + if (__iters == 1) + { + auto __load_idx = (__sub_group_local_id < __active_subgroups) + ? __sub_group_local_id + : (__active_subgroups - 1); // else is unused dummy value + auto __v = __sub_group_partials[__load_idx]; + __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( + __sub_group, __v, __binary_op, __sub_group_carry, + __active_subgroups - __subgroup_start_idx); + if (__sub_group_local_id < __active_subgroups) + __tmp_storage[__start_idx + __sub_group_local_id] = __v; + } + else + { + //need to pull out first iteration tp avoid identity + auto __v = __sub_group_partials[__sub_group_local_id]; + __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( + __sub_group, __v, __binary_op, __sub_group_carry); + __tmp_storage[__start_idx + __sub_group_local_id] = __v; + + for (int __i = 1; __i < __iters - 1; __i++) + { + __v = __sub_group_partials[__i * __sub_group_size + __sub_group_local_id]; + __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( + __sub_group, __v, __binary_op, __sub_group_carry); + __tmp_storage[__start_idx + __i * __sub_group_size + __sub_group_local_id] = __v; + } + // If we are past the input range, then the previous value of v is passed to the sub-group scan. + // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. + + // else is an unused dummy value + auto __proposed_idx = (__iters - 1) * __sub_group_size + __sub_group_local_id; + auto __load_idx = + (__proposed_idx < __num_sub_groups_local) ? 
__proposed_idx : (__num_sub_groups_local - 1); + + __v = __sub_group_partials[__load_idx]; + __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( + __sub_group, __v, __binary_op, __sub_group_carry, __num_sub_groups_local); + if (__proposed_idx < __num_sub_groups_local) + __tmp_storage[__start_idx + __proposed_idx] = __v; + } + + __sub_group_carry.__destroy(); + } }); }); } + + // Constant parameters throughout all blocks const sycl::nd_range<1> __nd_range; const std::size_t __max_block_size; @@ -79,32 +394,294 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu const _BinaryOperation __binary_op; const _UnaryOperation __unary_op; - const _WrappedInitType __wrapped_init; + _InitType __init; // TODO: Add the mask functors here to generalize for scan-based algorithms }; -template +template struct __parallel_reduce_then_scan_scan_submitter; -template -struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _BinaryOperation, _UnaryOperation, _WrappedInitType, +template +struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, + _BinaryOperation, _UnaryOperation, _InitType, __internal::__optional_kernel_name<_KernelName...>> { - template + template auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _TmpStorageAcc __tmp_storage_acc, + operator()(_ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc __tmp_storage, const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const { - return __exec.queue().submit([&, this](sycl::handler& __cgh) [[sycl::reqd_sub_group_size(__sub_group_size)]] { + using _InValueType = oneapi::dpl::__internal::__value_t<_InRng>; + return __exec.queue().submit([&, this](sycl::handler& __cgh) { + sycl::local_accessor<_InValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); __cgh.depends_on(__prior_event); - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - __cgh.parallel_for<_KernelName...>(__nd_range, [=, *this](sycl::nd_item<1> __ndi) { - // TODO: Add kernel body + oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng); + __cgh.parallel_for<_KernelName...>(__nd_range, [=, + *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( + __sub_group_size)]] { + auto __id = __ndi.get_global_id(0); + auto __lid = __ndi.get_local_id(0); + auto __g = __ndi.get_group(0); + auto __sub_group = __ndi.get_sub_group(); + auto __sub_group_id = __sub_group.get_group_linear_id(); + auto __sub_group_local_id = __sub_group.get_local_linear_id(); + + auto __group_start_idx = + (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + if (__n <= __group_start_idx) + return; // exit early for empty groups (TODO: avoid launching these?) 
+ + std::size_t __elements_in_group = + std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); + std::uint32_t __active_subgroups = + oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); + oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __carry_last; + oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __value; + + // propogate carry in from previous block + oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __sub_group_carry; + + // on the first sub-group in a work-group (assuming S subgroups in a work-group): + // 1. load S sub-group local carry pfix sums (T0..TS-1) to slm + // 2. load 32, 64, 96, etc. TS-1 work-group carry-outs (32 for WG num<32, 64 for WG num<64, etc.), + // and then compute the prefix sum to generate global carry out + // for each WG, i.e., prefix sum on TS-1 carries over all WG. + // 3. on each WG select the adjacent neighboring WG carry in + // 4. on each WG add the global carry-in to S sub-group local pfix sums to + // get a T-local global carry in + // 5. recompute T-local pfix values, add the T-local global carries, + // and then write back the final values to memory + if (__sub_group_id == 0) + { + // step 1) load to Xe slm the WG-local S prefix sums + // on WG T-local carries + // 0: T0 carry, 1: T0 + T1 carry, 2: T0 + T1 + T2 carry, ... + // S: sum(T0 carry...TS carry) + std::uint8_t __iters = + oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); + auto __subgroups_before_my_group = __g * __num_sub_groups_local; + std::uint8_t __i = 0; + for (; __i < __iters - 1; __i++) + { + __sub_group_partials[__i * __sub_group_size + __sub_group_local_id] = + __tmp_storage[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; + } + if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) + { + __sub_group_partials[__i * __sub_group_size + __sub_group_local_id] = + __tmp_storage[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; + } + + // step 2) load 32, 64, 96, etc. work-group carry outs on every work-group; then + // compute the prefix in a sub-group to get global work-group carries + // memory accesses: gather(63, 127, 191, 255, ...) + std::uint32_t __offset = __num_sub_groups_local - 1; + // only need 32 carries for WGs0..WG32, 64 for WGs32..WGs64, etc. + if (__g > 0) + { + // only need the last element from each scan of num_sub_groups_local subgroup reductions + const auto __elements_to_process = __subgroups_before_my_group / __num_sub_groups_local; + const auto __pre_carry_iters = + oneapi::dpl::__internal::__dpl_ceiling_div(__elements_to_process, __sub_group_size); + if (__pre_carry_iters == 1) + { + // single partial scan + auto __proposed_idx = __num_sub_groups_local * __sub_group_local_id + __offset; + auto __remaining_elements = __elements_to_process; + auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) + ? 
__proposed_idx + : __subgroups_before_my_group - 1; + __value.__setup(__tmp_storage[__reduction_idx]); + __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, + /*__init_present=*/false>(__sub_group, __value.__v, __binary_op, + __carry_last, __remaining_elements); + } + else + { + // multiple iterations + // first 1 full + __value.__setup(__tmp_storage[__num_sub_groups_local * __sub_group_local_id + __offset]); + __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( + __sub_group, __value.__v, __binary_op, __carry_last); + + // then some number of full iterations + for (int __i = 1; __i < __pre_carry_iters - 1; __i++) + { + auto __reduction_idx = __i * __num_sub_groups_local * __sub_group_size + + __num_sub_groups_local * __sub_group_local_id + __offset; + __value.__v = __tmp_storage[__reduction_idx]; + __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( + __sub_group, __value.__v, __binary_op, __carry_last); + } + + // final partial iteration + auto __proposed_idx = (__pre_carry_iters - 1) * __num_sub_groups_local * __sub_group_size + + __num_sub_groups_local * __sub_group_local_id + __offset; + auto __remaining_elements = + __elements_to_process - ((__pre_carry_iters - 1) * __sub_group_size); + auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) + ? __proposed_idx + : __subgroups_before_my_group - 1; + __value.__v = __tmp_storage[__reduction_idx]; + __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, + /*__init_present=*/true>(__sub_group, __value.__v, __binary_op, + __carry_last, __remaining_elements); + } + } + } + // For the exclusive scan case: + // While the first sub-group is doing work, have the last item in the group store the last element + // in the block to temporary storage for use in the next block. + // This is required to support in-place exclusive scans as the input values will be overwritten. + if constexpr (!__is_inclusive) + { + auto __global_id = __ndi.get_global_linear_id(); + if (__global_id == __num_work_items - 1) + { + std::size_t __last_idx_in_block = std::min(__n - 1, __max_block_size * (__block_num + 1) - 1); + __tmp_storage[__num_sub_groups_global] = __in_rng[__last_idx_in_block]; + } + } + + // N.B. 
barrier could be earlier, guarantees slm local carry update + //sycl::group_barrier(ndi.get_group()); + __ndi.barrier(sycl::access::fence_space::local_space); + + // steps 3/4) load global carry in from neighbor work-group + // and apply to local sub-group prefix carries + if ((__sub_group_id == 0) && (__g > 0)) + { + auto __carry_offset = 0; + + std::uint8_t __iters = + oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); + + std::uint8_t __i = 0; + for (; __i < __iters - 1; ++__i) + { + __sub_group_partials[__carry_offset + __sub_group_local_id] = + __binary_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __carry_offset += __sub_group_size; + } + if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) + { + __sub_group_partials[__carry_offset + __sub_group_local_id] = + __binary_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __carry_offset += __sub_group_size; + } + if (__sub_group_local_id == 0) + __sub_group_partials[__active_subgroups] = __carry_last.__v; + __carry_last.__destroy(); + } + __value.__destroy(); + + //sycl::group_barrier(ndi.get_group()); + __ndi.barrier(sycl::access::fence_space::local_space); + + // Get inter-work group and adjusted for intra-work group prefix + bool __sub_group_carry_initialized = true; + if (__block_num == 0) + { + if (__sub_group_id > 0) + { + auto __value = __sub_group_partials[__sub_group_id - 1]; + oneapi::dpl::unseq_backend::__init_processing<_InValueType>{}(__init, __value, __binary_op); + __sub_group_carry.__setup(__value); + } + else if (__g > 0) + { + auto __value = __sub_group_partials[__active_subgroups]; + oneapi::dpl::unseq_backend::__init_processing<_InValueType>{}(__init, __value, __binary_op); + __sub_group_carry.__setup(__value); + } + else + { + if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value< + typename _InitType::__value_type>>) + { + // This is the only case where we still don't have a carry in. No init value, 0th block, + // group, and subgroup. This changes the final scan through elements below. + __sub_group_carry_initialized = false; + } + else + { + __sub_group_carry.__setup(__init.__value); + } + } + } + else + { + if (__sub_group_id > 0) + { + if constexpr (__is_inclusive) + __sub_group_carry.__setup(__binary_op(__out_rng[__block_num * __max_block_size - 1], + __sub_group_partials[__sub_group_id - 1])); + else // The last block wrote an exclusive result, so we must make it inclusive. + { + // Grab the last element from the previous block that has been cached in temporary + // storage in the second kernel of the previous block. + _InValueType __last_block_element = __unary_op(__tmp_storage[__num_sub_groups_global]); + __sub_group_carry.__setup(__binary_op( + __binary_op(__out_rng[__block_num * __max_block_size - 1], __last_block_element), + __sub_group_partials[__sub_group_id - 1])); + } + } + else if (__g > 0) + { + if constexpr (__is_inclusive) + __sub_group_carry.__setup(__binary_op(__out_rng[__block_num * __max_block_size - 1], + __sub_group_partials[__active_subgroups])); + else // The last block wrote an exclusive result, so we must make it inclusive. + { + // Grab the last element from the previous block that has been cached in temporary + // storage in the second kernel of the previous block. 
+ _InValueType __last_block_element = __unary_op(__tmp_storage[__num_sub_groups_global]); + __sub_group_carry.__setup(__binary_op( + __binary_op(__out_rng[__block_num * __max_block_size - 1], __last_block_element), + __sub_group_partials[__active_subgroups])); + } + } + else + { + if constexpr (__is_inclusive) + __sub_group_carry.__setup(__out_rng[__block_num * __max_block_size - 1]); + else // The last block wrote an exclusive result, so we must make it inclusive. + { + // Grab the last element from the previous block that has been cached in temporary + // storage in the second kernel of the previous block. + _InValueType __last_block_element = __unary_op(__tmp_storage[__num_sub_groups_global]); + __sub_group_carry.__setup( + __binary_op(__out_rng[__block_num * __max_block_size - 1], __last_block_element)); + } + } + } + + // step 5) apply global carries + size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); + size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; + + if (__sub_group_carry_initialized) + { + __scan_through_elements_helper<__sub_group_size, __is_inclusive, + /*__init_present=*/true, + /*__capture_output=*/true, __max_inputs_per_item>( + __sub_group, __unary_op, __binary_op, __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, + __inputs_per_item, __subgroup_start_idx, __sub_group_id, __active_subgroups); + + __sub_group_carry.__destroy(); + } + else // first group first block, no subgroup carry + { + __scan_through_elements_helper<__sub_group_size, __is_inclusive, + /*__init_present=*/false, + /*__capture_output=*/true, __max_inputs_per_item>( + __sub_group, __unary_op, __binary_op, __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, + __inputs_per_item, __subgroup_start_idx, __sub_group_id, __active_subgroups); + } }); }); } @@ -119,16 +696,16 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs const _BinaryOperation __binary_op; const _UnaryOperation __unary_op; - const _WrappedInitType __wrapped_init; + _InitType __init; // TODO: Add the mask functors here to generalize for scan-based algorithms }; -template auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, + _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOperation __binary_op, _UnaryOperation __unary_op, _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) { From b465d840b471e899ac961321ba592747cb123473 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 11 Jul 2024 09:19:25 -0500 Subject: [PATCH 04/88] Move the single-element last element storage for exclusive_scan after the init computation Signed-off-by: Matthew Michel --- .../parallel_backend_sycl_reduce_then_scan.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f0406e9e5e8..fc0b60fbeaf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -533,19 +533,6 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs } } } - // For the exclusive scan case: - // While the first sub-group is doing work, have the last item in the group store the last element - // 
in the block to temporary storage for use in the next block. - // This is required to support in-place exclusive scans as the input values will be overwritten. - if constexpr (!__is_inclusive) - { - auto __global_id = __ndi.get_global_linear_id(); - if (__global_id == __num_work_items - 1) - { - std::size_t __last_idx_in_block = std::min(__n - 1, __max_block_size * (__block_num + 1) - 1); - __tmp_storage[__num_sub_groups_global] = __in_rng[__last_idx_in_block]; - } - } // N.B. barrier could be earlier, guarantees slm local carry update //sycl::group_barrier(ndi.get_group()); @@ -659,6 +646,19 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs } } } + // For the exclusive scan case: + // Have the last item in the group store the last element + // in the block to temporary storage for use in the next block. + // This is required to support in-place exclusive scans as the input values will be overwritten. + if constexpr (!__is_inclusive) + { + auto __global_id = __ndi.get_global_linear_id(); + if (__global_id == __num_work_items - 1) + { + std::size_t __last_idx_in_block = std::min(__n - 1, __max_block_size * (__block_num + 1) - 1); + __tmp_storage[__num_sub_groups_global] = __in_rng[__last_idx_in_block]; + } + } // step 5) apply global carries size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); From 47360a004773be143e96c31fed8dbc0b94c9dc42 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 11 Jul 2024 13:37:39 -0700 Subject: [PATCH 05/88] Use init value type for init processing helper --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index fc0b60fbeaf..b46f86f4826 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -416,6 +416,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const { using _InValueType = oneapi::dpl::__internal::__value_t<_InRng>; + using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { sycl::local_accessor<_InValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); __cgh.depends_on(__prior_event); @@ -576,19 +577,18 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs if (__sub_group_id > 0) { auto __value = __sub_group_partials[__sub_group_id - 1]; - oneapi::dpl::unseq_backend::__init_processing<_InValueType>{}(__init, __value, __binary_op); + oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __binary_op); __sub_group_carry.__setup(__value); } else if (__g > 0) { auto __value = __sub_group_partials[__active_subgroups]; - oneapi::dpl::unseq_backend::__init_processing<_InValueType>{}(__init, __value, __binary_op); + oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __binary_op); __sub_group_carry.__setup(__value); } else { - if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value< - typename _InitType::__value_type>>) + if constexpr (std::is_same_v<_InitType, 
oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) { // This is the only case where we still don't have a carry in. No init value, 0th block, // group, and subgroup. This changes the final scan through elements below. @@ -799,4 +799,4 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ } // namespace dpl } // namespace oneapi -#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H \ No newline at end of file +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H From 3bf06025ff18db1068a88ce42540003dce5c5763 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 12 Jul 2024 14:37:04 -0700 Subject: [PATCH 06/88] Lower single work-group upper limit to 2048 elements (empirically found) Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index b40a147bbb4..0343f72e9dd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -756,7 +756,7 @@ template bool __group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::size_t __n_uniform) { - constexpr int __single_group_upper_limit = 16384; + constexpr int __single_group_upper_limit = 2048; // Pessimistically only use half of the memory to take into account memory used by compiled kernel const ::std::size_t __max_slm_size = From 46c1a50e57469d4f292d183c54d89f4b5c28f100 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:32:53 -0400 Subject: [PATCH 07/88] [PROTOTYPE] Generalized two pass algorithm and copy_if (#1700) This PR changes the two pass algorithm to be more generalized for use with other scan-like algorithms like copy_if. 
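To make the shape of that generalization concrete, here is a small sequential model of the new customization points. The names mirror the roles of _GenReduceInput/_GenScanInput, _ScanInputTransform and _WriteOp, but the loop is a host-side simplification written for this description, not the two-kernel device algorithm:

    #include <cstddef>
    #include <iostream>
    #include <tuple>
    #include <vector>

    // Sequential model of the generalized scan protocol: gen produces the value fed to the
    // scan, xform extracts the component that participates in the reduction, and write
    // performs the final store. Illustration only; the device code additionally splits the
    // work into a reduce kernel and a scan kernel per block.
    template <typename InRng, typename OutRng, typename Gen, typename Xform, typename Reduce,
              typename Write, typename Carry>
    void
    model_scan(const InRng& in, OutRng& out, Gen gen, Xform xform, Reduce reduce, Write write, Carry carry)
    {
        for (std::size_t i = 0; i < in.size(); ++i)
        {
            auto v = gen(in, i);
            carry = reduce(carry, xform(v)); // running inclusive value
            write(out, i, carry, v);
        }
    }

    int main()
    {
        const std::vector<int> in{3, -1, 4, -1, 5, -9, 2, 6};

        // Inclusive scan: gen reads the element, write stores the running value at the same index.
        std::vector<int> sums(in.size());
        model_scan(
            in, sums, [](const auto& r, std::size_t i) { return r[i]; }, [](int v) { return v; },
            [](int a, int b) { return a + b; }, [](auto& o, std::size_t i, int run, int) { o[i] = run; }, 0);

        // copy_if of positive values: gen produces (0/1 count, keep flag, element), only the count
        // is scanned, and write scatters a kept element to index (running count - 1).
        std::vector<int> kept(in.size());
        const auto pred = [](int x) { return x > 0; };
        model_scan(
            in, kept,
            [&](const auto& r, std::size_t i) { return std::tuple(pred(r[i]) ? 1 : 0, pred(r[i]), r[i]); },
            [](const auto& t) { return std::get<0>(t); }, [](int a, int b) { return a + b; },
            [](auto& o, std::size_t, int run, const auto& t) {
                if (std::get<1>(t))
                    o[run - 1] = std::get<2>(t);
            },
            0);

        for (int x : sums)
            std::cout << x << ' ';
        std::cout << '\n';
        for (int x : kept)
            std::cout << x << ' '; // only the first (number of kept elements) entries are meaningful
        std::cout << '\n';
    }

Swapping those functors is all that separates a plain transform-scan from a stream compaction, which is the design point this patch exploits.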
This PR adds copy_if as an example --------- Signed-off-by: Dan Hoeflinger Signed-off-by: Matthew Michel Co-authored-by: Adam Fidel <110841220+adamfidel@users.noreply.github.com> Co-authored-by: Matthew Michel <106704043+mmichel11@users.noreply.github.com> --- .../internal/exclusive_scan_by_segment_impl.h | 3 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 176 +++++++-- .../parallel_backend_sycl_reduce_then_scan.h | 350 +++++++++--------- .../dpcpp/parallel_backend_sycl_utils.h | 1 + 4 files changed, 314 insertions(+), 216 deletions(-) diff --git a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h index cd33872aa0d..266fad7e410 100644 --- a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h +++ b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h @@ -161,7 +161,8 @@ exclusive_scan_by_segment_impl(__internal::__hetero_tag<_BackendTag>, Policy&& p transform_inclusive_scan(::std::move(policy2), make_zip_iterator(_temp.get(), _flags.get()), make_zip_iterator(_temp.get(), _flags.get()) + n, make_zip_iterator(result, _flags.get()), internal::segmented_scan_fun(binary_op), - oneapi::dpl::__internal::__no_op(), ::std::make_tuple(init, FlagType(1))); + oneapi::dpl::__internal::__no_op(), + oneapi::dpl::__internal::make_tuple(init, FlagType(1))); return result + n; } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 0343f72e9dd..7027f792b40 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -766,6 +766,82 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::s return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size); } +template +struct __gen_transform_input +{ + template + auto + operator()(InRng&& __in_rng, std::size_t __idx) const + { + using _ValueType = oneapi::dpl::__internal::__value_t; + using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t::type>; + return _OutValueType{__unary_op(__in_rng[__idx])}; + } + _UnaryOp __unary_op; +}; + +struct __simple_write_to_idx +{ + template + void + operator()(_OutRng&& __out, std::size_t __idx, const ValueType& __v) const + { + __out[__idx] = __v; + } +}; + +template +struct __gen_count_pred +{ + template + _SizeType + operator()(_InRng&& __in_rng, _SizeType __idx) const + { + return __pred(__in_rng[__idx]) ? _SizeType{1} : _SizeType{0}; + } + _Predicate __pred; +}; + +template +struct __gen_expand_count_pred +{ + template + auto + operator()(_InRng&& __in_rng, _SizeType __idx) const + { + // Explicitly creating this element type is necessary to avoid modifying the input data when _InRng is a + // zip_iterator which will return a tuple of references when dereferenced. With this explicit type, we copy + // the values of zipped the input types rather than their references. + using _ElementType = + oneapi::dpl::__internal::__decay_with_tuple_specialization_t>; + _ElementType ele = __in_rng[__idx]; + bool mask = __pred(ele); + return std::tuple(mask ? 
_SizeType{1} : _SizeType{0}, mask, ele); + } + _Predicate __pred; +}; + +struct __get_zeroth_element +{ + template + auto& + operator()(_Tp&& __a) const + { + return std::get<0>(std::forward<_Tp>(__a)); + } +}; + +struct __write_to_idx_if +{ + template + void + operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const + { + if (std::get<1>(__v)) + __out[std::get<0>(__v) - 1] = std::get<2>(__v); + } +}; + template auto @@ -774,39 +850,62 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _InitType __init, _BinaryOperation __binary_op, _Inclusive) { using _Type = typename _InitType::__value_type; - - // Next power of 2 greater than or equal to __n - auto __n_uniform = __n; - if ((__n_uniform & (__n_uniform - 1)) != 0) - __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1; - - // TODO: can we reimplement this with support fort non-identities as well? We can then use in reduce-then-scan - // for the last block if it is sufficiently small - constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; - if constexpr (__can_use_group_scan) + // Reduce-then-scan is dependent on sycl::shift_group_right which requires the underlying type to be trivially + // copyable. If this is not met, then we must fallback to the legacy implementation. The single work-group implementation + // requires a fundamental type which must also be trivially copyable. + if constexpr (std::is_trivially_copyable_v<_Type>) { - if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform)) + // Next power of 2 greater than or equal to __n + auto __n_uniform = __n; + if ((__n_uniform & (__n_uniform - 1)) != 0) + __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1; + + // TODO: can we reimplement this with support for non-identities as well? 
We can then use in reduce-then-scan + // for the last block if it is sufficiently small + constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; + if constexpr (__can_use_group_scan) { - return __parallel_transform_scan_single_group( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), - ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); + if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform)) + { + return __parallel_transform_scan_single_group( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), + ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); + } } + oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; + return __future(__parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, + oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}) + .event()); + } + else + { + using _Assigner = unseq_backend::__scan_assigner; + using _NoAssign = unseq_backend::__scan_no_assign; + using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; + using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; + + _Assigner __assign_op; + _NoAssign __no_assign_op; + _NoOpFunctor __get_data_op; + + return __future( + __parallel_transform_scan_base( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, + _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, + __assign_op, __assign_op, __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}) + .event()); } - - // TODO: Reintegrate once support has been added - //// Either we can't use group scan or this input is too big for one workgroup - //using _Assigner = unseq_backend::__scan_assigner; - //using _NoAssign = unseq_backend::__scan_no_assign; - //using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; - //using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - //_Assigner __assign_op; - //_NoAssign __no_assign_op; - //_NoOpFunctor __get_data_op; - return __future(__parallel_transform_reduce_then_scan(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__in_rng), ::std::forward<_Range2>(__out_rng), - __binary_op, __unary_op, __init, _Inclusive{}) - .event()); } template @@ -912,15 +1011,14 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, // The kernel stores n integers for the predicate and another n integers for the offsets const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2; - constexpr ::std::uint16_t __single_group_upper_limit = 16384; + constexpr ::std::uint16_t __single_group_upper_limit = 2048; ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); 
if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) { - using _SizeBreakpoints = - ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>; + using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), @@ -929,13 +1027,15 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, else { using _ReduceOp = ::std::plus<_Size>; - using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; - using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ ::std::true_type, 1>; - return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, - CreateOp{__pred}, CopyOp{}); + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), oneapi::dpl::__par_backend_hetero::__gen_count_pred<_Pred>{__pred}, + _ReduceOp{}, oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred<_Pred>{__pred}, + oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, + oneapi::dpl::__par_backend_hetero::__write_to_idx_if{}, + oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, + /*_Inclusive=*/std::true_type{}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index b46f86f4826..2979120ad1a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -148,10 +148,11 @@ __sub_group_scan_partial(const _SubGroup& __sub_group, _ValueType& __value, _Bin } template + std::uint32_t __max_inputs_per_item, typename _SubGroup, typename _GenInput, typename _ScanInputTransform, + typename _BinaryOp, typename _WriteOp, typename _LazyValueType, typename _InRng, typename _OutRng> void -__scan_through_elements_helper(const _SubGroup& __sub_group, _UnaryOp __unary_op, _BinaryOp __binary_op, +__scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_input, + _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, _LazyValueType& __sub_group_carry, _InRng __in_rng, _OutRng __out_rng, std::size_t __start_idx, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_idx, std::uint32_t __sub_group_id, @@ -161,43 +162,43 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _UnaryOp __unary_op bool __is_full_thread = __subgroup_start_idx + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread && __is_full_block) { - auto __v = __unary_op(__in_rng[__start_idx]); - __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __v, __binary_op, - __sub_group_carry); + auto __v = __gen_input(__in_rng, __start_idx); + __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), + __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __out_rng[__start_idx] = __v; + __write_op(__out_rng, 
__start_idx, __v); } _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __unary_op(__in_rng[__start_idx + __j * __sub_group_size]); - __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>(__sub_group, __v, __binary_op, - __sub_group_carry); + __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); + __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __out_rng[__start_idx + __j * __sub_group_size] = __v; + __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); } } } else if (__is_full_thread) { - auto __v = __unary_op(__in_rng[__start_idx]); - __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __v, __binary_op, - __sub_group_carry); + auto __v = __gen_input(__in_rng, __start_idx); + __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), + __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __out_rng[__start_idx] = __v; + __write_op(__out_rng, __start_idx, __v); } for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __unary_op(__in_rng[__start_idx + __j * __sub_group_size]); - __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>(__sub_group, __v, __binary_op, - __sub_group_carry); + __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); + __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __out_rng[__start_idx + __j * __sub_group_size] = __v; + __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); } } } @@ -209,47 +210,48 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _UnaryOp __unary_op if (__iters == 1) { - auto __v = __unary_op(__in_rng[__start_idx]); + auto __v = __gen_input(__in_rng, __start_idx); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( - __sub_group, __v, __binary_op, __sub_group_carry, __n - __subgroup_start_idx); + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, + __n - __subgroup_start_idx); if constexpr (__capture_output) { if (__start_idx < __n) - __out_rng[__start_idx] = __v; + __write_op(__out_rng, __start_idx, __v); } } else { - auto __v = __unary_op(__in_rng[__start_idx]); - __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __v, __binary_op, - __sub_group_carry); + auto __v = __gen_input(__in_rng, __start_idx); + __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __out_rng[__start_idx] = __v; + __write_op(__out_rng, __start_idx, __v); } for (int __j = 1; __j < __iters - 1; __j++) { auto __local_idx = __start_idx + __j * __sub_group_size; - __v = __unary_op(__in_rng[__local_idx]); + __v = __gen_input(__in_rng, __local_idx); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( - __sub_group, __v, __binary_op, __sub_group_carry); + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __out_rng[__local_idx] = __v; + __write_op(__out_rng, __local_idx, __v); } } auto __offset = __start_idx + (__iters - 1) * __sub_group_size; auto __local_idx = 
(__offset < __n) ? __offset : __n - 1; - __v = __unary_op(__in_rng[__local_idx]); + __v = __gen_input(__in_rng, __local_idx); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( - __sub_group, __v, __binary_op, __sub_group_carry, + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_idx + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __out_rng[__offset] = __v; + __write_op(__out_rng, __offset, __v); } } } @@ -263,43 +265,42 @@ template class __reduce_then_scan_scan_kernel; template + typename _GenReduceInput, typename _ReduceOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_reduce_submitter; template + typename _GenReduceInput, typename _ReduceOp, typename _InitType, typename... _KernelName> struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - _BinaryOperation, _UnaryOperation, _InitType, + _GenReduceInput, _ReduceOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer template auto - operator()(_ExecutionPolicy&& __exec, _InRng&& __in_rng, _TmpStorageAcc __tmp_storage, - const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, - const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, + _TmpStorageAcc __scratch_container, const sycl::event& __prior_event, + const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::size_t __block_num) const { - using _InValueType = oneapi::dpl::__internal::__value_t<_InRng>; + using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { - sycl::local_accessor<_InValueType> __sub_group_partials(__num_sub_groups_local, __cgh); + sycl::local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh); __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); + auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); __cgh.parallel_for<_KernelName...>(__nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( __sub_group_size)]] { - auto __id = __ndi.get_global_id(0); - auto __lid = __ndi.get_local_id(0); + auto __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); auto __g = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); auto __sub_group_id = __sub_group.get_group_linear_id(); auto __sub_group_local_id = __sub_group.get_local_linear_id(); - oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __sub_group_carry; + oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - if (__n <= __group_start_idx) - return; // exit early for empty groups (TODO: avoid launching these?) 
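                // Work-groups whose start index would fall past the end of the block are never
                // launched: the host rounds each block's nd_range up from the number of active
                // elements (__ele_in_block_round_up_workgroup in the submission loop), which is
                // why the early exit above is no longer required.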
std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -316,8 +317,9 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __scan_through_elements_helper<__sub_group_size, __is_inclusive, /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( - __sub_group, __unary_op, __binary_op, __sub_group_carry, __in_rng, nullptr, __start_idx, __n, - __inputs_per_item, __subgroup_start_idx, __sub_group_id, __active_subgroups); + __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, + __sub_group_carry, __in_rng, nullptr, __start_idx, __n, __inputs_per_item, __subgroup_start_idx, + __sub_group_id, __active_subgroups); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -342,25 +344,25 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu : (__active_subgroups - 1); // else is unused dummy value auto __v = __sub_group_partials[__load_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( - __sub_group, __v, __binary_op, __sub_group_carry, + __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups - __subgroup_start_idx); if (__sub_group_local_id < __active_subgroups) - __tmp_storage[__start_idx + __sub_group_local_id] = __v; + __temp_ptr[__start_idx + __sub_group_local_id] = __v; } else { //need to pull out first iteration tp avoid identity auto __v = __sub_group_partials[__sub_group_local_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( - __sub_group, __v, __binary_op, __sub_group_carry); - __tmp_storage[__start_idx + __sub_group_local_id] = __v; + __sub_group, __v, __reduce_op, __sub_group_carry); + __temp_ptr[__start_idx + __sub_group_local_id] = __v; for (int __i = 1; __i < __iters - 1; __i++) { __v = __sub_group_partials[__i * __sub_group_size + __sub_group_local_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( - __sub_group, __v, __binary_op, __sub_group_carry); - __tmp_storage[__start_idx + __i * __sub_group_size + __sub_group_local_id] = __v; + __sub_group, __v, __reduce_op, __sub_group_carry); + __temp_ptr[__start_idx + __i * __sub_group_size + __sub_group_local_id] = __v; } // If we are past the input range, then the previous value of v is passed to the sub-group scan. // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. 
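The __sub_group_scan and __sub_group_scan_partial helpers invoked throughout these hunks rely on sycl::shift_group_right, hence the trivially-copyable requirement noted earlier in this patch. For reference, a minimal standalone sketch of that style of sub-group inclusive scan is below; it assumes a device that supports a sub-group size of 32 and omits the carry-in, exclusive-result, and partial-sub-group masking variants the real helpers provide:

    #include <sycl/sycl.hpp>
    #include <iostream>
    #include <vector>

    // Shift-based (Kogge-Stone style) inclusive scan within a single sub-group.
    int main()
    {
        constexpr std::uint32_t sub_group_size = 32;
        sycl::queue q;
        std::vector<int> data(sub_group_size, 1);
        {
            sycl::buffer<int> buf(data.data(), sycl::range<1>{data.size()});
            q.submit([&](sycl::handler& cgh) {
                sycl::accessor acc(buf, cgh, sycl::read_write);
                cgh.parallel_for(sycl::nd_range<1>{sycl::range<1>{sub_group_size}, sycl::range<1>{sub_group_size}},
                                 [=](sycl::nd_item<1> ndi) [[sycl::reqd_sub_group_size(sub_group_size)]] {
                                     auto sg = ndi.get_sub_group();
                                     int v = acc[ndi.get_global_id(0)];
                                     // Each step pulls the partial sum from the lane `shift`
                                     // positions to the left and accumulates it.
                                     for (std::uint32_t shift = 1; shift < sub_group_size; shift <<= 1)
                                     {
                                         int partial = sycl::shift_group_right(sg, v, shift);
                                         if (sg.get_local_linear_id() >= shift)
                                             v += partial;
                                     }
                                     acc[ndi.get_global_id(0)] = v; // lane i now holds the sum of lanes 0..i
                                 });
            });
        }
        for (int x : data)
            std::cout << x << ' '; // expected: 1 2 3 ... 32
        std::cout << '\n';
    }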
@@ -372,9 +374,9 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __v = __sub_group_partials[__load_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( - __sub_group, __v, __binary_op, __sub_group_carry, __num_sub_groups_local); + __sub_group, __v, __reduce_op, __sub_group_carry, __num_sub_groups_local); if (__proposed_idx < __num_sub_groups_local) - __tmp_storage[__start_idx + __proposed_idx] = __v; + __temp_ptr[__start_idx + __proposed_idx] = __v; } __sub_group_carry.__destroy(); @@ -384,47 +386,53 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu } // Constant parameters throughout all blocks - const sycl::nd_range<1> __nd_range; - const std::size_t __max_block_size; const std::size_t __num_sub_groups_local; const std::size_t __num_sub_groups_global; const std::size_t __num_work_items; const std::size_t __n; - const _BinaryOperation __binary_op; - const _UnaryOperation __unary_op; + const _GenReduceInput __gen_reduce_input; + const _ReduceOp __reduce_op; _InitType __init; - - // TODO: Add the mask functors here to generalize for scan-based algorithms }; template + typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, typename _ScanInputTransform, + typename _WriteOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_scan_submitter; template -struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - _BinaryOperation, _UnaryOperation, _InitType, - __internal::__optional_kernel_name<_KernelName...>> + typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, typename _ScanInputTransform, + typename _WriteOp, typename _InitType, typename... 
_KernelName> +struct __parallel_reduce_then_scan_scan_submitter< + __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, + _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { template auto - operator()(_ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc __tmp_storage, - const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, - const std::size_t __inputs_per_item, const std::size_t __block_num, const bool __is_full_block) const + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, + _TmpStorageAcc __scratch_container, const sycl::event& __prior_event, + const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::size_t __block_num) const { - using _InValueType = oneapi::dpl::__internal::__value_t<_InRng>; + std::size_t __elements_in_block = std::min(__n - __block_num * __max_block_size, std::size_t(__max_block_size)); + std::size_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( + __elements_in_block, __inputs_per_sub_group * __num_sub_groups_local); using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { - sycl::local_accessor<_InValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); + sycl::local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng); + auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); + auto __res_acc = __scratch_container.__get_result_acc(__cgh); + __cgh.parallel_for<_KernelName...>(__nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( __sub_group_size)]] { - auto __id = __ndi.get_global_id(0); + auto __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); + auto __res_ptr = + _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 1); auto __lid = __ndi.get_local_id(0); auto __g = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); @@ -433,18 +441,16 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs auto __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - if (__n <= __group_start_idx) - return; // exit early for empty groups (TODO: avoid launching these?) std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); - oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __carry_last; - oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __value; + oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; + oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __value; // propogate carry in from previous block - oneapi::dpl::__internal::__lazy_ctor_storage<_InValueType> __sub_group_carry; + oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; // on the first sub-group in a work-group (assuming S subgroups in a work-group): // 1. 
load S sub-group local carry pfix sums (T0..TS-1) to slm @@ -469,12 +475,12 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs for (; __i < __iters - 1; __i++) { __sub_group_partials[__i * __sub_group_size + __sub_group_local_id] = - __tmp_storage[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; + __tmp_ptr[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; } if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) { __sub_group_partials[__i * __sub_group_size + __sub_group_local_id] = - __tmp_storage[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; + __tmp_ptr[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; } // step 2) load 32, 64, 96, etc. work-group carry outs on every work-group; then @@ -496,27 +502,27 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) ? __proposed_idx : __subgroups_before_my_group - 1; - __value.__setup(__tmp_storage[__reduction_idx]); + __value.__setup(__tmp_ptr[__reduction_idx]); __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, - /*__init_present=*/false>(__sub_group, __value.__v, __binary_op, + /*__init_present=*/false>(__sub_group, __value.__v, __reduce_op, __carry_last, __remaining_elements); } else { // multiple iterations // first 1 full - __value.__setup(__tmp_storage[__num_sub_groups_local * __sub_group_local_id + __offset]); + __value.__setup(__tmp_ptr[__num_sub_groups_local * __sub_group_local_id + __offset]); __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( - __sub_group, __value.__v, __binary_op, __carry_last); + __sub_group, __value.__v, __reduce_op, __carry_last); // then some number of full iterations for (int __i = 1; __i < __pre_carry_iters - 1; __i++) { auto __reduction_idx = __i * __num_sub_groups_local * __sub_group_size + __num_sub_groups_local * __sub_group_local_id + __offset; - __value.__v = __tmp_storage[__reduction_idx]; + __value.__v = __tmp_ptr[__reduction_idx]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( - __sub_group, __value.__v, __binary_op, __carry_last); + __sub_group, __value.__v, __reduce_op, __carry_last); } // final partial iteration @@ -527,9 +533,9 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) ? 
__proposed_idx : __subgroups_before_my_group - 1; - __value.__v = __tmp_storage[__reduction_idx]; + __value.__v = __tmp_ptr[__reduction_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, - /*__init_present=*/true>(__sub_group, __value.__v, __binary_op, + /*__init_present=*/true>(__sub_group, __value.__v, __reduce_op, __carry_last, __remaining_elements); } } @@ -552,13 +558,13 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs for (; __i < __iters - 1; ++__i) { __sub_group_partials[__carry_offset + __sub_group_local_id] = - __binary_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); __carry_offset += __sub_group_size; } if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) { __sub_group_partials[__carry_offset + __sub_group_local_id] = - __binary_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); __carry_offset += __sub_group_size; } if (__sub_group_local_id == 0) @@ -577,18 +583,19 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs if (__sub_group_id > 0) { auto __value = __sub_group_partials[__sub_group_id - 1]; - oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __binary_op); + oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } else if (__g > 0) { auto __value = __sub_group_partials[__active_subgroups]; - oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __binary_op); + oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } else { - if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) + if constexpr (std::is_same_v<_InitType, + oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) { // This is the only case where we still don't have a carry in. No init value, 0th block, // group, and subgroup. This changes the final scan through elements below. @@ -604,109 +611,96 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs { if (__sub_group_id > 0) { - if constexpr (__is_inclusive) - __sub_group_carry.__setup(__binary_op(__out_rng[__block_num * __max_block_size - 1], - __sub_group_partials[__sub_group_id - 1])); - else // The last block wrote an exclusive result, so we must make it inclusive. - { - // Grab the last element from the previous block that has been cached in temporary - // storage in the second kernel of the previous block. - _InValueType __last_block_element = __unary_op(__tmp_storage[__num_sub_groups_global]); - __sub_group_carry.__setup(__binary_op( - __binary_op(__out_rng[__block_num * __max_block_size - 1], __last_block_element), - __sub_group_partials[__sub_group_id - 1])); - } + __sub_group_carry.__setup( + __reduce_op(__tmp_ptr[__num_sub_groups_global], __sub_group_partials[__sub_group_id - 1])); } else if (__g > 0) { - if constexpr (__is_inclusive) - __sub_group_carry.__setup(__binary_op(__out_rng[__block_num * __max_block_size - 1], - __sub_group_partials[__active_subgroups])); - else // The last block wrote an exclusive result, so we must make it inclusive. 
- { - // Grab the last element from the previous block that has been cached in temporary - // storage in the second kernel of the previous block. - _InValueType __last_block_element = __unary_op(__tmp_storage[__num_sub_groups_global]); - __sub_group_carry.__setup(__binary_op( - __binary_op(__out_rng[__block_num * __max_block_size - 1], __last_block_element), - __sub_group_partials[__active_subgroups])); - } + __sub_group_carry.__setup( + __reduce_op(__tmp_ptr[__num_sub_groups_global], __sub_group_partials[__active_subgroups])); } else { - if constexpr (__is_inclusive) - __sub_group_carry.__setup(__out_rng[__block_num * __max_block_size - 1]); - else // The last block wrote an exclusive result, so we must make it inclusive. - { - // Grab the last element from the previous block that has been cached in temporary - // storage in the second kernel of the previous block. - _InValueType __last_block_element = __unary_op(__tmp_storage[__num_sub_groups_global]); - __sub_group_carry.__setup( - __binary_op(__out_rng[__block_num * __max_block_size - 1], __last_block_element)); - } - } - } - // For the exclusive scan case: - // Have the last item in the group store the last element - // in the block to temporary storage for use in the next block. - // This is required to support in-place exclusive scans as the input values will be overwritten. - if constexpr (!__is_inclusive) - { - auto __global_id = __ndi.get_global_linear_id(); - if (__global_id == __num_work_items - 1) - { - std::size_t __last_idx_in_block = std::min(__n - 1, __max_block_size * (__block_num + 1) - 1); - __tmp_storage[__num_sub_groups_global] = __in_rng[__last_idx_in_block]; + __sub_group_carry.__setup(__tmp_ptr[__num_sub_groups_global]); } } // step 5) apply global carries - size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); - size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; + std::size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); + std::size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; if (__sub_group_carry_initialized) { __scan_through_elements_helper<__sub_group_size, __is_inclusive, /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( - __sub_group, __unary_op, __binary_op, __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, - __inputs_per_item, __subgroup_start_idx, __sub_group_id, __active_subgroups); - - __sub_group_carry.__destroy(); + __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, + __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, __inputs_per_item, + __subgroup_start_idx, __sub_group_id, __active_subgroups); } else // first group first block, no subgroup carry { __scan_through_elements_helper<__sub_group_size, __is_inclusive, /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( - __sub_group, __unary_op, __binary_op, __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, - __inputs_per_item, __subgroup_start_idx, __sub_group_id, __active_subgroups); + __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, + __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, __inputs_per_item, + __subgroup_start_idx, __sub_group_id, __active_subgroups); + } + //If within the last active group and subgroup of the block, use the 0th work item of the subgroup + // to write out the last carry out for either the return value or the next block + if (__sub_group_local_id == 0 && (__active_groups == __g 
+ 1) && + (__active_subgroups == __sub_group_id + 1)) + { + if (__block_num + 1 == __num_blocks) + { + __res_ptr[0] = __sub_group_carry.__v; + } + else + { + //capture the last carry out for the next block + __tmp_ptr[__num_sub_groups_global] = __sub_group_carry.__v; + } } + + __sub_group_carry.__destroy(); }); }); } - const sycl::nd_range<1> __nd_range; - const std::size_t __max_block_size; const std::size_t __num_sub_groups_local; const std::size_t __num_sub_groups_global; const std::size_t __num_work_items; + const std::size_t __num_blocks; const std::size_t __n; - const _BinaryOperation __binary_op; - const _UnaryOperation __unary_op; + const _GenReduceInput __gen_reduce_input; + const _ReduceOp __reduce_op; + const _GenScanInput __gen_scan_input; + const _ScanInputTransform __scan_input_transform; + const _WriteOp __write_op; _InitType __init; - - // TODO: Add the mask functors here to generalize for scan-based algorithms }; -template +// General scan-like algorithm helpers +// _GenReduceInput - a function which accepts the input range and index to generate the data needed by the main output +// used in the reduction operation (to calculate the global carries) +// _GenScanInput - a function which accepts the input range and index to generate the data needed by the final scan +// and write operations, for scan patterns +// _ScanInputTransform - a unary function applied to the ouput of `_GenScanInput` to extract the component used in the scan, but +// not the part only required for the final write operation +// _ReduceOp - a binary function which is used in the reduction and scan operations +// _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range +// and performs the final write to output operation +template auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOperation __binary_op, - _UnaryOperation __unary_op, + _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, + _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, + _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; @@ -739,28 +733,27 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ : std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __global_range = sycl::range<1>(__num_work_items); - const auto __local_range = sycl::range<1>(__work_group_size); - const auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); - // TODO: Use the trick in reduce to wrap in a shared_ptr with custom deleter to support asynchronous frees. 
- _ValueType* __tmp_storage = sycl::malloc_device<_ValueType>(__num_sub_groups_global + 1, __exec.queue()); + __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, + __num_sub_groups_global + 1}; // Reduce and scan step implementations using _ReduceSubmitter = __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _BinaryOperation, _UnaryOperation, _InitType, _ReduceKernel>; + _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _BinaryOperation, _UnaryOperation, _InitType, _ScanKernel>; + _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, + _WriteOp, _InitType, _ScanKernel>; // TODO: remove below before merging. used for convenience now // clang-format off - _ReduceSubmitter __reduce_submitter{__kernel_nd_range, __max_inputs_per_block, __num_sub_groups_local, - __num_sub_groups_global, __num_work_items, __n, __binary_op, __unary_op, __init}; - _ScanSubmitter __scan_submitter{__kernel_nd_range, __max_inputs_per_block, __num_sub_groups_local, - __num_sub_groups_global, __num_work_items, __n, __binary_op, __unary_op, __init}; + _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, + __num_sub_groups_global, __num_work_items, __n, __gen_reduce_input, __reduce_op, __init}; + _ScanSubmitter __scan_submitter{__max_inputs_per_block, __num_sub_groups_local, + __num_sub_groups_global, __num_work_items, __num_blocks, __n, __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_op, __init}; // clang-format on sycl::event __event; @@ -768,13 +761,19 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // with sufficiently large L2 / L3 caches. for (std::size_t __b = 0; __b < __num_blocks; ++__b) { - bool __is_full_block = __inputs_per_item == __max_inputs_per_item; + auto __elements_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( + std::min(__num_remaining, __max_inputs_per_block), __inputs_per_item); + auto __ele_in_block_round_up_workgroup = + oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_block, __work_group_size) * __work_group_size; + auto __global_range = sycl::range<1>(__ele_in_block_round_up_workgroup); + auto __local_range = sycl::range<1>(__work_group_size); + auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __in_rng, __tmp_storage, __event, __inputs_per_sub_group, - __inputs_per_item, __b, __is_full_block); + __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __in_rng, __out_rng, __tmp_storage, __event, __inputs_per_sub_group, - __inputs_per_item, __b, __is_full_block); + __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); if (__num_remaining > __block_size) { // Resize for the next block. 
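For a sense of the magnitudes this submission loop works with, the sketch below replays its sizing arithmetic on the host. A work-group size of 1024 is assumed purely for illustration (the real code queries the device maximum), and the shrink of the trailing block is a simplified reconstruction of the same idea:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    int main()
    {
        constexpr std::size_t sub_group_size = 32;
        constexpr std::size_t max_inputs_per_item = 128;
        constexpr std::size_t num_work_groups = 128;
        constexpr std::size_t work_group_size = 1024; // assumed for this example

        const std::size_t num_sub_groups_local = work_group_size / sub_group_size;        // 32
        const std::size_t num_sub_groups_global = num_sub_groups_local * num_work_groups; // 4096
        const std::size_t max_inputs_per_block =
            work_group_size * max_inputs_per_item * num_work_groups;                      // 16,777,216

        auto ceil_div = [](std::size_t a, std::size_t b) { return (a + b - 1) / b; };
        auto bit_ceil = [](std::size_t x) { std::size_t p = 1; while (p < x) p <<= 1; return p; };

        const std::size_t n = 20'000'000; // example size spanning two blocks
        const std::size_t block_size = std::min(n, max_inputs_per_block);
        const std::size_t num_blocks = n / block_size + (n % block_size != 0);

        std::size_t remaining = n;
        std::size_t inputs_per_sub_group =
            n >= max_inputs_per_block
                ? max_inputs_per_block / num_sub_groups_global
                : std::max(sub_group_size, bit_ceil(remaining) / num_sub_groups_global);
        std::size_t inputs_per_item = inputs_per_sub_group / sub_group_size;

        for (std::size_t b = 0; b < num_blocks; ++b)
        {
            const std::size_t work_items_needed =
                ceil_div(std::min(remaining, max_inputs_per_block), inputs_per_item);
            const std::size_t global_range = ceil_div(work_items_needed, work_group_size) * work_group_size;
            std::cout << "block " << b << ": global range " << global_range << " work items, "
                      << inputs_per_item << " inputs per item\n";
            if (remaining > block_size)
            {
                remaining -= block_size;
                // a small final block shrinks its per-item workload instead of launching idle items
                inputs_per_sub_group =
                    remaining >= max_inputs_per_block
                        ? max_inputs_per_block / num_sub_groups_global
                        : std::max(sub_group_size, bit_ceil(remaining) / num_sub_groups_global);
                inputs_per_item = inputs_per_sub_group / sub_group_size;
            }
        }
    }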
@@ -789,10 +788,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __inputs_per_item = __inputs_per_sub_group / __sub_group_size; } } - // TODO: Remove to make asynchronous. Depends on completing async USM free TODO. - __event.wait(); - sycl::free(__tmp_storage, __exec.queue()); - return __future(__event); + return __future(__event, __result_and_scratch); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index c7d46dd2057..663a7edfa47 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -507,6 +507,7 @@ struct __usm_or_buffer_accessor template struct __result_and_scratch_storage { + using __value_type = _T; private: using __sycl_buffer_t = sycl::buffer<_T, 1>; From 38c1b192b2ef47f95dd33b199fcd6bc55ef88e5c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 18 Jul 2024 09:50:06 -0400 Subject: [PATCH 08/88] bug fix for global race on block carry-out Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 2979120ad1a..14d2f8b728a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -409,6 +409,19 @@ struct __parallel_reduce_then_scan_scan_submitter< __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { + + template + auto __get_block_carry_in(const std::size_t __block_num, _TmpPtr __tmp_ptr) const + { + return __tmp_ptr[__num_sub_groups_global + (__block_num % 2)]; + } + + template + void __set_block_carry_out(const std::size_t __block_num, _TmpPtr __tmp_ptr, const _ValueType __block_carry_out) const + { + __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; + } + template auto operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, @@ -432,7 +445,7 @@ struct __parallel_reduce_then_scan_scan_submitter< __sub_group_size)]] { auto __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); auto __res_ptr = - _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 1); + _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); auto __lid = __ndi.get_local_id(0); auto __g = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); @@ -612,16 +625,16 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_id > 0) { __sub_group_carry.__setup( - __reduce_op(__tmp_ptr[__num_sub_groups_global], __sub_group_partials[__sub_group_id - 1])); + __reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __sub_group_partials[__sub_group_id - 1])); } else if (__g > 0) { __sub_group_carry.__setup( - __reduce_op(__tmp_ptr[__num_sub_groups_global], __sub_group_partials[__active_subgroups])); + __reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __sub_group_partials[__active_subgroups])); } else { - 
__sub_group_carry.__setup(__tmp_ptr[__num_sub_groups_global]); + __sub_group_carry.__setup(__get_block_carry_in(__block_num, __tmp_ptr)); } } @@ -659,7 +672,7 @@ struct __parallel_reduce_then_scan_scan_submitter< else { //capture the last carry out for the next block - __tmp_ptr[__num_sub_groups_global] = __sub_group_carry.__v; + __set_block_carry_out(__block_num, __tmp_ptr, __sub_group_carry.__v); } } @@ -736,8 +749,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); + //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the + // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing + // the block carry-out within a single kernel. __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, - __num_sub_groups_global + 1}; + __num_sub_groups_global + 2}; // Reduce and scan step implementations using _ReduceSubmitter = From 72d42c21af99446475408b3a04b1c4bb6fe8d3e1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 18 Jul 2024 12:53:02 -0400 Subject: [PATCH 09/88] bugfix for elements to process in partial subgroup scan Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 14d2f8b728a..b553f6cfe8c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -345,7 +345,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu auto __v = __sub_group_partials[__load_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry, - __active_subgroups - __subgroup_start_idx); + __active_subgroups); if (__sub_group_local_id < __active_subgroups) __temp_ptr[__start_idx + __sub_group_local_id] = __v; } @@ -374,7 +374,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __v = __sub_group_partials[__load_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( - __sub_group, __v, __reduce_op, __sub_group_carry, __num_sub_groups_local); + __sub_group, __v, __reduce_op, __sub_group_carry, + __active_subgroups - ((__iters - 1) * __sub_group_size)); if (__proposed_idx < __num_sub_groups_local) __temp_ptr[__start_idx + __proposed_idx] = __v; } From ecce124e7473a777bd9388aa01c81e7551f19081 Mon Sep 17 00:00:00 2001 From: Adam Fidel <110841220+adamfidel@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:17:30 -0700 Subject: [PATCH 10/88] [PROTOTYPE] Add unused temporary storage to single work-group scan to fix use-after free error (#1712) --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +++++------- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7027f792b40..3d7a45950fd 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -694,7 +694,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__scan_single_wg_kernel< ::std::integral_constant<::std::uint16_t, __wg_size>, ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, _BinaryOperation, - /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()( + /* _IsFullGroup= */ ::std::false_type, _Inclusive, _TempStorage, _CustomName>>>()( ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); return __future(__event, __dummy_result_and_scratch); @@ -873,11 +873,10 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen } } oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; - return __future(__parallel_transform_reduce_then_scan( + return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}) - .event()); + oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}); } else { @@ -890,7 +889,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; - return __future( + return __parallel_transform_scan_base( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __binary_op, __init, @@ -903,8 +902,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{ __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}) - .event()); + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index b553f6cfe8c..6dad5c1f623 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -753,7 +753,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing // the block carry-out within a single kernel. 
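In other words, the fix double-buffers the block carry: each block reads the carry produced by the previous block from one of the two slots past the per-sub-group partials and writes its own carry to the other slot, alternating on block parity, so a read and a write never target the same element within one kernel. A stand-alone sketch of the indexing used by __get_block_carry_in / __set_block_carry_out above:

    #include <cstddef>

    // Slot that block `block_num` reads its carry-in from.
    std::size_t block_carry_in_slot(std::size_t num_sub_groups_global, std::size_t block_num)
    {
        return num_sub_groups_global + (block_num % 2);
    }

    // Slot that block `block_num` writes its carry-out to (always the other slot).
    std::size_t block_carry_out_slot(std::size_t num_sub_groups_global, std::size_t block_num)
    {
        return num_sub_groups_global + 1 - (block_num % 2);
    }

    // Block 0 reads slot N and writes slot N + 1; block 1 reads slot N + 1 and writes slot N; etc.
    // This is why the scratch storage grows from N + 1 to N + 2 elements below.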
- __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, + __result_and_scratch_storage, _ValueType> __result_and_scratch{__exec, __num_sub_groups_global + 2}; // Reduce and scan step implementations From 39ebdbec78467e870596455c81ede8da32dc13d6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 18 Jul 2024 19:17:03 -0500 Subject: [PATCH 11/88] Add temporary work-group size cap for FPGA_EMU testing Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 6dad5c1f623..36558912edf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -731,7 +731,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough // that no check is needed? - const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + // TODO: This min call is temporary until PR #1683 is merged. + const std::size_t __work_group_size = std::min(std::size_t(8192), oneapi::dpl::__internal::__max_work_group_size(__exec)); // TODO: base on max compute units. Recall disconnect in vendor definitions (# SMs vs. # XVEs) const std::size_t __num_work_groups = 128; From e4e30e101e184b495b1c2991bed34afda3ea8827 Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:44:22 -0500 Subject: [PATCH 12/88] [PROTOTYPE] Resolve conversion issues between internal tuple and std::tuple in zip_iterator.pass (#1714) * Fix for zip_iterator.pass in copy_if assignment Signed-off-by: Matthew Michel * Add similar fix to __simple_write_to_idx Signed-off-by: Matthew Michel --------- Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3d7a45950fd..c7316ffd947 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -786,7 +786,12 @@ struct __simple_write_to_idx void operator()(_OutRng&& __out, std::size_t __idx, const ValueType& __v) const { - __out[__idx] = __v; + // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our + // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. + using _ConvertedTupleType = + typename oneapi::dpl::__internal::__get_tuple_type, + std::decay_t>::__type; + __out[__idx] = static_cast<_ConvertedTupleType>(__v); } }; @@ -837,8 +842,13 @@ struct __write_to_idx_if void operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const { + // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our + // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
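The conversion issue the comment above describes can be reproduced outside oneDPL with any target type that is only explicitly constructible from std::tuple; the type below is a simplified stand-in, not oneDPL's internal tuple:

    #include <tuple>

    // Stand-in for an internal tuple type that is only *explicitly* constructible from std::tuple.
    struct internal_pair
    {
        int first;
        float second;
        explicit internal_pair(const std::tuple<int, float>& t)
            : first(std::get<0>(t)), second(std::get<1>(t)) {}
    };

    int main()
    {
        internal_pair out{std::tuple<int, float>{0, 0.f}};
        std::tuple<int, float> v{1, 2.f};
        // out = v;                          // ill-formed: the converting constructor is explicit
        out = static_cast<internal_pair>(v); // the explicit cast, as in the patch, resolves it
        return 0;
    }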
+ using _ConvertedTupleType = + typename oneapi::dpl::__internal::__get_tuple_type(__v))>, + std::decay_t>::__type; if (std::get<1>(__v)) - __out[std::get<0>(__v) - 1] = std::get<2>(__v); + __out[std::get<0>(__v) - 1] = static_cast<_ConvertedTupleType>(std::get<2>(__v)); } }; From 3732c124dbea7925ab6c3d7979c387635374d7a3 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Mon, 22 Jul 2024 09:21:45 -0700 Subject: [PATCH 13/88] Use __dpl_sycl::__local_accessor --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 36558912edf..7e87bd344a5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -285,7 +285,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { - sycl::local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh); + __dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh); __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); @@ -435,7 +435,7 @@ struct __parallel_reduce_then_scan_scan_submitter< __elements_in_block, __inputs_per_sub_group * __num_sub_groups_local); using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { - sycl::local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); + __dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng); auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); From 1745e0c193dbe8ae0d51539c32fd9e408fb8311f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 22 Jul 2024 18:01:12 -0400 Subject: [PATCH 14/88] bugfix for overruning input for small non multiples of subgroup size Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 7e87bd344a5..94027a31eae 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -210,7 +210,8 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp if (__iters == 1) { - auto __v = __gen_input(__in_rng, __start_idx); + auto __local_idx = (__start_idx < __n) ? 
__start_idx : __n - 1; + auto __v = __gen_input(__in_rng, __local_idx); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_idx); From 0921941d90e2cf869187e59a1ecfae738b8a453b Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 23 Jul 2024 11:29:10 -0700 Subject: [PATCH 15/88] Check if a subgroup is active before fetching its carry and grab the last active subgroup otherwise Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 94027a31eae..d216e8e36ac 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -597,7 +597,9 @@ struct __parallel_reduce_then_scan_scan_submitter< { if (__sub_group_id > 0) { - auto __value = __sub_group_partials[__sub_group_id - 1]; + auto __value = __sub_group_id - 1 < __active_subgroups + ? __sub_group_partials[__sub_group_id - 1] + : __sub_group_partials[__active_subgroups - 1]; oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } @@ -626,8 +628,10 @@ struct __parallel_reduce_then_scan_scan_submitter< { if (__sub_group_id > 0) { - __sub_group_carry.__setup( - __reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __sub_group_partials[__sub_group_id - 1])); + auto __value = __sub_group_id - 1 < __active_subgroups + ? __sub_group_partials[__sub_group_id - 1] + : __sub_group_partials[__active_subgroups - 1]; + __sub_group_carry.__setup(__reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __value)); } else if (__g > 0) { From 8effa033c5b353f0965f75a8cdbcc719eebca793 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 23 Jul 2024 11:48:12 -0700 Subject: [PATCH 16/88] Comment out std::complex tests in scan_by_segment tests Signed-off-by: Matthew Michel --- .../numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp | 4 ++++ .../numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp index a856f83afc7..51b02c0c43e 100644 --- a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp @@ -254,6 +254,9 @@ main() #endif // TEST_DPCPP_BACKEND_PRESENT } + // TODO: Investigate why -fno-fast-math flag causes failures in icpx 2024.2.0 + CPU + Release build with these tests. Additionally, verify + // if we should stop testing with std::complex as it is not officially supported in SYCL kernels. 
+#if 0 { using ValueType = MatrixPoint; using BinaryPredicate = UserBinaryPredicate; @@ -274,6 +277,7 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } +#endif return TestUtils::done(); } diff --git a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp index 1e955a729c3..c52c47feee0 100644 --- a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp @@ -221,7 +221,9 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } - + // TODO: Investigate why -fno-fast-math flag causes failures in icpx 2024.2.0 + CPU + Release build with these tests. Additionally, verify + // if we should stop testing with std::complex as it is not officially supported in SYCL kernels. +#if 0 { using ValueType = MatrixPoint; using BinaryPredicate = UserBinaryPredicate; @@ -242,6 +244,7 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } +#endif return TestUtils::done(); } From c22231aaee3940e58dc3792f34eae72e81abe74c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 24 Jul 2024 14:46:41 -0400 Subject: [PATCH 17/88] renaming __out as it seems to be a keyword --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c7316ffd947..5200eb6f5d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -784,14 +784,14 @@ struct __simple_write_to_idx { template void - operator()(_OutRng&& __out, std::size_t __idx, const ValueType& __v) const + operator()(_OutRng&& __out_rng, std::size_t __idx, const ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type, - std::decay_t>::__type; - __out[__idx] = static_cast<_ConvertedTupleType>(__v); + std::decay_t>::__type; + __out_rng[__idx] = static_cast<_ConvertedTupleType>(__v); } }; @@ -840,15 +840,15 @@ struct __write_to_idx_if { template void - operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const + operator()(_OutRng&& __out_rng, _SizeType __idx, const ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, - std::decay_t>::__type; + std::decay_t>::__type; if (std::get<1>(__v)) - __out[std::get<0>(__v) - 1] = static_cast<_ConvertedTupleType>(std::get<2>(__v)); + __out_rng[std::get<0>(__v) - 1] = static_cast<_ConvertedTupleType>(std::get<2>(__v)); } }; From 598f56997cbfc7cb1117f29891aa42655f6028ad Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 29 Jul 2024 12:36:54 -0400 Subject: [PATCH 18/88] fixing device copyable for helpers Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 29 +++++++++++++++++++ .../device_copyable.pass.cpp | 29 +++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 2144c454864..a2bfef10fb0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -233,11 +233,40 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__internal:: namespace oneapi::dpl::__par_backend_hetero { +template +struct __gen_transform_input; + +template +struct __gen_count_pred; + +template +struct __gen_expand_count_pred; + template struct __early_exit_find_or; } // namespace oneapi::dpl::__par_backend_hetero +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_transform_input, + _UnaryOp)> + : oneapi::dpl::__internal::__are_all_device_copyable<_UnaryOp> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_pred, _Predicate)> + : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred, + _Predicate)> + : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, _ExecutionPolicy, _Pred)> diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index f97a23de365..c282e3e8789 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -145,6 +145,20 @@ test_device_copyable() oneapi::dpl::unseq_backend::__brick_reduce_idx>, "__brick_reduce_idx is not device copyable with device copyable types"); + //__gen_transform_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_transform_input is not device copyable with device copyable types"); + + //__gen_count_pred + static_assert(sycl::is_device_copyable_v>, + "__gen_count_pred is not device copyable with device copyable types"); + + //__gen_expand_count_pred + static_assert( + sycl::is_device_copyable_v>, + "__gen_expand_count_pred is not device copyable with device copyable types"); + // __early_exit_find_or static_assert( sycl::is_device_copyable_v< @@ -343,6 +357,21 @@ test_non_device_copyable() oneapi::dpl::unseq_backend::__brick_reduce_idx>, "__brick_reduce_idx is device copyable with non device copyable types"); + // //__gen_transform_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_transform_input is device copyable with non device copyable types"); + + //__gen_count_pred + static_assert( + !sycl::is_device_copyable_v>, + "__gen_count_pred is device 
copyable with non device copyable types"); + + //__gen_expand_count_pred + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred>, + "__gen_expand_count_pred is device copyable with non device copyable types"); + // __early_exit_find_or static_assert( !sycl::is_device_copyable_v Date: Wed, 31 Jul 2024 10:30:47 -0500 Subject: [PATCH 19/88] Remove commented code that remained after rebase Signed-off-by: Matthew Michel --- .../numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp | 2 -- .../numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp | 4 ---- 2 files changed, 6 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp index 51b02c0c43e..ea41167a55f 100644 --- a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp @@ -256,7 +256,6 @@ main() // TODO: Investigate why -fno-fast-math flag causes failures in icpx 2024.2.0 + CPU + Release build with these tests. Additionally, verify // if we should stop testing with std::complex as it is not officially supported in SYCL kernels. -#if 0 { using ValueType = MatrixPoint; using BinaryPredicate = UserBinaryPredicate; @@ -277,7 +276,6 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } -#endif return TestUtils::done(); } diff --git a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp index c52c47feee0..344f4f93834 100644 --- a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp @@ -221,9 +221,6 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } - // TODO: Investigate why -fno-fast-math flag causes failures in icpx 2024.2.0 + CPU + Release build with these tests. Additionally, verify - // if we should stop testing with std::complex as it is not officially supported in SYCL kernels. 
-#if 0 { using ValueType = MatrixPoint; using BinaryPredicate = UserBinaryPredicate; @@ -244,7 +241,6 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } -#endif return TestUtils::done(); } From 8f759a3c670c39c0644792e6e383310e980417b2 Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:58:55 -0500 Subject: [PATCH 20/88] [PROTOTYPE] Add fallback to legacy scan implementation for CPU devices and devices that lack size 32 sub-groups (#1749) Signed-off-by: Matthew Michel Signed-off-by: Dan Hoeflinger Co-authored-by: Adam Fidel Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> Co-authored-by: Adam Fidel <110841220+adamfidel@users.noreply.github.com> Co-authored-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 51 +++++++++++++++---- .../dpcpp/parallel_backend_sycl_utils.h | 10 ++++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5200eb6f5d8..dfaa802f8b6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -882,13 +882,17 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}); + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + // Reduce-then-scan performs poorly on CPUs due to sub-group operations. 
+ if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + { + oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, + oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}); + } } - else { using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; @@ -899,8 +903,13 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; + // Although we do not actually need result storage in this case, we need to construct + // a placeholder here to match the return type of reduce-then-scan + using _TempStorage = __result_and_scratch_storage, _Type>; + _TempStorage __dummy_result_and_scratch{__exec, 0}; + return - __parallel_transform_scan_base( + __future(__parallel_transform_scan_base( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __binary_op, __init, // local scan @@ -912,7 +921,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{ __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(), + __dummy_result_and_scratch); } } @@ -1021,7 +1031,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, constexpr ::std::uint16_t __single_group_upper_limit = 2048; - ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) @@ -1032,9 +1043,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __pred); } - else + // Reduce-then-scan performs poorly on CPUs due to sub-group operations. 
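The device check introduced above (and reused for copy_if below) reduces to one SYCL device query plus the CPU test. A stand-alone sketch of that heuristic; the free-function name is chosen for illustration only:

    #include <algorithm>
    #include <cstddef>
    #include <sycl/sycl.hpp>

    // Prefer the reduce-then-scan path only on non-CPU devices that support sub-groups of size 32.
    bool prefer_reduce_then_scan(const sycl::queue& q)
    {
        const auto sizes = q.get_device().get_info<sycl::info::device::sub_group_sizes>();
        const bool has_sg32 = std::find(sizes.begin(), sizes.end(), std::size_t{32}) != sizes.end();
        return !q.get_device().is_cpu() && has_sg32;
    }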
+ else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) { - using _ReduceOp = ::std::plus<_Size>; + using _ReduceOp = std::plus<_Size>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), @@ -1045,6 +1057,23 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}); } + else + { + using _ReduceOp = std::plus<_Size>; + using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; + using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ std::true_type, 1>; + // Although we do not actually need result storage in this case, we need to construct + // a placeholder here to match the return type of reduce-then-scan + using _TempStorage = __result_and_scratch_storage, _Size>; + _TempStorage __dummy_result_and_scratch{__exec, 0}; + + return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + CreateOp{__pred}, CopyOp{}) + .event(), + __dummy_result_and_scratch); + } } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 663a7edfa47..340b5fda68b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "../../iterator_impl.h" @@ -814,6 +815,15 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +template +bool +__supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size) +{ + const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); + return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != + __subgroup_sizes.end(); +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From 6da54e786c3af50817aaf2cc10861451f8b6b8be Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:03:29 -0400 Subject: [PATCH 21/88] [PROTOTYPE] partition, unique families and ranges API (#1708) Enabling partition and unique family of scan-like algorithms includes ranges API Making legacy scan user `__result_and_scratch_storage` to match future type for return to compile Refactoring of `__pattern` / `__parallel` structure for scan-like algorithms for consistency --------- Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 80 +++--- .../hetero/algorithm_ranges_impl_hetero.h | 92 ++---- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 267 +++++++++++++----- .../parallel_backend_sycl_reduce_then_scan.h | 88 ++++-- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 54 +++- .../pstl/hetero/numeric_ranges_impl_hetero.h | 34 +-- .../device_copyable.pass.cpp | 67 ++++- test/support/utils_device_copyable.h | 61 ++++ 8 files changed, 489 insertions(+), 254 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 4334fb1d51c..ff1dd010011 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ 
b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -886,33 +886,6 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat // copy_if //------------------------------------------------------------------------ -template -::std::pair<_IteratorOrTuple, typename ::std::iterator_traits<_Iterator1>::difference_type> -__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 __last, - _IteratorOrTuple __output_first, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) -{ - using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - - if (__first == __last) - return ::std::make_pair(__output_first, _It1DifferenceType{0}); - - _It1DifferenceType __n = __last - __first; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); - auto __buf1 = __keep1(__first, __last); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _IteratorOrTuple>(); - auto __buf2 = __keep2(__output_first, __output_first + __n); - - auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __create_mask_op, - __copy_by_mask_op); - - ::std::size_t __num_copied = __res.get(); - return ::std::make_pair(__output_first + __n, __num_copied); -} - template _Iterator2 @@ -952,19 +925,24 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e return ::std::make_pair(__result1, __result2); using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - using _ReduceOp = ::std::plus<_It1DifferenceType>; - unseq_backend::__create_mask<_UnaryPredicate, _It1DifferenceType> __create_mask_op{__pred}; - unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __copy_by_mask_op{_ReduceOp{}}; + _It1DifferenceType __n = __last - __first; - auto __result = __pattern_scan_copy( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __par_backend_hetero::zip( - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)), - __create_mask_op, __copy_by_mask_op); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + + auto __zipped_res = __par_backend_hetero::zip( + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)); - return ::std::make_pair(__result1 + __result.second, __result2 + (__last - __first - __result.second)); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); + auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); + + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + + return std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); } //------------------------------------------------------------------------ @@ -978,16 +956,28 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _Iterator2 
__result_first, _BinaryPredicate __pred) { using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; - unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ ::std::true_type, 1> - __copy_by_mask_op; - __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ - __not_pred<_BinaryPredicate>{__pred}}; - auto __result = __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __result_first, __create_mask_op, __copy_by_mask_op); + _It1DifferenceType __n = __last - __first; + + if (__n == 0) + return __result_first; + if (__n == 1) + { + oneapi::dpl::__internal::__pattern_walk2_brick( + __hetero_tag<_BackendTag>{}, std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, + oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}); + return __result_first + 1; + } + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); + auto __buf2 = __keep2(__result_first, __result_first + __n); + + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); - return __result_first + __result.second; + return __result_first + __result.get(); } template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 1df37f8acbc..e301863bb2d 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -334,67 +334,21 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ // copy_if //------------------------------------------------------------------------ -template -oneapi::dpl::__internal::__difference_t<_Range1> -__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) -{ - if (__rng1.size() == 0) - return __rng1.size(); - - using _SizeType = decltype(__rng1.size()); - using _ReduceOp = ::std::plus<_SizeType>; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _MaskAssigner = unseq_backend::__mask_assigner<1>; - using _InitType = unseq_backend::__no_init_value<_SizeType>; - using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _ReduceOp __reduce_op; - _DataAcc __get_data_op; - _MaskAssigner __add_mask_op; - - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __rng1.size()); - - auto __res = - __par_backend_hetero::__parallel_transform_scan_base( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::zip_view( - __rng1, oneapi::dpl::__ranges::all_view( - __mask_buf.get_buffer())), - __rng2, __reduce_op, _InitType{}, - // local scan - unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, - __add_mask_op, __create_mask_op}, - // scan between groups - unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, - __get_data_op}, - // global scan - 
__copy_by_mask_op) - .get(); - - return __res; -} - template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Predicate __pred, _Assign) + _Predicate __pred, _Assign&& __assign) { - using _SizeType = decltype(__rng1.size()); - using _ReduceOp = ::std::plus<_SizeType>; + auto __n = __rng1.size(); + if (__n == 0) + return 0; - unseq_backend::__create_mask<_Predicate, _SizeType> __create_mask_op{__pred}; - unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ ::std::true_type, 1> __copy_by_mask_op; + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __pred, std::forward<_Assign>(__assign)); - return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - __create_mask_op, __copy_by_mask_op); + return __res.get(); //is a blocking call } //------------------------------------------------------------------------ @@ -433,17 +387,27 @@ template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, - _BinaryPredicate __pred, _Assign) + _BinaryPredicate __pred, _Assign&& __assign) { - using _It1DifferenceType = oneapi::dpl::__internal::__difference_t<_Range1>; - unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, _Assign, /*inclusive*/ ::std::true_type, 1> - __copy_by_mask_op; - __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ - __not_pred<_BinaryPredicate>{__pred}}; - - return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng), ::std::forward<_Range2>(__result), - __create_mask_op, __copy_by_mask_op); + auto __n = __rng.size(); + if (__n == 0) + return 0; + if (__n == 1) + { + using CopyBrick = oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>; + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), + std::forward<_Range2>(__result)) + .get(); + + return 1; + } + + return oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __pred, std::forward<_Assign>(__assign)) + .get(); } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index dfaa802f8b6..00c0602bdba 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -512,8 +512,8 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem } } - __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, - __lacc_ptr, __bin_op, __init); + __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, __lacc_ptr, + __bin_op, __init); if constexpr (__can_use_subgroup_load_store) { @@ -555,10 +555,10 @@ struct 
__parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __internal::__optional_kernel_name<_ScanKernelName...>> { template + typename _UnaryOp, typename _Assign> auto operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, - _BinaryOperation __bin_op, _UnaryOp __unary_op) + _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) { using _ValueType = ::std::uint16_t; @@ -617,12 +617,13 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __scan_work_group<_ValueType, /* _Inclusive */ false>( __group, __lacc_ptr, __lacc_ptr + __elems_per_wg, __lacc_ptr + __elems_per_wg, __bin_op, - __init); + __init); for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize) { if (__lacc[__idx]) - __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); + __assign(static_cast<__tuple_type>(__in_rng[__idx]), + __out_rng[__lacc[__idx + __elems_per_wg]]); } const ::std::uint16_t __residual = __n % _WGSize; @@ -631,7 +632,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W { auto __idx = __residual_start + __item_id; if (__lacc[__idx]) - __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); + __assign(static_cast<__tuple_type>(__in_rng[__idx]), + __out_rng[__lacc[__idx + __elems_per_wg]]); } if (__item_id == 0) @@ -774,7 +776,8 @@ struct __gen_transform_input operator()(InRng&& __in_rng, std::size_t __idx) const { using _ValueType = oneapi::dpl::__internal::__value_t; - using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t::type>; + using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t< + typename std::invoke_result<_UnaryOp, _ValueType>::type>; return _OutValueType{__unary_op(__in_rng[__idx])}; } _UnaryOp __unary_op; @@ -796,19 +799,44 @@ struct __simple_write_to_idx }; template -struct __gen_count_pred +struct __gen_mask +{ + template + bool + operator()(_InRng&& __in_rng, std::size_t __idx) const + { + return __pred(__in_rng[__idx]); + } + _Predicate __pred; +}; + +template +struct __gen_unique_mask +{ + template + bool + operator()(_InRng&& __in_rng, std::size_t __idx) const + { + //starting index is offset to 1 for "unique" patterns and 0th element copy is handled separately + return !__pred(__in_rng[__idx], __in_rng[__idx - 1]); + } + _BinaryPredicate __pred; +}; + +template +struct __gen_count_mask { template _SizeType operator()(_InRng&& __in_rng, _SizeType __idx) const { - return __pred(__in_rng[__idx]) ? _SizeType{1} : _SizeType{0}; + return __gen_mask(std::forward<_InRng>(__in_rng), __idx) ? _SizeType{1} : _SizeType{0}; } - _Predicate __pred; + _GenMask __gen_mask; }; -template -struct __gen_expand_count_pred +template +struct __gen_expand_count_mask { template auto @@ -820,10 +848,10 @@ struct __gen_expand_count_pred using _ElementType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t>; _ElementType ele = __in_rng[__idx]; - bool mask = __pred(ele); + bool mask = __gen_mask(__in_rng, __idx); return std::tuple(mask ? 
_SizeType{1} : _SizeType{0}, mask, ele); } - _Predicate __pred; + _GenMask __gen_mask; }; struct __get_zeroth_element @@ -835,7 +863,7 @@ struct __get_zeroth_element return std::get<0>(std::forward<_Tp>(__a)); } }; - +template struct __write_to_idx_if { template @@ -848,8 +876,27 @@ struct __write_to_idx_if typename oneapi::dpl::__internal::__get_tuple_type(__v))>, std::decay_t>::__type; if (std::get<1>(__v)) - __out_rng[std::get<0>(__v) - 1] = static_cast<_ConvertedTupleType>(std::get<2>(__v)); + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1 + __offset]); } + Assign __assign; +}; + +template +struct __write_to_idx_if_else +{ + template + void + operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const + { + using _ConvertedTupleType = + typename oneapi::dpl::__internal::__get_tuple_type(__v))>, + std::decay_t>::__type; + if (std::get<1>(__v)) + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<0>(__out[std::get<0>(__v) - 1])); + else + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<1>(__out[__idx - std::get<0>(__v)])); + } + Assign __assign; }; template (__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); - // Reduce-then-scan performs poorly on CPUs due to sub-group operations. - if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { - oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op}; + using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; + using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_idx; + + _GenInput __gen_transform{__unary_op}; + return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{}); + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, + /*_IsUniquePattern=*/std::false_type{}); } } { @@ -903,26 +954,19 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; - // Although we do not actually need result storage in this case, we need to construct - // a placeholder here to match the return type of reduce-then-scan - using _TempStorage = __result_and_scratch_storage, _Type>; - _TempStorage __dummy_result_and_scratch{__exec, 0}; - - return - __future(__parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, - _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, - __assign_op, __assign_op, __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}).event(), - 
__dummy_result_and_scratch); + return __parallel_transform_scan_base( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, + _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, + __assign_op, __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } } @@ -932,9 +976,11 @@ struct __invoke_single_group_copy_if // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred> + template auto - operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred) + operator()(_ExecutionPolicy&& __exec, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred, + _Assign&& __assign) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); @@ -953,7 +999,8 @@ struct __invoke_single_group_copy_if return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, _FullKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred)); + std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred), + std::forward<_Assign>(__assign)); } else { @@ -966,11 +1013,31 @@ struct __invoke_single_group_copy_if return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, _NonFullKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred)); + std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred), + std::forward<_Assign>(__assign)); } } }; +template +auto +__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, + _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) +{ + using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; + using _ReduceOp = std::plus<_Size>; + using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; + using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; + + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, + _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, + /*_Inclusive=*/std::true_type{}, __is_unique_pattern); +} + template auto @@ -978,7 +1045,7 @@ 
__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) { - using _ReduceOp = ::std::plus<_Size>; + using _ReduceOp = std::plus<_Size>; using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; using _MaskAssigner = unseq_backend::__mask_assigner<1>; @@ -995,7 +1062,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag return __parallel_transform_scan_base( __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view( + oneapi::dpl::__ranges::zip_view( ::std::forward<_InRng>(__in_rng), oneapi::dpl::__ranges::all_view( __mask_buf.get_buffer())), @@ -1012,10 +1079,72 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } -template +template +auto +__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred, + _Assign&& __assign = oneapi::dpl::__internal::__pstl_assign{}) +{ + + auto __n = __rng.size(); + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1, _Assign>; + + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, + /*_IsUniquePattern=*/std::true_type{}); + } + else + { + unseq_backend::__copy_by_mask, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ std::true_type, 1> + __copy_by_mask_op; + oneapi::dpl::__internal::__create_mask_unique_copy, + decltype(__n)> + __create_mask_op{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __copy_by_mask_op); + } +} + +template +auto +__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) +{ + auto __n = __rng.size(); + if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; + using _WriteOp = + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else; + + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); + } + else + { + using _ReduceOp = std::plus; + + unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; + unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type> __partition_by_mask{_ReduceOp{}}; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); + } +} + +template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - 
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred) + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign&& __assign = _Assign{}) { using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; @@ -1032,7 +1161,6 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, constexpr ::std::uint16_t __single_group_upper_limit = 2048; std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) @@ -1040,22 +1168,17 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), - ::std::forward<_OutRng>(__out_rng), __pred); + _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); } - // Reduce-then-scan performs poorly on CPUs due to sub-group operations. - else if (!__exec.queue().get_device().is_cpu() && __dev_has_sg32) + else if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) { - using _ReduceOp = std::plus<_Size>; + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>; - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), oneapi::dpl::__par_backend_hetero::__gen_count_pred<_Pred>{__pred}, - _ReduceOp{}, oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred<_Pred>{__pred}, - oneapi::dpl::__par_backend_hetero::__get_zeroth_element{}, - oneapi::dpl::__par_backend_hetero::__write_to_idx_if{}, - oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}); + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + _GenMask{__pred}, _WriteOp{}, /*Unique=*/std::false_type{}); } else { @@ -1063,16 +1186,10 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 1>; - // Although we do not actually need result storage in this case, we need to construct - // a placeholder here to match the return type of reduce-then-scan - using _TempStorage = __result_and_scratch_storage, _Size>; - _TempStorage __dummy_result_and_scratch{__exec, 0}; - - return __future(__parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - CreateOp{__pred}, CopyOp{}) - .event(), - __dummy_result_and_scratch); + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + CreateOp{__pred}, 
CopyOp{}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d216e8e36ac..8ac40cfa91a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -266,13 +266,15 @@ template class __reduce_then_scan_scan_kernel; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _InitType, + typename _KernelName> struct __parallel_reduce_then_scan_reduce_submitter; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _InitType, + typename... _KernelName> struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - _GenReduceInput, _ReduceOp, _InitType, + __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory @@ -302,7 +304,11 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - + if constexpr (__is_unique_pattern_v) + { + // for unique patterns, the first element is always copied to the output, so we need to skip it + __group_start_idx += 1; + } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = @@ -400,16 +406,16 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu }; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, + typename _ScanInputTransform, typename _WriteOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_scan_submitter; template + bool __is_unique_pattern_v, typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, + typename _ScanInputTransform, typename _WriteOp, typename _InitType, typename... 
_KernelName> struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> + __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, + _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { template @@ -456,6 +462,11 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + if constexpr (__is_unique_pattern_v) + { + // for unique patterns, the first element is always copied to the output, so we need to skip it + __group_start_idx += 1; + } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -609,8 +620,17 @@ struct __parallel_reduce_then_scan_scan_submitter< oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } - else + else // zeroth block, group and subgroup { + if constexpr (__is_unique_pattern_v) + { + if (__sub_group_local_id == 0) + { + // For unique patterns, always copy the 0th element to the output + __write_op.__assign(__in_rng[0], __out_rng[0]); + } + } + if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) { @@ -673,7 +693,14 @@ struct __parallel_reduce_then_scan_scan_submitter< { if (__block_num + 1 == __num_blocks) { - __res_ptr[0] = __sub_group_carry.__v; + if constexpr (__is_unique_pattern_v) + { + __res_ptr[0] = __sub_group_carry.__v + 1; + } + else + { + __res_ptr[0] = __sub_group_carry.__v; + } } else { @@ -681,7 +708,6 @@ struct __parallel_reduce_then_scan_scan_submitter< __set_block_carry_out(__block_num, __tmp_ptr, __sub_group_carry.__v); } } - __sub_group_carry.__destroy(); }); }); @@ -702,6 +728,16 @@ struct __parallel_reduce_then_scan_scan_submitter< _InitType __init; }; +// reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup +// operations. We do not want to run this can on CPU targets, as they are not performant with this algorithm. 
+template +bool +__prefer_reduce_then_scan(const _ExecutionPolicy& __exec) +{ + const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + return (!__exec.queue().get_device().is_cpu() && __dev_has_sg32); +} + // General scan-like algorithm helpers // _GenReduceInput - a function which accepts the input range and index to generate the data needed by the main output // used in the reduction operation (to calculate the global carries) @@ -714,13 +750,13 @@ struct __parallel_reduce_then_scan_scan_submitter< // and performs the final write to output operation template + typename _Inclusive, typename _IsUniquePattern> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, - _ScanInputTransform __scan_input_transform, _WriteOp __write_op, - _InitType __init /*TODO mask assigners for generalization go here*/, _Inclusive) + _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, + _Inclusive, _IsUniquePattern) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -733,6 +769,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Empirically determined maximum. May be less for non-full blocks. constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; + constexpr bool __is_unique_pattern_v = _IsUniquePattern::value; // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough // that no check is needed? @@ -747,14 +784,22 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::size_t __n = __in_rng.size(); const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __num_remaining = __n; + if constexpr (__is_unique_pattern_v) + { + // skip scan of zeroth element in unique patterns + __num_remaining -= 1; + } + // reduce_then_scan kernel is not built to handle "empty" scans which includes `__n == 1` for unique patterns. + // These trivial end cases should be handled at a higher level. + assert(__num_remaining > 0); auto __inputs_per_sub_group = - __n >= __max_inputs_per_block + __num_remaining >= __max_inputs_per_block ? __max_inputs_per_block / __num_sub_groups_global : std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __block_size = (__n < __max_inputs_per_block) ? __n : __max_inputs_per_block; - const auto __num_blocks = __n / __block_size + (__n % __block_size != 0); + const auto __block_size = (__num_remaining < __max_inputs_per_block) ? __num_remaining : __max_inputs_per_block; + const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. 
We need two for the block carry-out to prevent a race condition between reading and writing @@ -764,10 +809,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Reduce and scan step implementations using _ReduceSubmitter = - __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = - __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, _ScanKernel>; // TODO: remove below before merging. used for convenience now @@ -797,6 +842,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); + if (__num_remaining > __block_size) { // Resize for the next block. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index a2bfef10fb0..9a935152446 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -237,10 +237,22 @@ template struct __gen_transform_input; template -struct __gen_count_pred; +struct __gen_mask; -template -struct __gen_expand_count_pred; +template +struct __gen_unique_mask; + +template +struct __gen_count_mask; + +template +struct __gen_expand_count_mask; + +template +struct __write_to_idx_if; + +template +struct __write_to_idx_if_else; template struct __early_exit_find_or; @@ -255,15 +267,41 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_pred, _Predicate)> +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)> : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred, - _Predicate)> - : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, + _BinaryPredicate)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPredicate> +{ +}; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_mask, _GenMask)> + : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask, + _GenMask)> + : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset, + Assign)> + : oneapi::dpl::__internal::__are_all_device_copyable +{ +}; + +template +struct 
sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else, + Assign)> + : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index f85dc811ab2..831b4fdf1f4 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -91,35 +91,15 @@ oneapi::dpl::__internal::__difference_t<_Range2> __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { - if (__rng1.empty()) + auto __n = __rng1.size(); + if (__n == 0) return 0; - oneapi::dpl::__internal::__difference_t<_Range2> __rng1_size = __rng1.size(); - - using _Type = typename _InitType::__value_type; - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; - using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _NoAssign __no_assign_op; - _NoOpFunctor __get_data_op; - - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_base( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), - ::std::forward<_Range2>(__rng2), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, - __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}) + + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), __n, __unary_op, __init, __binary_op, _Inclusive{}) .__deferrable_wait(); - return __rng1_size; + return __n; } template >, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_count_pred - static_assert(sycl::is_device_copyable_v>, - "__gen_count_pred is not device copyable with device copyable types"); + //__gen_mask + static_assert(sycl::is_device_copyable_v>, + "__gen_mask is not device copyable with device copyable types"); - //__gen_expand_count_pred + //__gen_unique_mask static_assert( - sycl::is_device_copyable_v>, - "__gen_expand_count_pred is not device copyable with device copyable types"); + sycl::is_device_copyable_v>, + "__gen_unique_mask is not device copyable with device copyable types"); + + //__gen_count_mask + static_assert(sycl::is_device_copyable_v>>, + "__gen_count_mask is not device copyable with device copyable types"); + + //__gen_expand_count_mask + static_assert(sycl::is_device_copyable_v>>, + "__gen_expand_count_mask is not device copyable with device copyable types"); + + //__write_to_idx_if + static_assert( + sycl::is_device_copyable_v>, + "__write_to_idx_if is not device copyable with device copyable types"); + + //__write_to_idx_if_else + static_assert( + sycl::is_device_copyable_v>, + "__write_to_idx_if_else is not device copyable with 
device copyable types"); // __early_exit_find_or static_assert( @@ -357,20 +377,39 @@ test_non_device_copyable() oneapi::dpl::unseq_backend::__brick_reduce_idx>, "__brick_reduce_idx is device copyable with non device copyable types"); - // //__gen_transform_input + //__gen_transform_input static_assert( !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_count_pred - static_assert( - !sycl::is_device_copyable_v>, - "__gen_count_pred is device copyable with non device copyable types"); + //__gen_mask + static_assert(!sycl::is_device_copyable_v>, + "__gen_mask is device copyable with non device copyable types"); + + //__gen_unique_mask + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_unique_mask>, + "__gen_unique_mask is device copyable with non device copyable types"); + + //__gen_count_mask + static_assert(!sycl::is_device_copyable_v>>, + "__gen_count_mask is device copyable with non device copyable types"); + + //__gen_expand_count_mask + static_assert(!sycl::is_device_copyable_v>>, + "__gen_expand_count_mask is device copyable with non device copyable types"); + + //__write_to_idx_if + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_non_device_copyable>>, + "__write_to_idx_if is device copyable with non device copyable types"); - //__gen_expand_count_pred + //__write_to_idx_if_else static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred>, - "__gen_expand_count_pred is device copyable with non device copyable types"); + oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else>, + "__write_to_idx_if_else is device copyable with non device copyable types"); // __early_exit_find_or static_assert( diff --git a/test/support/utils_device_copyable.h b/test/support/utils_device_copyable.h index 7b98de6501c..32e02991933 100644 --- a/test/support/utils_device_copyable.h +++ b/test/support/utils_device_copyable.h @@ -48,6 +48,57 @@ struct noop_non_device_copyable } }; +// Device copyable assignment callable. +// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not +// relying on trivial copyability +struct assign_non_device_copyable +{ + assign_non_device_copyable(const assign_non_device_copyable& other) { std::cout << "non trivial copy ctor\n"; } + template + void + operator()(const _Xp& __x, _Yp&& __y) const + { + std::forward<_Yp>(__y) = __x; + } +}; + +struct assign_device_copyable +{ + assign_device_copyable(const assign_device_copyable& other) { std::cout << "non trivial copy ctor\n"; } + template + void + operator()(const _Xp& __x, _Yp&& __y) const + { + std::forward<_Yp>(__y) = __x; + } +}; + +// Device copyable binary operator binary operators. +// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not +// relying on trivial copyability +struct binary_op_non_device_copyable +{ + binary_op_non_device_copyable(const binary_op_non_device_copyable& other) + { + std::cout << " non trivial copy ctor\n"; + } + int + operator()(int a, int b) const + { + return a; + } +}; + +struct binary_op_device_copyable +{ + binary_op_device_copyable(const binary_op_device_copyable& other) { std::cout << " non trivial copy ctor\n"; } + int + operator()(int a, int b) const + { + return a; + } +}; + // Device copyable int wrapper struct used in testing as surrogate for values, value types, etc. 
// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not // relying on trivial copyability @@ -160,6 +211,16 @@ struct sycl::is_device_copyable : std::true_typ { }; +template <> +struct sycl::is_device_copyable : std::true_type +{ +}; + +template <> +struct sycl::is_device_copyable : std::true_type +{ +}; + template <> struct sycl::is_device_copyable : std::true_type { From 13cecbf76499d3dd2781c846963772b489d5dc54 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Sun, 4 Aug 2024 10:30:48 -0400 Subject: [PATCH 22/88] fix windows issue regression __out Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 00c0602bdba..9e4f68ee688 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -886,15 +886,16 @@ struct __write_to_idx_if_else { template void - operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const + operator()(_OutRng&& __out_rng, _SizeType __idx, const ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, - std::decay_t>::__type; + std::decay_t>::__type; if (std::get<1>(__v)) - __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<0>(__out[std::get<0>(__v) - 1])); + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<0>(__out_rng[std::get<0>(__v) - 1])); else - __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<1>(__out[__idx - std::get<0>(__v)])); + __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), + std::get<1>(__out_rng[__idx - std::get<0>(__v)])); } Assign __assign; }; From 2daefab22fa7dd8b689832329769d945fb626c18 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Sun, 4 Aug 2024 11:24:41 -0400 Subject: [PATCH 23/88] fix for missing assigner in copy if pattern standardize setup for __parallel_scan_copy Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9e4f68ee688..a87511e5a20 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1101,15 +1101,16 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t } else { - unseq_backend::__copy_by_mask, oneapi::dpl::__internal::__pstl_assign, - /*inclusive*/ std::true_type, 1> - __copy_by_mask_op; - oneapi::dpl::__internal::__create_mask_unique_copy, - decltype(__n)> - __create_mask_op{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}; + + using _ReduceOp = std::plus; + using _CreateOp = oneapi::dpl::__internal::__create_mask_unique_copy, + decltype(__n)>; + using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, __create_mask_op, __copy_by_mask_op); + std::forward<_Range2>(__result), __n, + _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, + 
_CopyOp{_ReduceOp{}, std::forward<_Assign>(__assign)}); } } @@ -1132,12 +1133,11 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen else { using _ReduceOp = std::plus; - - unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)> __create_mask_op{__pred}; - unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type> __partition_by_mask{_ReduceOp{}}; + using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>; + using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, __create_mask_op, __partition_by_mask); + std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); } } @@ -1179,18 +1179,19 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _GenMask{__pred}, _WriteOp{}, /*Unique=*/std::false_type{}); + _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, + /*Unique=*/std::false_type{}); } else { using _ReduceOp = std::plus<_Size>; - using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; - using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, + using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>; + using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - CreateOp{__pred}, CopyOp{}); + _CreateOp{__pred}, _CopyOp{_ReduceOp{}, std::forward<_Assign>(__assign)}); } } From 4a83e1b031a4cd5197c064aee88d49972b8770b3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Sun, 4 Aug 2024 15:43:16 -0400 Subject: [PATCH 24/88] fix unique same mangled name problem Signed-off-by: Dan Hoeflinger --- .../hetero/algorithm_ranges_impl_hetero.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index e301863bb2d..0bc7dcbb403 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -383,6 +383,9 @@ __pattern_remove_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, // unique_copy //------------------------------------------------------------------------ +template +struct __copy_wrapper; + template oneapi::dpl::__internal::__difference_t<_Range2> @@ -396,7 +399,9 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec { using CopyBrick = oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>; oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + _BackendTag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( + ::std::forward<_ExecutionPolicy>(__exec)), unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), std::forward<_Range2>(__result)) .get(); @@ -414,6 +419,9 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec 
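
The wrapped policies added in this commit address the problem named in its subject line: SYCL kernel names are C++ types, so two pattern invocations that would otherwise derive identical kernel-name types from the same user policy can collide on one mangled name. Wrapping the policy in a distinct tag (__copy_wrapper, __unique_wrapper) makes the derived types, and therefore the kernel names, distinct. The snippet below is a simplified, hypothetical illustration of that idea in plain C++; launch_demo and the *_demo tags are stand-ins, not the oneDPL kernel-naming machinery.

// Why a wrapper tag disambiguates kernel names: the name is carried in the type.
#include <iostream>
#include <typeinfo>

template <typename _PolicyName> struct copy_wrapper_demo {};
template <typename _PolicyName> struct unique_wrapper_demo {};

template <typename _KernelName>
void launch_demo()
{
    // In the real backend, _KernelName is handed to the SYCL kernel launch;
    // two launches instantiated with the same type would share one kernel name.
    std::cout << typeid(_KernelName).name() << '\n';
}

struct user_policy_name {}; // stands in for the policy's kernel-name type

int main()
{
    launch_demo<copy_wrapper_demo<user_policy_name>>();   // distinct wrapper tag...
    launch_demo<unique_wrapper_demo<user_policy_name>>(); // ...hence a distinct kernel name
}
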
// unique //------------------------------------------------------------------------ +template +struct __unique_wrapper; + template oneapi::dpl::__internal::__difference_t<_Range> __pattern_unique(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range&& __rng, _BinaryPredicate __pred) @@ -425,10 +433,13 @@ __pattern_unique(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ra oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _ValueType> __buf(__exec, __rng.size()); auto res_rng = oneapi::dpl::__ranges::views::all(__buf.get_buffer()); - auto res = __ranges::__pattern_unique_copy(__tag, __exec, __rng, res_rng, __pred, - oneapi::dpl::__internal::__pstl_assign()); + auto res = __ranges::__pattern_unique_copy( + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__unique_wrapper>(__exec), __rng, res_rng, __pred, + oneapi::dpl::__internal::__pstl_assign()); - __ranges::__pattern_walk_n(__tag, ::std::forward<_ExecutionPolicy>(__exec), + __ranges::__pattern_walk_n(__tag, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( + ::std::forward<_ExecutionPolicy>(__exec)), __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}, res_rng, ::std::forward<_Range>(__rng)); return res; From 299b28ba2780b492e49b30186e01c0f7535ff127 Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:45:59 -0500 Subject: [PATCH 25/88] [PROTOTYPE] Cleanup reduce-then-scan code (#1760) * Cleanup reduce-then-scan code Signed-off-by: Matthew Michel * Remove old comment Signed-off-by: Matthew Michel --------- Signed-off-by: Matthew Michel --- .../parallel_backend_sycl_reduce_then_scan.h | 71 ++++++++++--------- .../exclusive_scan_by_segment.pass.cpp | 2 - 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8ac40cfa91a..d2f4cfa4aab 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -231,7 +231,7 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp __write_op(__out_rng, __start_idx, __v); } - for (int __j = 1; __j < __iters - 1; __j++) + for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { auto __local_idx = __start_idx + __j * __sub_group_size; __v = __gen_input(__in_rng, __local_idx); @@ -331,9 +331,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); } - // TODO: This is slower then ndi.barrier which was removed in SYCL2020. Can we do anything about it? 
- //sycl::group_barrier(ndi.get_group()); - __ndi.barrier(sycl::access::fence_space::local_space); + __dpl_sycl::__group_barrier(__ndi); // compute sub-group local prefix sums on (T0..63) carries // and store to scratch space at the end of dst; next @@ -364,7 +362,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __sub_group, __v, __reduce_op, __sub_group_carry); __temp_ptr[__start_idx + __sub_group_local_id] = __v; - for (int __i = 1; __i < __iters - 1; __i++) + for (std::uint32_t __i = 1; __i < __iters - 1; __i++) { __v = __sub_group_partials[__i * __sub_group_size + __sub_group_local_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( @@ -542,7 +540,7 @@ struct __parallel_reduce_then_scan_scan_submitter< __sub_group, __value.__v, __reduce_op, __carry_last); // then some number of full iterations - for (int __i = 1; __i < __pre_carry_iters - 1; __i++) + for (std::uint32_t __i = 1; __i < __pre_carry_iters - 1; __i++) { auto __reduction_idx = __i * __num_sub_groups_local * __sub_group_size + __num_sub_groups_local * __sub_group_local_id + __offset; @@ -567,9 +565,7 @@ struct __parallel_reduce_then_scan_scan_submitter< } } - // N.B. barrier could be earlier, guarantees slm local carry update - //sycl::group_barrier(ndi.get_group()); - __ndi.barrier(sycl::access::fence_space::local_space); + __dpl_sycl::__group_barrier(__ndi); // steps 3/4) load global carry in from neighbor work-group // and apply to local sub-group prefix carries @@ -599,8 +595,7 @@ struct __parallel_reduce_then_scan_scan_submitter< } __value.__destroy(); - //sycl::group_barrier(ndi.get_group()); - __ndi.barrier(sycl::access::fence_space::local_space); + __dpl_sycl::__group_barrier(__ndi); // Get inter-work group and adjusted for intra-work group prefix bool __sub_group_carry_initialized = true; @@ -771,12 +766,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ constexpr bool __inclusive = _Inclusive::value; constexpr bool __is_unique_pattern_v = _IsUniquePattern::value; - // TODO: Do we need to adjust for slm usage or is the amount we use reasonably small enough - // that no check is needed? // TODO: This min call is temporary until PR #1683 is merged. const std::size_t __work_group_size = std::min(std::size_t(8192), oneapi::dpl::__internal::__max_work_group_size(__exec)); - // TODO: base on max compute units. Recall disconnect in vendor definitions (# SMs vs. # XVEs) + // TODO: Investigate potentially basing this on some scale of the number of compute units. 128 work-groups has been + // found to be reasonable number for most devices. 
const std::size_t __num_work_groups = 128; const std::size_t __num_work_items = __num_work_groups * __work_group_size; const std::size_t __num_sub_groups_local = __work_group_size / __sub_group_size; @@ -809,21 +803,33 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Reduce and scan step implementations using _ReduceSubmitter = - __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, - _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; + __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, + _ReduceKernel>; using _ScanSubmitter = - __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, - _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, - _WriteOp, _InitType, _ScanKernel>; - // TODO: remove below before merging. used for convenience now - // clang-format off - _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, - __num_sub_groups_global, __num_work_items, __n, __gen_reduce_input, __reduce_op, __init}; - _ScanSubmitter __scan_submitter{__max_inputs_per_block, __num_sub_groups_local, - __num_sub_groups_global, __num_work_items, __num_blocks, __n, __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_op, __init}; - // clang-format on - + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, + __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _GenScanInput, + _ScanInputTransform, _WriteOp, _InitType, _ScanKernel>; + _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, + __num_sub_groups_local, + __num_sub_groups_global, + __num_work_items, + __n, + __gen_reduce_input, + __reduce_op, + __init}; + _ScanSubmitter __scan_submitter{__max_inputs_per_block, + __num_sub_groups_local, + __num_sub_groups_global, + __num_work_items, + __num_blocks, + __n, + __gen_reduce_input, + __reduce_op, + __gen_scan_input, + __scan_input_transform, + __write_op, + __init}; sycl::event __event; // Data is processed in 2-kernel blocks to allow contiguous input segment to persist in LLC between the first and second kernel for accelerators // with sufficiently large L2 / L3 caches. @@ -842,13 +848,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); - - if (__num_remaining > __block_size) + __num_remaining -= std::min(__num_remaining, __block_size); + // We only need to resize these parameters prior to the last block as it is the only non-full case. + if (__b + 2 == __num_blocks) { - // Resize for the next block. - __num_remaining -= __block_size; - // TODO: This recalculation really only matters for the second to last iteration - // of the loop since the last iteration is the only non-full block. __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block ? 
__max_inputs_per_block / __num_sub_groups_global diff --git a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp index ea41167a55f..a856f83afc7 100644 --- a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp @@ -254,8 +254,6 @@ main() #endif // TEST_DPCPP_BACKEND_PRESENT } - // TODO: Investigate why -fno-fast-math flag causes failures in icpx 2024.2.0 + CPU + Release build with these tests. Additionally, verify - // if we should stop testing with std::complex as it is not officially supported in SYCL kernels. { using ValueType = MatrixPoint; using BinaryPredicate = UserBinaryPredicate; From 82668828322991db02e09b5526b53bf1edd85bed Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 11:15:46 -0400 Subject: [PATCH 26/88] restoring removed whitespace line Signed-off-by: Dan Hoeflinger --- .../numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp index 344f4f93834..1e955a729c3 100644 --- a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp @@ -221,6 +221,7 @@ main() test_algo_three_sequences>(); #endif // TEST_DPCPP_BACKEND_PRESENT } + { using ValueType = MatrixPoint; using BinaryPredicate = UserBinaryPredicate; From 453d4ca8bcc9f3ee0842bc0171b6b591e1a0e4e1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 12:52:44 -0400 Subject: [PATCH 27/88] removing unnecessay storage type from kernel name Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index a87511e5a20..1e26b1148bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -696,7 +696,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__scan_single_wg_kernel< ::std::integral_constant<::std::uint16_t, __wg_size>, ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, _BinaryOperation, - /* _IsFullGroup= */ ::std::false_type, _Inclusive, _TempStorage, _CustomName>>>()( + /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()( ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); return __future(__event, __dummy_result_and_scratch); From 78e33ac1ac9321432bdc90ec3cb76733a097b0d8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 11:22:57 -0400 Subject: [PATCH 28/88] remove unique pattern family from reduce_then_scan Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 55 +++++++---- .../hetero/algorithm_ranges_impl_hetero.h | 93 ++++++++++++------- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 61 ++---------- .../parallel_backend_sycl_reduce_then_scan.h | 58 +++--------- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 9 -- 
.../device_copyable.pass.cpp | 10 -- test/support/utils_device_copyable.h | 31 ------- 7 files changed, 114 insertions(+), 203 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index ff1dd010011..8f247791f38 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -886,6 +886,33 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat // copy_if //------------------------------------------------------------------------ +template +::std::pair<_IteratorOrTuple, typename ::std::iterator_traits<_Iterator1>::difference_type> +__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1 __last, + _IteratorOrTuple __output_first, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) +{ + using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; + + if (__first == __last) + return ::std::make_pair(__output_first, _It1DifferenceType{0}); + + _It1DifferenceType __n = __last - __first; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __buf1 = __keep1(__first, __last); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _IteratorOrTuple>(); + auto __buf2 = __keep2(__output_first, __output_first + __n); + + auto __res = __par_backend_hetero::__parallel_scan_copy(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + __buf1.all_view(), __buf2.all_view(), __n, __create_mask_op, + __copy_by_mask_op); + + ::std::size_t __num_copied = __res.get(); + return ::std::make_pair(__output_first + __n, __num_copied); +} + template _Iterator2 @@ -956,28 +983,16 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _Iterator2 __result_first, _BinaryPredicate __pred) { using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; + unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ ::std::true_type, 1> + __copy_by_mask_op; + __create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ + __not_pred<_BinaryPredicate>{__pred}}; - _It1DifferenceType __n = __last - __first; - - if (__n == 0) - return __result_first; - if (__n == 1) - { - oneapi::dpl::__internal::__pattern_walk2_brick( - __hetero_tag<_BackendTag>{}, std::forward<_ExecutionPolicy>(__exec), __first, __last, __result_first, - oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}); - return __result_first + 1; - } - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); - auto __buf1 = __keep1(__first, __last); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); - auto __buf2 = __keep2(__result_first, __result_first + __n); - - auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + auto __result = __pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + __result_first, __create_mask_op, __copy_by_mask_op); - return __result_first + __result.get(); + return __result_first + __result.second; } template diff 
--git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 0bc7dcbb403..7dd0b9537b3 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -334,6 +334,52 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ // copy_if //------------------------------------------------------------------------ +template +oneapi::dpl::__internal::__difference_t<_Range1> +__pattern_scan_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) +{ + if (__rng1.size() == 0) + return __rng1.size(); + + using _SizeType = decltype(__rng1.size()); + using _ReduceOp = ::std::plus<_SizeType>; + using _Assigner = unseq_backend::__scan_assigner; + using _NoAssign = unseq_backend::__scan_no_assign; + using _MaskAssigner = unseq_backend::__mask_assigner<1>; + using _InitType = unseq_backend::__no_init_value<_SizeType>; + using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; + + _Assigner __assign_op; + _ReduceOp __reduce_op; + _DataAcc __get_data_op; + _MaskAssigner __add_mask_op; + + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __rng1.size()); + + auto __res = + __par_backend_hetero::__parallel_transform_scan_base( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::zip_view( + __rng1, oneapi::dpl::__ranges::all_view( + __mask_buf.get_buffer())), + __rng2, __reduce_op, _InitType{}, + // local scan + unseq_backend::__scan{__reduce_op, __get_data_op, __assign_op, + __add_mask_op, __create_mask_op}, + // scan between groups + unseq_backend::__scan{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, + __get_data_op}, + // global scan + __copy_by_mask_op) + .get(); + + return __res; +} + template oneapi::dpl::__internal::__difference_t<_Range2> @@ -383,45 +429,27 @@ __pattern_remove_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, // unique_copy //------------------------------------------------------------------------ -template -struct __copy_wrapper; - template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng, _Range2&& __result, - _BinaryPredicate __pred, _Assign&& __assign) + _BinaryPredicate __pred, _Assign) { - auto __n = __rng.size(); - if (__n == 0) - return 0; - if (__n == 1) - { - using CopyBrick = oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::walk_n<_ExecutionPolicy, CopyBrick>{CopyBrick{}}, __n, std::forward<_Range1>(__rng), - std::forward<_Range2>(__result)) - .get(); - - return 1; - } - - return oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __pred, std::forward<_Assign>(__assign)) - .get(); + using _It1DifferenceType = oneapi::dpl::__internal::__difference_t<_Range1>; + unseq_backend::__copy_by_mask<::std::plus<_It1DifferenceType>, _Assign, /*inclusive*/ ::std::true_type, 1> + __copy_by_mask_op; + 
__create_mask_unique_copy<__not_pred<_BinaryPredicate>, _It1DifferenceType> __create_mask_op{ + __not_pred<_BinaryPredicate>{__pred}}; + + return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), + ::std::forward<_Range1>(__rng), ::std::forward<_Range2>(__result), + __create_mask_op, __copy_by_mask_op); } //------------------------------------------------------------------------ // unique //------------------------------------------------------------------------ -template -struct __unique_wrapper; - template oneapi::dpl::__internal::__difference_t<_Range> __pattern_unique(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range&& __rng, _BinaryPredicate __pred) @@ -433,13 +461,10 @@ __pattern_unique(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ra oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _ValueType> __buf(__exec, __rng.size()); auto res_rng = oneapi::dpl::__ranges::views::all(__buf.get_buffer()); - auto res = __ranges::__pattern_unique_copy( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__unique_wrapper>(__exec), __rng, res_rng, __pred, - oneapi::dpl::__internal::__pstl_assign()); + auto res = __ranges::__pattern_unique_copy(__tag, __exec, __rng, res_rng, __pred, + oneapi::dpl::__internal::__pstl_assign()); - __ranges::__pattern_walk_n(__tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), + __ranges::__pattern_walk_n(__tag, ::std::forward<_ExecutionPolicy>(__exec), __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}, res_rng, ::std::forward<_Range>(__rng)); return res; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1e26b1148bc..5f52ebb3393 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -810,19 +810,6 @@ struct __gen_mask _Predicate __pred; }; -template -struct __gen_unique_mask -{ - template - bool - operator()(_InRng&& __in_rng, std::size_t __idx) const - { - //starting index is offset to 1 for "unique" patterns and 0th element copy is handled separately - return !__pred(__in_rng[__idx], __in_rng[__idx - 1]); - } - _BinaryPredicate __pred; -}; - template struct __gen_count_mask { @@ -941,8 +928,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, - /*_IsUniquePattern=*/std::false_type{}); + _WriteOp{}, __init, _Inclusive{}); } } { @@ -1021,11 +1007,11 @@ struct __invoke_single_group_copy_if }; template + typename _WriteOp> auto __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, - _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) + _WriteOp __write_op) { using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; using _ReduceOp = std::plus<_Size>; @@ -1036,7 +1022,7 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _ __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), 
std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}, __is_unique_pattern); + /*_Inclusive=*/std::true_type{}); } template -auto -__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred, - _Assign&& __assign = oneapi::dpl::__internal::__pstl_assign{}) -{ - - auto __n = __rng.size(); - if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) - { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<1, _Assign>; - - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, - /*_IsUniquePattern=*/std::true_type{}); - } - else - { - - using _ReduceOp = std::plus; - using _CreateOp = oneapi::dpl::__internal::__create_mask_unique_copy, - decltype(__n)>; - using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, - _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, - _CopyOp{_ReduceOp{}, std::forward<_Assign>(__assign)}); - } -} - template auto __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, @@ -1128,7 +1080,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); + _GenMask{__pred}, _WriteOp{}); } else { @@ -1179,8 +1131,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}, - /*Unique=*/std::false_type{}); + _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}); } else { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d2f4cfa4aab..b5ea8ca0e7d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -266,15 +266,15 @@ template class __reduce_then_scan_scan_kernel; template struct __parallel_reduce_then_scan_reduce_submitter; template struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, + _GenReduceInput, _ReduceOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory @@ -304,11 +304,7 @@ struct 
__parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - if constexpr (__is_unique_pattern_v) - { - // for unique patterns, the first element is always copied to the output, so we need to skip it - __group_start_idx += 1; - } + std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = @@ -404,15 +400,15 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu }; template struct __parallel_reduce_then_scan_scan_submitter; template struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, + __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { @@ -460,11 +456,6 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __group_start_idx = (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); - if constexpr (__is_unique_pattern_v) - { - // for unique patterns, the first element is always copied to the output, so we need to skip it - __group_start_idx += 1; - } std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -617,15 +608,6 @@ struct __parallel_reduce_then_scan_scan_submitter< } else // zeroth block, group and subgroup { - if constexpr (__is_unique_pattern_v) - { - if (__sub_group_local_id == 0) - { - // For unique patterns, always copy the 0th element to the output - __write_op.__assign(__in_rng[0], __out_rng[0]); - } - } - if constexpr (std::is_same_v<_InitType, oneapi::dpl::unseq_backend::__no_init_value<_InitValueType>>) { @@ -688,14 +670,7 @@ struct __parallel_reduce_then_scan_scan_submitter< { if (__block_num + 1 == __num_blocks) { - if constexpr (__is_unique_pattern_v) - { - __res_ptr[0] = __sub_group_carry.__v + 1; - } - else - { - __res_ptr[0] = __sub_group_carry.__v; - } + __res_ptr[0] = __sub_group_carry.__v; } else { @@ -745,13 +720,13 @@ __prefer_reduce_then_scan(const _ExecutionPolicy& __exec) // and performs the final write to output operation template + typename _Inclusive> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern) + _Inclusive) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -764,7 +739,6 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Empirically determined maximum. May be less for non-full blocks. constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; - constexpr bool __is_unique_pattern_v = _IsUniquePattern::value; // TODO: This min call is temporary until PR #1683 is merged. 
const std::size_t __work_group_size = std::min(std::size_t(8192), oneapi::dpl::__internal::__max_work_group_size(__exec)); @@ -778,12 +752,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::size_t __n = __in_rng.size(); const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __num_remaining = __n; - if constexpr (__is_unique_pattern_v) - { - // skip scan of zeroth element in unique patterns - __num_remaining -= 1; - } - // reduce_then_scan kernel is not built to handle "empty" scans which includes `__n == 1` for unique patterns. + + // reduce_then_scan kernel is not built to handle "empty". // These trivial end cases should be handled at a higher level. assert(__num_remaining > 0); auto __inputs_per_sub_group = @@ -804,11 +774,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // Reduce and scan step implementations using _ReduceSubmitter = __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, + _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _GenScanInput, + _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, _ScanKernel>; _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 9a935152446..e4a61210193 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -239,9 +239,6 @@ struct __gen_transform_input; template struct __gen_mask; -template -struct __gen_unique_mask; - template struct __gen_count_mask; @@ -272,12 +269,6 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, - _BinaryPredicate)> - : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPredicate> -{ -}; template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_mask, _GenMask)> : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 25f5fc2e608..5f8b9c1a459 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -154,11 +154,6 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v>, "__gen_mask is not device copyable with device copyable types"); - //__gen_unique_mask - static_assert( - sycl::is_device_copyable_v>, - "__gen_unique_mask is not device copyable with device copyable types"); - //__gen_count_mask static_assert(sycl::is_device_copyable_v>>, @@ -386,11 +381,6 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v>, "__gen_mask is device copyable with non device copyable types"); - //__gen_unique_mask - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__gen_unique_mask>, - "__gen_unique_mask is device copyable with non device copyable types"); 
- //__gen_count_mask static_assert(!sycl::is_device_copyable_v>>, diff --git a/test/support/utils_device_copyable.h b/test/support/utils_device_copyable.h index 32e02991933..ea5d7a63240 100644 --- a/test/support/utils_device_copyable.h +++ b/test/support/utils_device_copyable.h @@ -73,32 +73,6 @@ struct assign_device_copyable } }; -// Device copyable binary operator binary operators. -// Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not -// relying on trivial copyability -struct binary_op_non_device_copyable -{ - binary_op_non_device_copyable(const binary_op_non_device_copyable& other) - { - std::cout << " non trivial copy ctor\n"; - } - int - operator()(int a, int b) const - { - return a; - } -}; - -struct binary_op_device_copyable -{ - binary_op_device_copyable(const binary_op_device_copyable& other) { std::cout << " non trivial copy ctor\n"; } - int - operator()(int a, int b) const - { - return a; - } -}; - // Device copyable int wrapper struct used in testing as surrogate for values, value types, etc. // Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not // relying on trivial copyability @@ -216,11 +190,6 @@ struct sycl::is_device_copyable : std::true_t { }; -template <> -struct sycl::is_device_copyable : std::true_type -{ -}; - template <> struct sycl::is_device_copyable : std::true_type { From 826751394210756a6f2307a21aabde005125cae8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 12:01:14 -0400 Subject: [PATCH 29/88] remove partition pattern family from reduce_then_scan Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 25 ++++------ .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 46 ------------------- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 10 ---- .../device_copyable.pass.cpp | 10 ---- 4 files changed, 10 insertions(+), 81 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 8f247791f38..4334fb1d51c 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -952,24 +952,19 @@ __pattern_partition_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e return ::std::make_pair(__result1, __result2); using _It1DifferenceType = typename ::std::iterator_traits<_Iterator1>::difference_type; + using _ReduceOp = ::std::plus<_It1DifferenceType>; - _It1DifferenceType __n = __last - __first; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); - auto __buf1 = __keep1(__first, __last); + unseq_backend::__create_mask<_UnaryPredicate, _It1DifferenceType> __create_mask_op{__pred}; + unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ ::std::true_type> __copy_by_mask_op{_ReduceOp{}}; - auto __zipped_res = __par_backend_hetero::zip( - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)); - - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); - auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); - - auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + auto __result = 
__pattern_scan_copy( + __tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + __par_backend_hetero::zip( + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result1), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__result2)), + __create_mask_op, __copy_by_mask_op); - return std::make_pair(__result1 + __result.get(), __result2 + (__last - __first - __result.get())); + return ::std::make_pair(__result1 + __result.second, __result2 + (__last - __first - __result.second)); } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5f52ebb3393..e369f01cf8a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -868,25 +868,6 @@ struct __write_to_idx_if Assign __assign; }; -template -struct __write_to_idx_if_else -{ - template - void - operator()(_OutRng&& __out_rng, _SizeType __idx, const ValueType& __v) const - { - using _ConvertedTupleType = - typename oneapi::dpl::__internal::__get_tuple_type(__v))>, - std::decay_t>::__type; - if (std::get<1>(__v)) - __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), std::get<0>(__out_rng[std::get<0>(__v) - 1])); - else - __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), - std::get<1>(__out_rng[__idx - std::get<0>(__v)])); - } - Assign __assign; -}; - template auto @@ -1066,33 +1047,6 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } -template -auto -__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) -{ - auto __n = __rng.size(); - if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) - { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; - using _WriteOp = - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else; - - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{}); - } - else - { - using _ReduceOp = std::plus; - using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>; - using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; - - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); - } -} - template auto diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index e4a61210193..3efb6cf1547 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -248,9 +248,6 @@ struct __gen_expand_count_mask; template struct __write_to_idx_if; -template -struct __write_to_idx_if_else; - template struct __early_exit_find_or; @@ -289,13 +286,6 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else, - Assign)> - : oneapi::dpl::__internal::__are_all_device_copyable -{ -}; - 
template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, _ExecutionPolicy, _Pred)> diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 5f8b9c1a459..7c51200e9e8 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -169,11 +169,6 @@ test_device_copyable() sycl::is_device_copyable_v>, "__write_to_idx_if is not device copyable with device copyable types"); - //__write_to_idx_if_else - static_assert( - sycl::is_device_copyable_v>, - "__write_to_idx_if_else is not device copyable with device copyable types"); - // __early_exit_find_or static_assert( sycl::is_device_copyable_v< @@ -396,11 +391,6 @@ test_non_device_copyable() oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_non_device_copyable>>, "__write_to_idx_if is device copyable with non device copyable types"); - //__write_to_idx_if_else - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__write_to_idx_if_else>, - "__write_to_idx_if_else is device copyable with non device copyable types"); - // __early_exit_find_or static_assert( !sycl::is_device_copyable_v Date: Mon, 5 Aug 2024 12:37:30 -0400 Subject: [PATCH 30/88] remove copy_if pattern family from reduce_then_scan Signed-off-by: Dan Hoeflinger --- .../hetero/algorithm_ranges_impl_hetero.h | 16 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 161 +++--------------- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 38 ----- .../device_copyable.pass.cpp | 38 ----- test/support/utils_device_copyable.h | 30 ---- 5 files changed, 34 insertions(+), 249 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7dd0b9537b3..1df37f8acbc 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -384,17 +384,17 @@ template oneapi::dpl::__internal::__difference_t<_Range2> __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Predicate __pred, _Assign&& __assign) + _Predicate __pred, _Assign) { - auto __n = __rng1.size(); - if (__n == 0) - return 0; + using _SizeType = decltype(__rng1.size()); + using _ReduceOp = ::std::plus<_SizeType>; - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __n, __pred, std::forward<_Assign>(__assign)); + unseq_backend::__create_mask<_Predicate, _SizeType> __create_mask_op{__pred}; + unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ ::std::true_type, 1> __copy_by_mask_op; - return __res.get(); //is a blocking call + return __ranges::__pattern_scan_copy(__tag, ::std::forward<_ExecutionPolicy>(__exec), + ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), + __create_mask_op, __copy_by_mask_op); } //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e369f01cf8a..765a16c1422 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -555,10 
+555,10 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __internal::__optional_kernel_name<_ScanKernelName...>> { template + typename _UnaryOp> auto operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, - _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) + _BinaryOperation __bin_op, _UnaryOp __unary_op) { using _ValueType = ::std::uint16_t; @@ -622,8 +622,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize) { if (__lacc[__idx]) - __assign(static_cast<__tuple_type>(__in_rng[__idx]), - __out_rng[__lacc[__idx + __elems_per_wg]]); + __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); } const ::std::uint16_t __residual = __n % _WGSize; @@ -632,8 +631,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W { auto __idx = __residual_start + __item_id; if (__lacc[__idx]) - __assign(static_cast<__tuple_type>(__in_rng[__idx]), - __out_rng[__lacc[__idx + __elems_per_wg]]); + __out_rng[__lacc[__idx + __elems_per_wg]] = static_cast<__tuple_type>(__in_rng[__idx]); } if (__item_id == 0) @@ -798,76 +796,6 @@ struct __simple_write_to_idx } }; -template -struct __gen_mask -{ - template - bool - operator()(_InRng&& __in_rng, std::size_t __idx) const - { - return __pred(__in_rng[__idx]); - } - _Predicate __pred; -}; - -template -struct __gen_count_mask -{ - template - _SizeType - operator()(_InRng&& __in_rng, _SizeType __idx) const - { - return __gen_mask(std::forward<_InRng>(__in_rng), __idx) ? _SizeType{1} : _SizeType{0}; - } - _GenMask __gen_mask; -}; - -template -struct __gen_expand_count_mask -{ - template - auto - operator()(_InRng&& __in_rng, _SizeType __idx) const - { - // Explicitly creating this element type is necessary to avoid modifying the input data when _InRng is a - // zip_iterator which will return a tuple of references when dereferenced. With this explicit type, we copy - // the values of zipped the input types rather than their references. - using _ElementType = - oneapi::dpl::__internal::__decay_with_tuple_specialization_t>; - _ElementType ele = __in_rng[__idx]; - bool mask = __gen_mask(__in_rng, __idx); - return std::tuple(mask ? _SizeType{1} : _SizeType{0}, mask, ele); - } - _GenMask __gen_mask; -}; - -struct __get_zeroth_element -{ - template - auto& - operator()(_Tp&& __a) const - { - return std::get<0>(std::forward<_Tp>(__a)); - } -}; -template -struct __write_to_idx_if -{ - template - void - operator()(_OutRng&& __out_rng, _SizeType __idx, const ValueType& __v) const - { - // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our - // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
- using _ConvertedTupleType = - typename oneapi::dpl::__internal::__get_tuple_type(__v))>, - std::decay_t>::__type; - if (std::get<1>(__v)) - __assign(static_cast<_ConvertedTupleType>(std::get<2>(__v)), __out_rng[std::get<0>(__v) - 1 + __offset]); - } - Assign __assign; -}; - template auto @@ -944,16 +872,13 @@ struct __invoke_single_group_copy_if // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template + template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred> auto - operator()(_ExecutionPolicy&& __exec, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred, - _Assign&& __assign) + operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); const bool __is_full_group = __n == __wg_size; - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _InitType = unseq_backend::__no_init_value<::std::uint16_t>; using _ReduceOp = ::std::plus<::std::uint16_t>; @@ -967,8 +892,7 @@ struct __invoke_single_group_copy_if return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, _FullKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred), - std::forward<_Assign>(__assign)); + std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred)); } else { @@ -981,31 +905,11 @@ struct __invoke_single_group_copy_if return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, _NonFullKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred), - std::forward<_Assign>(__assign)); + std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, std::forward<_Pred>(__pred)); } } }; -template -auto -__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, - _WriteOp __write_op) -{ - using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; - using _ReduceOp = std::plus<_Size>; - using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; - using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; - - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, - _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}); -} - template auto @@ -1013,24 +917,22 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) { - using _ReduceOp = 
std::plus<_Size>; + using _ReduceOp = ::std::plus<_Size>; using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; using _MaskAssigner = unseq_backend::__mask_assigner<1>; using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; using _InitType = unseq_backend::__no_init_value<_Size>; - _Assigner __assign_op; _ReduceOp __reduce_op; _DataAcc __get_data_op; _MaskAssigner __add_mask_op; - // temporary buffer to store boolean mask oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n); return __parallel_transform_scan_base( __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::zip_view( + oneapi::dpl::__ranges::make_zip_view( ::std::forward<_InRng>(__in_rng), oneapi::dpl::__ranges::all_view( __mask_buf.get_buffer())), @@ -1047,56 +949,45 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } -template +template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign&& __assign = _Assign{}) + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred) { using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<::std::make_unsigned_t<_Size>>(__n)); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel const ::std::size_t __max_slm_size = __exec.queue().get_device().template get_info() / 2; - // The kernel stores n integers for the predicate and another n integers for the offsets const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2; - constexpr ::std::uint16_t __single_group_upper_limit = 2048; + constexpr ::std::uint16_t __single_group_upper_limit = 16384; - std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) { - using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>; + using _SizeBreakpoints = + ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __pred, std::forward<_Assign>(__assign)); - } - else if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) - { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, _Assign>; - - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _GenMask{__pred}, _WriteOp{std::forward<_Assign>(__assign)}); + _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), + ::std::forward<_OutRng>(__out_rng), __pred); } else { - using _ReduceOp = std::plus<_Size>; - using _CreateOp = 
unseq_backend::__create_mask<_Pred, _Size>; - using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, - /*inclusive*/ std::true_type, 1>; - - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _CreateOp{__pred}, _CopyOp{_ReduceOp{}, std::forward<_Assign>(__assign)}); + using _ReduceOp = ::std::plus<_Size>; + using CreateOp = unseq_backend::__create_mask<_Pred, _Size>; + using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, + /*inclusive*/ ::std::true_type, 1>; + + return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), + ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, + CreateOp{__pred}, CopyOp{}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 3efb6cf1547..b820741ea00 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -236,18 +236,6 @@ namespace oneapi::dpl::__par_backend_hetero template struct __gen_transform_input; -template -struct __gen_mask; - -template -struct __gen_count_mask; - -template -struct __gen_expand_count_mask; - -template -struct __write_to_idx_if; - template struct __early_exit_find_or; @@ -260,32 +248,6 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)> - : oneapi::dpl::__internal::__are_all_device_copyable<_Predicate> -{ -}; - -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_count_mask, _GenMask)> - : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> -{ -}; - -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask, - _GenMask)> - : oneapi::dpl::__internal::__are_all_device_copyable<_GenMask> -{ -}; - -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_to_idx_if, __offset, - Assign)> - : oneapi::dpl::__internal::__are_all_device_copyable -{ -}; - template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, _ExecutionPolicy, _Pred)> diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 7c51200e9e8..481a5cb1c05 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -150,25 +150,6 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_mask - static_assert(sycl::is_device_copyable_v>, - "__gen_mask is not device copyable with device copyable types"); - - //__gen_count_mask - static_assert(sycl::is_device_copyable_v>>, - "__gen_count_mask is not device copyable with device copyable types"); - - //__gen_expand_count_mask - static_assert(sycl::is_device_copyable_v>>, - "__gen_expand_count_mask is not device copyable with device copyable types"); - - //__write_to_idx_if - static_assert( - sycl::is_device_copyable_v>, - "__write_to_idx_if is not device copyable with device copyable types"); - // __early_exit_find_or static_assert( 
sycl::is_device_copyable_v< @@ -372,25 +353,6 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_mask - static_assert(!sycl::is_device_copyable_v>, - "__gen_mask is device copyable with non device copyable types"); - - //__gen_count_mask - static_assert(!sycl::is_device_copyable_v>>, - "__gen_count_mask is device copyable with non device copyable types"); - - //__gen_expand_count_mask - static_assert(!sycl::is_device_copyable_v>>, - "__gen_expand_count_mask is device copyable with non device copyable types"); - - //__write_to_idx_if - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::__par_backend_hetero::__write_to_idx_if<0, assign_non_device_copyable>>, - "__write_to_idx_if is device copyable with non device copyable types"); - // __early_exit_find_or static_assert( !sycl::is_device_copyable_v - void - operator()(const _Xp& __x, _Yp&& __y) const - { - std::forward<_Yp>(__y) = __x; - } -}; - -struct assign_device_copyable -{ - assign_device_copyable(const assign_device_copyable& other) { std::cout << "non trivial copy ctor\n"; } - template - void - operator()(const _Xp& __x, _Yp&& __y) const - { - std::forward<_Yp>(__y) = __x; - } -}; - // Device copyable int wrapper struct used in testing as surrogate for values, value types, etc. // Intentionally non-trivially copyable to test that device_copyable speciailzation works and we are not // relying on trivial copyability @@ -185,11 +160,6 @@ struct sycl::is_device_copyable : std::true_typ { }; -template <> -struct sycl::is_device_copyable : std::true_type -{ -}; - template <> struct sycl::is_device_copyable : std::true_type { From 404c4ef03e7e0bea7719aefb09525bba1bb5bddd Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 16:44:15 -0400 Subject: [PATCH 31/88] remove unnecessary barrier + cleanup unnecessary lazy value Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 65 +++++++++---------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index b5ea8ca0e7d..8b200ef21ef 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -462,7 +462,6 @@ struct __parallel_reduce_then_scan_scan_submitter< std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; - oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __value; // propogate carry in from previous block oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; @@ -517,27 +516,27 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) ? 
__proposed_idx : __subgroups_before_my_group - 1; - __value.__setup(__tmp_ptr[__reduction_idx]); + auto __value = __tmp_ptr[__reduction_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, - /*__init_present=*/false>(__sub_group, __value.__v, __reduce_op, + /*__init_present=*/false>(__sub_group, __value, __reduce_op, __carry_last, __remaining_elements); } else { // multiple iterations // first 1 full - __value.__setup(__tmp_ptr[__num_sub_groups_local * __sub_group_local_id + __offset]); + auto __value = __tmp_ptr[__num_sub_groups_local * __sub_group_local_id + __offset]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( - __sub_group, __value.__v, __reduce_op, __carry_last); + __sub_group, __value, __reduce_op, __carry_last); // then some number of full iterations for (std::uint32_t __i = 1; __i < __pre_carry_iters - 1; __i++) { auto __reduction_idx = __i * __num_sub_groups_local * __sub_group_size + __num_sub_groups_local * __sub_group_local_id + __offset; - __value.__v = __tmp_ptr[__reduction_idx]; + __value = __tmp_ptr[__reduction_idx]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( - __sub_group, __value.__v, __reduce_op, __carry_last); + __sub_group, __value, __reduce_op, __carry_last); } // final partial iteration @@ -548,43 +547,39 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) ? __proposed_idx : __subgroups_before_my_group - 1; - __value.__v = __tmp_ptr[__reduction_idx]; + __value = __tmp_ptr[__reduction_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, - /*__init_present=*/true>(__sub_group, __value.__v, __reduce_op, + /*__init_present=*/true>(__sub_group, __value, __reduce_op, __carry_last, __remaining_elements); } - } - } - __dpl_sycl::__group_barrier(__ndi); + // steps 3/4) load global carry in from neighbor work-group + // and apply to local sub-group prefix carries + auto __carry_offset = 0; - // steps 3/4) load global carry in from neighbor work-group - // and apply to local sub-group prefix carries - if ((__sub_group_id == 0) && (__g > 0)) - { - auto __carry_offset = 0; + std::uint8_t __iters = + oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); - std::uint8_t __iters = - oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); + std::uint8_t __i = 0; + for (; __i < __iters - 1; ++__i) + { + __sub_group_partials[__carry_offset + __sub_group_local_id] = + __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __carry_offset += __sub_group_size; + } + if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) + { + __sub_group_partials[__carry_offset + __sub_group_local_id] = + __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __carry_offset += __sub_group_size; + } + if (__sub_group_local_id == 0) + __sub_group_partials[__active_subgroups] = __carry_last.__v; + __carry_last.__destroy(); - std::uint8_t __i = 0; - for (; __i < __iters - 1; ++__i) - { - __sub_group_partials[__carry_offset + __sub_group_local_id] = - __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); - __carry_offset += __sub_group_size; - } - if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) - { - __sub_group_partials[__carry_offset + __sub_group_local_id] = - __reduce_op(__carry_last.__v, 
__sub_group_partials[__carry_offset + __sub_group_local_id]); - __carry_offset += __sub_group_size; } - if (__sub_group_local_id == 0) - __sub_group_partials[__active_subgroups] = __carry_last.__v; - __carry_last.__destroy(); + } - __value.__destroy(); __dpl_sycl::__group_barrier(__ndi); From 060f649b4dfb8e5116feeaca49a12cd1554f42d4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 16:53:55 -0400 Subject: [PATCH 32/88] clang format Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 55 +++++++++---------- .../dpcpp/parallel_backend_sycl_utils.h | 4 +- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8b200ef21ef..8d91d8279ab 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -266,13 +266,11 @@ template class __reduce_then_scan_scan_kernel; template + typename _GenReduceInput, typename _ReduceOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_reduce_submitter; template + typename _GenReduceInput, typename _ReduceOp, typename _InitType, typename... _KernelName> struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> @@ -345,8 +343,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu : (__active_subgroups - 1); // else is unused dummy value auto __v = __sub_group_partials[__load_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( - __sub_group, __v, __reduce_op, __sub_group_carry, - __active_subgroups); + __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups); if (__sub_group_local_id < __active_subgroups) __temp_ptr[__start_idx + __sub_group_local_id] = __v; } @@ -400,26 +397,28 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu }; template + typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, typename _ScanInputTransform, + typename _WriteOp, typename _InitType, typename _KernelName> struct __parallel_reduce_then_scan_scan_submitter; template + typename _GenReduceInput, typename _ReduceOp, typename _GenScanInput, typename _ScanInputTransform, + typename _WriteOp, typename _InitType, typename... 
_KernelName> struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, - _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> + __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, + _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { template - auto __get_block_carry_in(const std::size_t __block_num, _TmpPtr __tmp_ptr) const + auto + __get_block_carry_in(const std::size_t __block_num, _TmpPtr __tmp_ptr) const { return __tmp_ptr[__num_sub_groups_global + (__block_num % 2)]; } template - void __set_block_carry_out(const std::size_t __block_num, _TmpPtr __tmp_ptr, const _ValueType __block_carry_out) const + void + __set_block_carry_out(const std::size_t __block_num, _TmpPtr __tmp_ptr, const _ValueType __block_carry_out) const { __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } @@ -563,22 +562,20 @@ struct __parallel_reduce_then_scan_scan_submitter< std::uint8_t __i = 0; for (; __i < __iters - 1; ++__i) { - __sub_group_partials[__carry_offset + __sub_group_local_id] = - __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __sub_group_partials[__carry_offset + __sub_group_local_id] = __reduce_op( + __carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); __carry_offset += __sub_group_size; } if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) { - __sub_group_partials[__carry_offset + __sub_group_local_id] = - __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __sub_group_partials[__carry_offset + __sub_group_local_id] = __reduce_op( + __carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); __carry_offset += __sub_group_size; } if (__sub_group_local_id == 0) __sub_group_partials[__active_subgroups] = __carry_last.__v; __carry_last.__destroy(); - } - } __dpl_sycl::__group_barrier(__ndi); @@ -627,8 +624,8 @@ struct __parallel_reduce_then_scan_scan_submitter< } else if (__g > 0) { - __sub_group_carry.__setup( - __reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __sub_group_partials[__active_subgroups])); + __sub_group_carry.__setup(__reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), + __sub_group_partials[__active_subgroups])); } else { @@ -736,7 +733,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ constexpr bool __inclusive = _Inclusive::value; // TODO: This min call is temporary until PR #1683 is merged. - const std::size_t __work_group_size = std::min(std::size_t(8192), oneapi::dpl::__internal::__max_work_group_size(__exec)); + const std::size_t __work_group_size = + std::min(std::size_t(8192), oneapi::dpl::__internal::__max_work_group_size(__exec)); // TODO: Investigate potentially basing this on some scale of the number of compute units. 128 work-groups has been // found to be reasonable number for most devices. @@ -763,18 +761,17 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing // the block carry-out within a single kernel. 
- __result_and_scratch_storage, _ValueType> __result_and_scratch{__exec, - __num_sub_groups_global + 2}; + __result_and_scratch_storage, _ValueType> __result_and_scratch{ + __exec, __num_sub_groups_global + 2}; // Reduce and scan step implementations using _ReduceSubmitter = __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _GenReduceInput, _ReduceOp, _InitType, - _ReduceKernel>; + _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _GenReduceInput, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, _InitType, _ScanKernel>; + _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, + _WriteOp, _InitType, _ScanKernel>; _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 340b5fda68b..f7bc8f7a387 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -509,6 +509,7 @@ template struct __result_and_scratch_storage { using __value_type = _T; + private: using __sycl_buffer_t = sycl::buffer<_T, 1>; @@ -820,8 +821,7 @@ bool __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size) { const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); - return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != - __subgroup_sizes.end(); + return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != __subgroup_sizes.end(); } } // namespace __par_backend_hetero From 0beebd14038a558fcd5eceb4a2152d7adef145e7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 5 Aug 2024 16:55:00 -0400 Subject: [PATCH 33/88] codespell Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8d91d8279ab..06dcc9f21cf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -462,7 +462,7 @@ struct __parallel_reduce_then_scan_scan_submitter< oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; - // propogate carry in from previous block + // propagate carry in from previous block oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; // on the first sub-group in a work-group (assuming S subgroups in a work-group): @@ -705,8 +705,8 @@ __prefer_reduce_then_scan(const _ExecutionPolicy& __exec) // used in the reduction operation (to calculate the global carries) // _GenScanInput - a function which accepts the input range and index to generate the data needed by the final scan // and write operations, for scan patterns -// _ScanInputTransform - a unary function applied to the ouput of `_GenScanInput` to extract the component used in the scan, but -// not the part only required for the final write operation +// 
_ScanInputTransform - a unary function applied to the output of `_GenScanInput` to extract the component used in the +// scan, but not the part only required for the final write operation // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation From 90e6e62828407a090fbe11f92ca2d44cc99bd0bc Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 6 Aug 2024 09:01:34 -0400 Subject: [PATCH 34/88] restoring whitespace only changes --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 765a16c1422..9dd7d5c978a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -879,6 +879,7 @@ struct __invoke_single_group_copy_if constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); const bool __is_full_group = __n == __wg_size; + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _InitType = unseq_backend::__no_init_value<::std::uint16_t>; using _ReduceOp = ::std::plus<::std::uint16_t>; @@ -923,10 +924,12 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag using _MaskAssigner = unseq_backend::__mask_assigner<1>; using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; using _InitType = unseq_backend::__no_init_value<_Size>; + _Assigner __assign_op; _ReduceOp __reduce_op; _DataAcc __get_data_op; _MaskAssigner __add_mask_op; + // temporary buffer to store boolean mask oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n); @@ -958,9 +961,11 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<::std::make_unsigned_t<_Size>>(__n)); + // Pessimistically only use half of the memory to take into account memory used by compiled kernel const ::std::size_t __max_slm_size = __exec.queue().get_device().template get_info() / 2; + // The kernel stores n integers for the predicate and another n integers for the offsets const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2; From ef5d3778d5cace608ade3062f5f99153cb996765 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 6 Aug 2024 10:15:34 -0400 Subject: [PATCH 35/88] removing unnecessary using Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f7bc8f7a387..39faf4b6750 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -508,8 +508,6 @@ struct __usm_or_buffer_accessor template struct __result_and_scratch_storage { - using __value_type = _T; - private: using __sycl_buffer_t = sycl::buffer<_T, 1>; From 
bca00021cf38df538b9a681d331060f4ad662f41 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 6 Aug 2024 13:25:27 -0400 Subject: [PATCH 36/88] reverting formatting only changes Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9dd7d5c978a..f82bbc1b523 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -512,8 +512,8 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem } } - __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, __lacc_ptr, - __bin_op, __init); + __scan_work_group<_ValueType, _Inclusive>(__group, __lacc_ptr, __lacc_ptr + __n, + __lacc_ptr, __bin_op, __init); if constexpr (__can_use_subgroup_load_store) { @@ -617,7 +617,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W __scan_work_group<_ValueType, /* _Inclusive */ false>( __group, __lacc_ptr, __lacc_ptr + __elems_per_wg, __lacc_ptr + __elems_per_wg, __bin_op, - __init); + __init); for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize) { From 68c75e59500fe43f951f589b5066f05963b3325c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 6 Aug 2024 14:08:25 -0400 Subject: [PATCH 37/88] remove max and TODO Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 06dcc9f21cf..f36ba38aa85 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -732,9 +732,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; - // TODO: This min call is temporary until PR #1683 is merged. - const std::size_t __work_group_size = - std::min(std::size_t(8192), oneapi::dpl::__internal::__max_work_group_size(__exec)); + const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); // TODO: Investigate potentially basing this on some scale of the number of compute units. 128 work-groups has been // found to be reasonable number for most devices. 
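To make the sizing above concrete, here is a small host-side sketch of how these quantities relate. The sub-group size of 32 and the work-group size of 1024 are hypothetical device values chosen only for illustration (not queried from a real device); the 128 work-groups and 128 inputs per item mirror the constants used above, and only the relationships between the quantities follow the code in this series.

// Back-of-the-envelope sketch of the reduce_then_scan launch geometry.
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t sub_group_size = 32;       // assumed device sub-group size
    const std::size_t work_group_size = 1024;    // assumed device max work-group size
    const std::size_t num_work_groups = 128;     // fixed constant, as in the code above
    const std::size_t max_inputs_per_item = 128; // empirically chosen maximum, as above

    const std::size_t num_sub_groups_local = work_group_size / sub_group_size;        // 32
    const std::size_t num_sub_groups_global = num_sub_groups_local * num_work_groups; // 4096
    const std::size_t max_inputs_per_block =
        work_group_size * max_inputs_per_item * num_work_groups; // 16,777,216 elements

    // Inputs larger than max_inputs_per_block are processed block by block,
    // each block running one reduce pass and one scan pass.
    std::printf("sub-groups per work-group: %zu, sub-groups total: %zu, inputs per block: %zu\n",
                num_sub_groups_local, num_sub_groups_global, max_inputs_per_block);
    return 0;
}
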
From dddb0501ed8c6ada3bc21df117c7e56d27181c4e Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 6 Aug 2024 16:49:15 -0400 Subject: [PATCH 38/88] remove extra braces, add comments --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 48 +++++++++---------- .../parallel_backend_sycl_reduce_then_scan.h | 7 ++- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f82bbc1b523..c1715cd028e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -840,30 +840,30 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _WriteOp{}, __init, _Inclusive{}); } } - { - using _Assigner = unseq_backend::__scan_assigner; - using _NoAssign = unseq_backend::__scan_no_assign; - using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; - using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; - - _Assigner __assign_op; - _NoAssign __no_assign_op; - _NoOpFunctor __get_data_op; - - return __parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, - __assign_op, __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); - } + //else use legacy scan implementation + + using _Assigner = unseq_backend::__scan_assigner; + using _NoAssign = unseq_backend::__scan_no_assign; + using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; + using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; + + _Assigner __assign_op; + _NoAssign __no_assign_op; + _NoOpFunctor __get_data_op; + + return __parallel_transform_scan_base( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, + _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, + __assign_op, __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f36ba38aa85..f02c22c660f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -162,6 +162,7 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp bool __is_full_thread = __subgroup_start_idx + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread 
&& __is_full_block) { + // For full block and full thread, we can unroll the loop auto __v = __gen_input(__in_rng, __start_idx); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); @@ -184,6 +185,9 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp } else if (__is_full_thread) { + // For full thread but not full block, we can't unroll the loop, but we + // can proceed without special casing for partial subgroups. + auto __v = __gen_input(__in_rng, __start_idx); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); @@ -204,6 +208,7 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp } else { + // For partial thread, we need to handle the partial subgroup at the end of the range if (__sub_group_id < __active_subgroups) { auto __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_idx, __sub_group_size); @@ -552,7 +557,7 @@ struct __parallel_reduce_then_scan_scan_submitter< __carry_last, __remaining_elements); } - // steps 3/4) load global carry in from neighbor work-group + // steps 3+4) load global carry in from neighbor work-group // and apply to local sub-group prefix carries auto __carry_offset = 0; From dc2de2602aad83515eb391870a0bb337eb4fb64c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 6 Aug 2024 16:58:12 -0400 Subject: [PATCH 39/88] removing formatting only changes Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c1715cd028e..702ecc3f654 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -840,8 +840,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _WriteOp{}, __init, _Inclusive{}); } } - //else use legacy scan implementation + //else use legacy scan implementation using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; @@ -852,18 +852,18 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoOpFunctor __get_data_op; return __parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, - __assign_op, __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); + __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), + ::std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, + _NoOpFunctor, _InitType>{__binary_op, 
_UnaryFunctor{__unary_op}, __assign_op, + __assign_op, __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } template From 165b1a5010694291582a49f2c49d2cc9d60c6867 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 7 Aug 2024 10:16:17 -0400 Subject: [PATCH 40/88] removing unnecessary decay Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f02c22c660f..6f4c644c66f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -764,7 +764,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing // the block carry-out within a single kernel. - __result_and_scratch_storage, _ValueType> __result_and_scratch{ + __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{ __exec, __num_sub_groups_global + 2}; // Reduce and scan step implementations From b9f0f4ef2d24ead52622ca4bc3791dc20d12f16b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 7 Aug 2024 15:34:27 -0400 Subject: [PATCH 41/88] removing unused forwarding references Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 702ecc3f654..c06c4e90292 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -771,7 +771,7 @@ struct __gen_transform_input { template auto - operator()(InRng&& __in_rng, std::size_t __idx) const + operator()(const InRng& __in_rng, std::size_t __idx) const { using _ValueType = oneapi::dpl::__internal::__value_t; using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t< @@ -785,7 +785,7 @@ struct __simple_write_to_idx { template void - operator()(_OutRng&& __out_rng, std::size_t __idx, const ValueType& __v) const + operator()(const _OutRng& __out_rng, std::size_t __idx, const ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
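For context on the two callbacks touched by the patch above, here is a minimal, host-only sketch of the gen-input / write-op pattern: a generator functor reads and transforms an element at a given index, and a writer functor stores a result at a given index. The names used here (gen_transform_input_sketch, simple_write_to_idx_sketch) and the serial loop are illustrative assumptions, not the actual oneDPL device code; std::span stands in for the SYCL range/view types whose shallow const-ness is what makes plain const-reference parameters sufficient in place of forwarding references.

#include <cstddef>
#include <iostream>
#include <span>
#include <vector>

// Illustrative stand-ins for the gen-input / write-op callbacks (hypothetical names).
// The ranges are taken by const reference because the callbacks only index into them;
// the output is a view type (std::span) whose const-ness does not propagate to its
// elements, mirroring why a forwarding reference is unnecessary here.
template <typename UnaryOp>
struct gen_transform_input_sketch
{
    UnaryOp unary_op;

    template <typename InRng>
    auto
    operator()(const InRng& in_rng, std::size_t idx) const
    {
        // Apply the element-wise transform lazily, at the moment the value is consumed.
        return unary_op(in_rng[idx]);
    }
};

struct simple_write_to_idx_sketch
{
    template <typename OutRng, typename ValueType>
    void
    operator()(const OutRng& out_rng, std::size_t idx, const ValueType& v) const
    {
        out_rng[idx] = v;
    }
};

int main()
{
    std::vector<int> in{1, 2, 3, 4};
    std::vector<int> out(in.size());

    auto doubler = [](int x) { return 2 * x; };
    gen_transform_input_sketch<decltype(doubler)> gen{doubler};
    simple_write_to_idx_sketch write;

    // A serial inclusive scan expressed through the same two hooks, just to show how
    // the pieces compose: generate/transform on read, accumulate, write by index.
    int carry = 0;
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        carry += gen(in, i);
        write(std::span<int>{out}, i, carry);
    }

    for (int v : out)
        std::cout << v << ' '; // prints: 2 6 12 20
    std::cout << '\n';
    return 0;
}

In the device kernels the same two hooks are invoked per work-item with the running value carried in __sub_group_carry rather than a serial accumulator, but the division of labor between generating an input and writing an output is the same.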
From bd144a480022e52bb144df532b5f7c252ba99510 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 8 Aug 2024 11:59:41 -0400 Subject: [PATCH 42/88] clang-formatting Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 24 +++++++++---------- .../parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c06c4e90292..7711d5db54a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -852,18 +852,18 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoOpFunctor __get_data_op; return __parallel_transform_scan_base( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), - ::std::forward<_Range2>(__out_rng), __binary_op, __init, - // local scan - unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, - _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, - __assign_op, __get_data_op}, - // scan between groups - unseq_backend::__scan>{ - __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, - // global scan - unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); + __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), + ::std::forward<_Range2>(__out_rng), __binary_op, __init, + // local scan + unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, + _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, + __get_data_op}, + // scan between groups + unseq_backend::__scan>{ + __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, + // global scan + unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 6f4c644c66f..0c20ba18563 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -764,8 +764,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing // the block carry-out within a single kernel. 
- __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{ - __exec, __num_sub_groups_global + 2}; + __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, + __num_sub_groups_global + 2}; // Reduce and scan step implementations using _ReduceSubmitter = From d80905122ce98fa1ad596b7fdd0ee0b7cbd4e8b4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 14 Aug 2024 15:22:17 -0400 Subject: [PATCH 43/88] adding comment and different threshold for different implementations Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7711d5db54a..63359b47890 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -754,10 +754,9 @@ __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _E template bool -__group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::size_t __n_uniform) +__group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_t __n_uniform, + std::size_t __single_group_upper_limit) { - constexpr int __single_group_upper_limit = 2048; - // Pessimistically only use half of the memory to take into account memory used by compiled kernel const ::std::size_t __max_slm_size = __queue.get_device().template get_info() / 2; @@ -813,20 +812,23 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen auto __n_uniform = __n; if ((__n_uniform & (__n_uniform - 1)) != 0) __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1; + bool __pref_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec); // TODO: can we reimplement this with support for non-identities as well? We can then use in reduce-then-scan // for the last block if it is sufficiently small constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; if constexpr (__can_use_group_scan) { - if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform)) + // Empirically found values for reduce-then-scan and legacy scan implemetation for single wg cutoff + std::size_t __single_group_upper_limit = __pref_reduce_then_scan ? 
2048 : 16384; + if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) { return __parallel_transform_scan_single_group( __backend_tag, std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - if (oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec)) + if (__pref_reduce_then_scan) { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; using _ScanInputTransform = oneapi::dpl::__internal::__no_op; From 16477223f9f28e9fd5f5042c0aa38532dbbdd412 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 14 Aug 2024 15:30:34 -0400 Subject: [PATCH 44/88] checking is_gpu rather than !is_cpu Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 0c20ba18563..70fd53231fe 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -702,7 +702,7 @@ bool __prefer_reduce_then_scan(const _ExecutionPolicy& __exec) { const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); - return (!__exec.queue().get_device().is_cpu() && __dev_has_sg32); + return (__exec.queue().get_device().is_gpu() && __dev_has_sg32); } // General scan-like algorithm helpers From 0271b40206761d36951070d3a72a351cfda67e58 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 14 Aug 2024 15:33:27 -0400 Subject: [PATCH 45/88] use dpl_bit_ceil Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 63359b47890..3caf294eeb4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -809,9 +809,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen if constexpr (std::is_trivially_copyable_v<_Type>) { // Next power of 2 greater than or equal to __n - auto __n_uniform = __n; - if ((__n_uniform & (__n_uniform - 1)) != 0) - __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1; + auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); bool __pref_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec); // TODO: can we reimplement this with support for non-identities as well? 
We can then use in reduce-then-scan From 6cfc9790028685207e1323aee7dfbf9621fb69c1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 14 Aug 2024 15:35:39 -0400 Subject: [PATCH 46/88] removing bad formatting only changes (::std::) Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3caf294eeb4..dca11b30a0b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -852,15 +852,15 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoOpFunctor __get_data_op; return __parallel_transform_scan_base( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), - ::std::forward<_Range2>(__out_rng), __binary_op, __init, + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __binary_op, __init, // local scan unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, __get_data_op}, // scan between groups - unseq_backend::__scan>{ + unseq_backend::__scan>{ __binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op}, // global scan unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); From cc03af168277c69b760c19062eefb5ec39e54f6f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 08:20:15 -0400 Subject: [PATCH 47/88] fixing result_and_scratch_storage creation Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 70fd53231fe..ff56fb9aab3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -764,7 +764,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing // the block carry-out within a single kernel. 
- __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, + __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, 1, __num_sub_groups_global + 2}; // Reduce and scan step implementations From 98de25dfbaf21871fe6edbbb8cc846990240f788 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 08:21:52 -0400 Subject: [PATCH 48/88] spelling Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index dca11b30a0b..7491891d19a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -817,7 +817,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; if constexpr (__can_use_group_scan) { - // Empirically found values for reduce-then-scan and legacy scan implemetation for single wg cutoff + // Empirically found values for reduce-then-scan and legacy scan implementation for single wg cutoff std::size_t __single_group_upper_limit = __pref_reduce_then_scan ? 2048 : 16384; if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) { From 59933c1305ea1bda5d17df1c72c8882345fc9bd5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 12:44:01 -0400 Subject: [PATCH 49/88] fixing single pass scan KT from change to single-wg check Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 94d0474f402..13848b4f859 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -332,7 +332,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform, 16384)) { return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( oneapi::dpl::__internal::__device_backend_tag{}, From 94e6e977f9194bf9ce38ecde5aa1db928a05ba5d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 12:44:16 -0400 Subject: [PATCH 50/88] clarifying comment language Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7491891d19a..8b983655dc0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -804,20 +804,20 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen { using _Type = typename _InitType::__value_type; // Reduce-then-scan is dependent on 
sycl::shift_group_right which requires the underlying type to be trivially - // copyable. If this is not met, then we must fallback to the legacy implementation. The single work-group implementation - // requires a fundamental type which must also be trivially copyable. + // copyable. If this is not met, then we must fallback to the multi pass scan implementation. The single + // work-group implementation requires a fundamental type which must also be trivially copyable. if constexpr (std::is_trivially_copyable_v<_Type>) { // Next power of 2 greater than or equal to __n auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); bool __pref_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec); - // TODO: can we reimplement this with support for non-identities as well? We can then use in reduce-then-scan - // for the last block if it is sufficiently small + // TODO: Consider re-implementing single group scan to support types without known identities. This could also + // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small. constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; if constexpr (__can_use_group_scan) { - // Empirically found values for reduce-then-scan and legacy scan implementation for single wg cutoff + // Empirically found values for reduce-then-scan and multi pass scan implementation for single wg cutoff std::size_t __single_group_upper_limit = __pref_reduce_then_scan ? 2048 : 16384; if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) { @@ -841,7 +841,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen } } - //else use legacy scan implementation + //else use multi pass scan implementation using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>; From ddaad557f8cec2cbc236a99a76a81a6c01aa81d1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:00:29 -0400 Subject: [PATCH 51/88] refactor subgroup scan to reduce redundant code Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index ff56fb9aab3..d5b9699d3bb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -105,14 +105,13 @@ __inclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_f //return by reference __value and __init_and_carry } -template +template void -__sub_group_scan(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __binary_op, - _LazyValueType& __init_and_carry) +__sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, + _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) { - auto __mask_fn = [](auto __sub_group_local_id, auto __offset) { return __sub_group_local_id >= __offset; }; - constexpr auto __init_broadcast_id = __sub_group_size - 1; if constexpr (__is_inclusive) { __inclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, 
__mask_fn, __init_broadcast_id, @@ -125,6 +124,18 @@ __sub_group_scan(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __ } } +template +void +__sub_group_scan(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __binary_op, + _LazyValueType& __init_and_carry) +{ + auto __mask_fn = [](auto __sub_group_local_id, auto __offset) { return __sub_group_local_id >= __offset; }; + constexpr auto __init_broadcast_id = __sub_group_size - 1; + __sub_group_masked_scan<__sub_group_size, __is_inclusive, __init_present>( + __sub_group, __mask_fn, __init_broadcast_id, __value, __binary_op, __init_and_carry); +} + template void @@ -135,16 +146,8 @@ __sub_group_scan_partial(const _SubGroup& __sub_group, _ValueType& __value, _Bin return __sub_group_local_id >= __offset && __sub_group_local_id < __elements_to_process; }; auto __init_broadcast_id = __elements_to_process - 1; - if constexpr (__is_inclusive) - { - __inclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, __mask_fn, __init_broadcast_id, - __value, __binary_op, __init_and_carry); - } - else - { - __exclusive_sub_group_masked_scan<__sub_group_size, __init_present>(__sub_group, __mask_fn, __init_broadcast_id, - __value, __binary_op, __init_and_carry); - } + __sub_group_masked_scan<__sub_group_size, __is_inclusive, __init_present>( + __sub_group, __mask_fn, __init_broadcast_id, __value, __binary_op, __init_and_carry); } template Date: Thu, 15 Aug 2024 13:07:30 -0400 Subject: [PATCH 52/88] refactoring full block / full thread logic to remove redundancy Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d5b9699d3bb..f8f6e09de0b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -163,9 +163,8 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp { bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_idx + __iters_per_item * __sub_group_size <= __n; - if (__is_full_thread && __is_full_block) + if (__is_full_thread) { - // For full block and full thread, we can unroll the loop auto __v = __gen_input(__in_rng, __start_idx); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); @@ -174,38 +173,34 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp __write_op(__out_rng, __start_idx, __v); } - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) + if (__is_full_block) { - __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); - __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( - __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); - if constexpr (__capture_output) + // For full block and full thread, we can unroll the loop + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); + __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); + __sub_group_scan<__sub_group_size, __is_inclusive, 
/*__init_present=*/true>( + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); + if constexpr (__capture_output) + { + __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); + } } } - } - else if (__is_full_thread) - { - // For full thread but not full block, we can't unroll the loop, but we - // can proceed without special casing for partial subgroups. - - auto __v = __gen_input(__in_rng, __start_idx); - __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), - __binary_op, __sub_group_carry); - if constexpr (__capture_output) - { - __write_op(__out_rng, __start_idx, __v); - } - for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) + else { - __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); - __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( - __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); - if constexpr (__capture_output) + // For full thread but not full block, we can't unroll the loop, but we + // can proceed without special casing for partial subgroups. + for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); + __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); + __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( + __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); + if constexpr (__capture_output) + { + __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); + } } } } From a5753d062599467e11c8f264bdffe3b85f52282a Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:10:13 -0400 Subject: [PATCH 53/88] passing storage container by ref Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f8f6e09de0b..fe13cd97802 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -283,7 +283,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu template auto operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, - _TmpStorageAcc __scratch_container, const sycl::event& __prior_event, + _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, const std::size_t __block_num) const { @@ -429,7 +429,7 @@ struct __parallel_reduce_then_scan_scan_submitter< template auto operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, - _TmpStorageAcc __scratch_container, const sycl::event& __prior_event, + _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, const std::size_t __block_num) const { From 761ec51042da1ae377e2dff620a892a9755ae36b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:13:34 -0400 Subject: [PATCH 54/88] __g -> __group_id Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 20 +++++++++---------- 1 file changed, 10 
insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index fe13cd97802..13e661570c2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -297,14 +297,14 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( __sub_group_size)]] { auto __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); - auto __g = __ndi.get_group(0); + auto __group_id = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); auto __sub_group_id = __sub_group.get_group_linear_id(); auto __sub_group_local_id = __sub_group.get_local_linear_id(); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_idx = - (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -336,7 +336,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu // to compute a prefix sum on global carries if (__sub_group_id == 0) { - __start_idx = (__g * __num_sub_groups_local); + __start_idx = (__group_id * __num_sub_groups_local); std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); if (__iters == 1) @@ -451,13 +451,13 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __res_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); auto __lid = __ndi.get_local_id(0); - auto __g = __ndi.get_group(0); + auto __group_id = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); auto __sub_group_id = __sub_group.get_group_linear_id(); auto __sub_group_local_id = __sub_group.get_local_linear_id(); auto __group_start_idx = - (__block_num * __max_block_size) + (__g * __inputs_per_sub_group * __num_sub_groups_local); + (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __elements_in_group = std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); @@ -486,7 +486,7 @@ struct __parallel_reduce_then_scan_scan_submitter< // S: sum(T0 carry...TS carry) std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); - auto __subgroups_before_my_group = __g * __num_sub_groups_local; + auto __subgroups_before_my_group = __group_id * __num_sub_groups_local; std::uint8_t __i = 0; for (; __i < __iters - 1; __i++) { @@ -504,7 +504,7 @@ struct __parallel_reduce_then_scan_scan_submitter< // memory accesses: gather(63, 127, 191, 255, ...) std::uint32_t __offset = __num_sub_groups_local - 1; // only need 32 carries for WGs0..WG32, 64 for WGs32..WGs64, etc. 
- if (__g > 0) + if (__group_id > 0) { // only need the last element from each scan of num_sub_groups_local subgroup reductions const auto __elements_to_process = __subgroups_before_my_group / __num_sub_groups_local; @@ -595,7 +595,7 @@ struct __parallel_reduce_then_scan_scan_submitter< oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } - else if (__g > 0) + else if (__group_id > 0) { auto __value = __sub_group_partials[__active_subgroups]; oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); @@ -625,7 +625,7 @@ struct __parallel_reduce_then_scan_scan_submitter< : __sub_group_partials[__active_subgroups - 1]; __sub_group_carry.__setup(__reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __value)); } - else if (__g > 0) + else if (__group_id > 0) { __sub_group_carry.__setup(__reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __sub_group_partials[__active_subgroups])); @@ -660,7 +660,7 @@ struct __parallel_reduce_then_scan_scan_submitter< } //If within the last active group and subgroup of the block, use the 0th work item of the subgroup // to write out the last carry out for either the return value or the next block - if (__sub_group_local_id == 0 && (__active_groups == __g + 1) && + if (__sub_group_local_id == 0 && (__active_groups == __group_id + 1) && (__active_subgroups == __sub_group_id + 1)) { if (__block_num + 1 == __num_blocks) From 4d8c92d817386d20ff938a455af6eb51676a1192 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:14:26 -0400 Subject: [PATCH 55/88] __group_start_idx -> __group_start_id Signed-off-by: Dan Hoeflinger --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 13e661570c2..c86a7450f28 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -303,14 +303,14 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu auto __sub_group_local_id = __sub_group.get_local_linear_id(); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; - std::size_t __group_start_idx = + std::size_t __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __elements_in_group = - std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); + std::min(__n - __group_start_id, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); - std::size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); + std::size_t __subgroup_start_idx = __group_start_id + (__sub_group_id * __inputs_per_sub_group); std::size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; @@ -456,11 +456,11 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __sub_group_id = __sub_group.get_group_linear_id(); auto __sub_group_local_id = __sub_group.get_local_linear_id(); - auto __group_start_idx = + auto __group_start_id = (__block_num * __max_block_size) + (__group_id * 
__inputs_per_sub_group * __num_sub_groups_local); std::size_t __elements_in_group = - std::min(__n - __group_start_idx, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); + std::min(__n - __group_start_id, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; @@ -637,7 +637,7 @@ struct __parallel_reduce_then_scan_scan_submitter< } // step 5) apply global carries - std::size_t __subgroup_start_idx = __group_start_idx + (__sub_group_id * __inputs_per_sub_group); + std::size_t __subgroup_start_idx = __group_start_id + (__sub_group_id * __inputs_per_sub_group); std::size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; if (__sub_group_carry_initialized) From 55db83e0d7065579bc6ef4c49b96cc5b22305d08 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:21:26 -0400 Subject: [PATCH 56/88] minor variable naming and helpers Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 72 ++++++++++--------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index c86a7450f28..1e581fb60f2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -157,20 +157,20 @@ void __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, _LazyValueType& __sub_group_carry, _InRng __in_rng, _OutRng __out_rng, - std::size_t __start_idx, std::size_t __n, std::uint32_t __iters_per_item, - std::size_t __subgroup_start_idx, std::uint32_t __sub_group_id, + std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, + std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, std::uint32_t __active_subgroups) { bool __is_full_block = (__iters_per_item == __max_inputs_per_item); - bool __is_full_thread = __subgroup_start_idx + __iters_per_item * __sub_group_size <= __n; + bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - auto __v = __gen_input(__in_rng, __start_idx); + auto __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_idx, __v); + __write_op(__out_rng, __start_id, __v); } if (__is_full_block) @@ -179,12 +179,12 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -194,12 +194,12 @@ 
__scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp // can proceed without special casing for partial subgroups. for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__in_rng, __start_idx + __j * __sub_group_size); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_idx + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -209,34 +209,34 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp // For partial thread, we need to handle the partial subgroup at the end of the range if (__sub_group_id < __active_subgroups) { - auto __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_idx, __sub_group_size); + auto __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_id, __sub_group_size); if (__iters == 1) { - auto __local_idx = (__start_idx < __n) ? __start_idx : __n - 1; + auto __local_idx = (__start_id < __n) ? __start_id : __n - 1; auto __v = __gen_input(__in_rng, __local_idx); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, - __n - __subgroup_start_idx); + __n - __subgroup_start_id); if constexpr (__capture_output) { - if (__start_idx < __n) - __write_op(__out_rng, __start_idx, __v); + if (__start_id < __n) + __write_op(__out_rng, __start_id, __v); } } else { - auto __v = __gen_input(__in_rng, __start_idx); + auto __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_idx, __v); + __write_op(__out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { - auto __local_idx = __start_idx + __j * __sub_group_size; + auto __local_idx = __start_id + __j * __sub_group_size; __v = __gen_input(__in_rng, __local_idx); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); @@ -246,12 +246,12 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp } } - auto __offset = __start_idx + (__iters - 1) * __sub_group_size; + auto __offset = __start_id + (__iters - 1) * __sub_group_size; auto __local_idx = (__offset < __n) ? 
__offset : __n - 1; __v = __gen_input(__in_rng, __local_idx); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, - __n - (__subgroup_start_idx + (__iters - 1) * __sub_group_size)); + __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) @@ -306,13 +306,14 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu std::size_t __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); + std::size_t __max_elements_in_group = __inputs_per_sub_group * __num_sub_groups_local; std::size_t __elements_in_group = - std::min(__n - __group_start_id, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); + std::min(__n - __group_start_id, __max_elements_in_group); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); - std::size_t __subgroup_start_idx = __group_start_id + (__sub_group_id * __inputs_per_sub_group); + std::size_t __subgroup_start_id = __group_start_id + (__sub_group_id * __inputs_per_sub_group); - std::size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; + std::size_t __start_id = __subgroup_start_id + __sub_group_local_id; if (__sub_group_id < __active_subgroups) { @@ -322,7 +323,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __in_rng, nullptr, __start_idx, __n, __inputs_per_item, __subgroup_start_idx, + __sub_group_carry, __in_rng, nullptr, __start_id, __n, __inputs_per_item, __subgroup_start_id, __sub_group_id, __active_subgroups); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; @@ -336,7 +337,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu // to compute a prefix sum on global carries if (__sub_group_id == 0) { - __start_idx = (__group_id * __num_sub_groups_local); + __start_id = (__group_id * __num_sub_groups_local); std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); if (__iters == 1) @@ -348,7 +349,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups); if (__sub_group_local_id < __active_subgroups) - __temp_ptr[__start_idx + __sub_group_local_id] = __v; + __temp_ptr[__start_id + __sub_group_local_id] = __v; } else { @@ -356,14 +357,14 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu auto __v = __sub_group_partials[__sub_group_local_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry); - __temp_ptr[__start_idx + __sub_group_local_id] = __v; + __temp_ptr[__start_id + __sub_group_local_id] = __v; for (std::uint32_t __i = 1; __i < __iters - 1; __i++) { __v = __sub_group_partials[__i * __sub_group_size + __sub_group_local_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( __sub_group, __v, __reduce_op, __sub_group_carry); - 
__temp_ptr[__start_idx + __i * __sub_group_size + __sub_group_local_id] = __v; + __temp_ptr[__start_id + __i * __sub_group_size + __sub_group_local_id] = __v; } // If we are past the input range, then the previous value of v is passed to the sub-group scan. // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. @@ -378,7 +379,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups - ((__iters - 1) * __sub_group_size)); if (__proposed_idx < __num_sub_groups_local) - __temp_ptr[__start_idx + __proposed_idx] = __v; + __temp_ptr[__start_id + __proposed_idx] = __v; } __sub_group_carry.__destroy(); @@ -459,8 +460,9 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); + std::size_t __max_elements_in_group = __inputs_per_sub_group * __num_sub_groups_local; std::size_t __elements_in_group = - std::min(__n - __group_start_id, std::size_t(__num_sub_groups_local * __inputs_per_sub_group)); + std::min(__n - __group_start_id, __max_elements_in_group); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; @@ -637,8 +639,8 @@ struct __parallel_reduce_then_scan_scan_submitter< } // step 5) apply global carries - std::size_t __subgroup_start_idx = __group_start_id + (__sub_group_id * __inputs_per_sub_group); - std::size_t __start_idx = __subgroup_start_idx + __sub_group_local_id; + std::size_t __subgroup_start_id = __group_start_id + (__sub_group_id * __inputs_per_sub_group); + std::size_t __start_id = __subgroup_start_id + __sub_group_local_id; if (__sub_group_carry_initialized) { @@ -646,8 +648,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, __inputs_per_item, - __subgroup_start_idx, __sub_group_id, __active_subgroups); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups); } else // first group first block, no subgroup carry { @@ -655,8 +657,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_idx, __n, __inputs_per_item, - __subgroup_start_idx, __sub_group_id, __active_subgroups); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups); } //If within the last active group and subgroup of the block, use the 0th work item of the subgroup // to write out the last carry out for either the return value or the next block From f3768bf197108a912f1a6056d997992082db09b5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:27:38 -0400 Subject: [PATCH 57/88] improving comments, removing unused variable Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 1e581fb60f2..fa6f97c729d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -318,7 +318,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu if (__sub_group_id < __active_subgroups) { // adjust for lane-id - // compute sub-group local pfix on T0..63, K samples/T, send to accumulator kernel + // compute sub-group local prefix on T0..63, K samples/T, send to accumulator kernel __scan_through_elements_helper<__sub_group_size, __is_inclusive, /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( @@ -451,7 +451,6 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); auto __res_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); - auto __lid = __ndi.get_local_id(0); auto __group_id = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); auto __sub_group_id = __sub_group.get_group_linear_id(); @@ -471,18 +470,18 @@ struct __parallel_reduce_then_scan_scan_submitter< oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; // on the first sub-group in a work-group (assuming S subgroups in a work-group): - // 1. load S sub-group local carry pfix sums (T0..TS-1) to slm + // 1. load S sub-group local carry prefix sums (T0..TS-1) to SLM // 2. load 32, 64, 96, etc. TS-1 work-group carry-outs (32 for WG num<32, 64 for WG num<64, etc.), // and then compute the prefix sum to generate global carry out // for each WG, i.e., prefix sum on TS-1 carries over all WG. // 3. on each WG select the adjacent neighboring WG carry in - // 4. on each WG add the global carry-in to S sub-group local pfix sums to + // 4. on each WG add the global carry-in to S sub-group local prefix sums to // get a T-local global carry in - // 5. recompute T-local pfix values, add the T-local global carries, + // 5. recompute T-local prefix values, add the T-local global carries, // and then write back the final values to memory if (__sub_group_id == 0) { - // step 1) load to Xe slm the WG-local S prefix sums + // step 1) load to Xe SLM the WG-local S prefix sums // on WG T-local carries // 0: T0 carry, 1: T0 + T1 carry, 2: T0 + T1 + T2 carry, ... 
// S: sum(T0 carry...TS carry) @@ -660,7 +659,7 @@ struct __parallel_reduce_then_scan_scan_submitter< __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, __sub_group_id, __active_subgroups); } - //If within the last active group and subgroup of the block, use the 0th work item of the subgroup + // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block if (__sub_group_local_id == 0 && (__active_groups == __group_id + 1) && (__active_subgroups == __sub_group_id + 1)) @@ -671,7 +670,7 @@ struct __parallel_reduce_then_scan_scan_submitter< } else { - //capture the last carry out for the next block + // capture the last carry out for the next block __set_block_carry_out(__block_num, __tmp_ptr, __sub_group_carry.__v); } } @@ -696,7 +695,7 @@ struct __parallel_reduce_then_scan_scan_submitter< }; // reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup -// operations. We do not want to run this can on CPU targets, as they are not performant with this algorithm. +// operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm. template bool __prefer_reduce_then_scan(const _ExecutionPolicy& __exec) From f1361d270dbb3242005b63a3c56d4d44729a07df Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:34:49 -0400 Subject: [PATCH 58/88] __prefer_reduce_then_scan -> __is_gpu_with_sg_32 Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8b983655dc0..6609ccfd475 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -810,7 +810,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen { // Next power of 2 greater than or equal to __n auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); - bool __pref_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__prefer_reduce_then_scan(__exec); + bool __pref_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); // TODO: Consider re-implementing single group scan to support types without known identities. This could also // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index fa6f97c729d..bef158512a5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -698,7 +698,7 @@ struct __parallel_reduce_then_scan_scan_submitter< // operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm. 
template bool -__prefer_reduce_then_scan(const _ExecutionPolicy& __exec) +__is_gpu_with_sg_32(const _ExecutionPolicy& __exec) { const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); return (__exec.queue().get_device().is_gpu() && __dev_has_sg32); From b67b9874950cc1e5838983af0e19e6d554fc0b76 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 13:58:35 -0400 Subject: [PATCH 59/88] comment for temporary storage Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index bef158512a5..d4f9946afc5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -439,6 +439,9 @@ struct __parallel_reduce_then_scan_scan_submitter< __elements_in_block, __inputs_per_sub_group * __num_sub_groups_local); using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { + // We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results: + // __num_sub_groups_local for each sub-group partial from the reduce kernel + + // 1 element for the accumulated block-local carry-in from previous groups in the block __dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh); __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng); From f3aec738e2362d219a468430a94d6c482a44e117 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:08:30 -0400 Subject: [PATCH 60/88] fold initial value into __carry_offset Signed-off-by: Dan Hoeflinger --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d4f9946afc5..6b21a463759 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -561,7 +561,7 @@ struct __parallel_reduce_then_scan_scan_submitter< // steps 3+4) load global carry in from neighbor work-group // and apply to local sub-group prefix carries - auto __carry_offset = 0; + auto __carry_offset = __sub_group_local_id; std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); @@ -569,14 +569,14 @@ struct __parallel_reduce_then_scan_scan_submitter< std::uint8_t __i = 0; for (; __i < __iters - 1; ++__i) { - __sub_group_partials[__carry_offset + __sub_group_local_id] = __reduce_op( - __carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __sub_group_partials[__carry_offset] = __reduce_op( + __carry_last.__v, __sub_group_partials[__carry_offset]); __carry_offset += __sub_group_size; } if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) { - __sub_group_partials[__carry_offset + __sub_group_local_id] = __reduce_op( - __carry_last.__v, __sub_group_partials[__carry_offset + __sub_group_local_id]); + __sub_group_partials[__carry_offset] = __reduce_op( + __carry_last.__v, 
__sub_group_partials[__carry_offset]); __carry_offset += __sub_group_size; } if (__sub_group_local_id == 0) From 15d09e201016abf351a60608d5a515aa0952dd56 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:16:31 -0400 Subject: [PATCH 61/88] running tally of __reduction_scan_id Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 6b21a463759..12f02534455 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -353,33 +353,35 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu } else { - //need to pull out first iteration tp avoid identity - auto __v = __sub_group_partials[__sub_group_local_id]; + std::uint32_t __reduction_scan_id = __sub_group_local_id; + // need to pull out first iteration tp avoid identity + auto __v = __sub_group_partials[__reduction_scan_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry); - __temp_ptr[__start_id + __sub_group_local_id] = __v; + __temp_ptr[__start_id + __reduction_scan_id] = __v; + __reduction_scan_id += __sub_group_size; for (std::uint32_t __i = 1; __i < __iters - 1; __i++) { - __v = __sub_group_partials[__i * __sub_group_size + __sub_group_local_id]; + __v = __sub_group_partials[__reduction_scan_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( __sub_group, __v, __reduce_op, __sub_group_carry); - __temp_ptr[__start_id + __i * __sub_group_size + __sub_group_local_id] = __v; + __temp_ptr[__start_id + __reduction_scan_id] = __v; + __reduction_scan_id += __sub_group_size; } // If we are past the input range, then the previous value of v is passed to the sub-group scan. // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. // else is an unused dummy value - auto __proposed_idx = (__iters - 1) * __sub_group_size + __sub_group_local_id; - auto __load_idx = - (__proposed_idx < __num_sub_groups_local) ? __proposed_idx : (__num_sub_groups_local - 1); + auto __load_id = + (__reduction_scan_id < __num_sub_groups_local) ? 
__reduction_scan_id : (__num_sub_groups_local - 1); __v = __sub_group_partials[__load_idx]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups - ((__iters - 1) * __sub_group_size)); - if (__proposed_idx < __num_sub_groups_local) - __temp_ptr[__start_id + __proposed_idx] = __v; + if (__reduction_scan_id < __num_sub_groups_local) + __temp_ptr[__start_id + __reduction_scan_id] = __v; } __sub_group_carry.__destroy(); From 6bbe469b4fc87a1449a752ddf6f024b4a25a569e Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:25:09 -0400 Subject: [PATCH 62/88] _idx -> _id Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 12f02534455..743ce20a274 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -213,8 +213,8 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp if (__iters == 1) { - auto __local_idx = (__start_id < __n) ? __start_id : __n - 1; - auto __v = __gen_input(__in_rng, __local_idx); + auto __local_id = (__start_id < __n) ? __start_id : __n - 1; + auto __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); @@ -236,19 +236,19 @@ __scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_inp for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { - auto __local_idx = __start_id + __j * __sub_group_size; - __v = __gen_input(__in_rng, __local_idx); + auto __local_id = __start_id + __j * __sub_group_size; + __v = __gen_input(__in_rng, __local_id); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __local_idx, __v); + __write_op(__out_rng, __local_id, __v); } } auto __offset = __start_id + (__iters - 1) * __sub_group_size; - auto __local_idx = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__in_rng, __local_idx); + auto __local_id = (__offset < __n) ? __offset : __n - 1; + __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); @@ -342,10 +342,10 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); if (__iters == 1) { - auto __load_idx = (__sub_group_local_id < __active_subgroups) + auto __load_id = (__sub_group_local_id < __active_subgroups) ? 
__sub_group_local_id : (__active_subgroups - 1); // else is unused dummy value - auto __v = __sub_group_partials[__load_idx]; + auto __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups); if (__sub_group_local_id < __active_subgroups) @@ -376,7 +376,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu auto __load_id = (__reduction_scan_id < __num_sub_groups_local) ? __reduction_scan_id : (__num_sub_groups_local - 1); - __v = __sub_group_partials[__load_idx]; + __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups - ((__iters - 1) * __sub_group_size)); @@ -519,12 +519,12 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__pre_carry_iters == 1) { // single partial scan - auto __proposed_idx = __num_sub_groups_local * __sub_group_local_id + __offset; + auto __proposed_id = __num_sub_groups_local * __sub_group_local_id + __offset; auto __remaining_elements = __elements_to_process; - auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) - ? __proposed_idx + auto __reduction_id = (__proposed_id < __subgroups_before_my_group) + ? __proposed_id : __subgroups_before_my_group - 1; - auto __value = __tmp_ptr[__reduction_idx]; + auto __value = __tmp_ptr[__reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>(__sub_group, __value, __reduce_op, __carry_last, __remaining_elements); @@ -540,22 +540,22 @@ struct __parallel_reduce_then_scan_scan_submitter< // then some number of full iterations for (std::uint32_t __i = 1; __i < __pre_carry_iters - 1; __i++) { - auto __reduction_idx = __i * __num_sub_groups_local * __sub_group_size + + auto __reduction_id = __i * __num_sub_groups_local * __sub_group_size + __num_sub_groups_local * __sub_group_local_id + __offset; - __value = __tmp_ptr[__reduction_idx]; + __value = __tmp_ptr[__reduction_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( __sub_group, __value, __reduce_op, __carry_last); } // final partial iteration - auto __proposed_idx = (__pre_carry_iters - 1) * __num_sub_groups_local * __sub_group_size + + auto __proposed_id = (__pre_carry_iters - 1) * __num_sub_groups_local * __sub_group_size + __num_sub_groups_local * __sub_group_local_id + __offset; auto __remaining_elements = __elements_to_process - ((__pre_carry_iters - 1) * __sub_group_size); - auto __reduction_idx = (__proposed_idx < __subgroups_before_my_group) - ? __proposed_idx + auto __reduction_id = (__proposed_id < __subgroups_before_my_group) + ? 
__proposed_id : __subgroups_before_my_group - 1; - __value = __tmp_ptr[__reduction_idx]; + __value = __tmp_ptr[__reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>(__sub_group, __value, __reduce_op, __carry_last, __remaining_elements); From a7d00dbf9f81bf6189a8b438b7ca59484458f3c2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:27:52 -0400 Subject: [PATCH 63/88] running tally of __load_reduction_id rather than recalculating Signed-off-by: Dan Hoeflinger --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 743ce20a274..76947a80ce2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -493,16 +493,18 @@ struct __parallel_reduce_then_scan_scan_submitter< std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); auto __subgroups_before_my_group = __group_id * __num_sub_groups_local; + std::uint32_t __load_reduction_id = __sub_group_local_id; std::uint8_t __i = 0; for (; __i < __iters - 1; __i++) { - __sub_group_partials[__i * __sub_group_size + __sub_group_local_id] = - __tmp_ptr[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; + __sub_group_partials[__load_reduction_id] = + __tmp_ptr[__subgroups_before_my_group + __load_reduction_id]; + __load_reduction_id += __sub_group_size; } - if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) + if (__load_reduction_id < __active_subgroups) { - __sub_group_partials[__i * __sub_group_size + __sub_group_local_id] = - __tmp_ptr[__subgroups_before_my_group + __i * __sub_group_size + __sub_group_local_id]; + __sub_group_partials[__load_reduction_id] = + __tmp_ptr[__subgroups_before_my_group + __load_reduction_id]; } // step 2) load 32, 64, 96, etc. 
work-group carry outs on every work-group; then From f54e298228fb9c2885d34eaf0d53086779c9fe36 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:44:07 -0400 Subject: [PATCH 64/88] running tally of __reduction_id rather than recalculating Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 76947a80ce2..eb8bdfdde2a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -535,29 +535,29 @@ struct __parallel_reduce_then_scan_scan_submitter< { // multiple iterations // first 1 full - auto __value = __tmp_ptr[__num_sub_groups_local * __sub_group_local_id + __offset]; + std::uint32_t __reduction_id = __num_sub_groups_local * __sub_group_local_id + __offset; + std::uint32_t __reduction_id_increment = __num_sub_groups_local * __sub_group_size; + auto __value = __tmp_ptr[__reduction_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __value, __reduce_op, __carry_last); - + __reduction_id += __reduction_id_increment; // then some number of full iterations for (std::uint32_t __i = 1; __i < __pre_carry_iters - 1; __i++) { - auto __reduction_id = __i * __num_sub_groups_local * __sub_group_size + - __num_sub_groups_local * __sub_group_local_id + __offset; __value = __tmp_ptr[__reduction_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( __sub_group, __value, __reduce_op, __carry_last); + __reduction_id += __reduction_id_increment; } // final partial iteration - auto __proposed_id = (__pre_carry_iters - 1) * __num_sub_groups_local * __sub_group_size + - __num_sub_groups_local * __sub_group_local_id + __offset; + auto __remaining_elements = __elements_to_process - ((__pre_carry_iters - 1) * __sub_group_size); - auto __reduction_id = (__proposed_id < __subgroups_before_my_group) - ? __proposed_id - : __subgroups_before_my_group - 1; - __value = __tmp_ptr[__reduction_id]; + auto __final_reduction_id = (__reduction_id < __subgroups_before_my_group) + ? __reduction_id + : __subgroups_before_my_group - 1; // dummy to avoid OOB + __value = __tmp_ptr[__final_reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>(__sub_group, __value, __reduce_op, __carry_last, __remaining_elements); From d11dd6ff394f2f1d209d6857dbe48d510bf7cede Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:44:26 -0400 Subject: [PATCH 65/88] comment improvement Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index eb8bdfdde2a..04ff075c3f0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -767,9 +767,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const auto __block_size = (__num_remaining < __max_inputs_per_block) ? 
__num_remaining : __max_inputs_per_block; const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); - //We need temporary storage for reductions of each sub-group (__num_sub_groups_global), and also 2 for the - // block carry-out. We need two for the block carry-out to prevent a race condition between reading and writing - // the block carry-out within a single kernel. + // We need temporary storage for reductions of each sub-group (__num_sub_groups_global). + // Additionally, we need two elements for the block carry-out to prevent a race condition + // between reading and writing the block carry-out within a single kernel. __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __result_and_scratch{__exec, 1, __num_sub_groups_global + 2}; From 1a29790b0eacf08a636c1298a41770ac36c3dec7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:44:45 -0400 Subject: [PATCH 66/88] refactor for readability Signed-off-by: Dan Hoeflinger --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 04ff075c3f0..d031e860dd4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -758,11 +758,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // reduce_then_scan kernel is not built to handle "empty". // These trivial end cases should be handled at a higher level. assert(__num_remaining > 0); - auto __inputs_per_sub_group = - __num_remaining >= __max_inputs_per_block - ? __max_inputs_per_block / __num_sub_groups_global - : std::max(__sub_group_size, + const std::uint32_t __max_inputs_per_subgroup = __max_inputs_per_block / __num_sub_groups_global; + std::uint32_t __evenly_divided_remaining_elements = std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + auto __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_elements; auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; const auto __block_size = (__num_remaining < __max_inputs_per_block) ? __num_remaining : __max_inputs_per_block; const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); @@ -823,11 +822,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // We only need to resize these parameters prior to the last block as it is the only non-full case. if (__b + 2 == __num_blocks) { + __evenly_divided_remaining_elements = std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block - ? __max_inputs_per_block / __num_sub_groups_global - : std::max(__sub_group_size, - oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + ? 
__max_inputs_per_subgroup : __evenly_divided_remaining_elements; __inputs_per_item = __inputs_per_sub_group / __sub_group_size; } } From e936e83bdfb6d1535a13c2d0d0e963ddff08fdc7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 14:53:44 -0400 Subject: [PATCH 67/88] formatting Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d031e860dd4..77613d221fa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -307,8 +307,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __max_elements_in_group = __inputs_per_sub_group * __num_sub_groups_local; - std::size_t __elements_in_group = - std::min(__n - __group_start_id, __max_elements_in_group); + std::size_t __elements_in_group = std::min(__n - __group_start_id, __max_elements_in_group); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); std::size_t __subgroup_start_id = __group_start_id + (__sub_group_id * __inputs_per_sub_group); @@ -343,8 +342,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu if (__iters == 1) { auto __load_id = (__sub_group_local_id < __active_subgroups) - ? __sub_group_local_id - : (__active_subgroups - 1); // else is unused dummy value + ? __sub_group_local_id + : (__active_subgroups - 1); // else is unused dummy value auto __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups); @@ -373,8 +372,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. // else is an unused dummy value - auto __load_id = - (__reduction_scan_id < __num_sub_groups_local) ? __reduction_scan_id : (__num_sub_groups_local - 1); + auto __load_id = (__reduction_scan_id < __num_sub_groups_local) ? 
__reduction_scan_id + : (__num_sub_groups_local - 1); __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( @@ -465,8 +464,7 @@ struct __parallel_reduce_then_scan_scan_submitter< (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __max_elements_in_group = __inputs_per_sub_group * __num_sub_groups_local; - std::size_t __elements_in_group = - std::min(__n - __group_start_id, __max_elements_in_group); + std::size_t __elements_in_group = std::min(__n - __group_start_id, __max_elements_in_group); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; @@ -524,8 +522,8 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __proposed_id = __num_sub_groups_local * __sub_group_local_id + __offset; auto __remaining_elements = __elements_to_process; auto __reduction_id = (__proposed_id < __subgroups_before_my_group) - ? __proposed_id - : __subgroups_before_my_group - 1; + ? __proposed_id + : __subgroups_before_my_group - 1; auto __value = __tmp_ptr[__reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>(__sub_group, __value, __reduce_op, @@ -555,8 +553,8 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __remaining_elements = __elements_to_process - ((__pre_carry_iters - 1) * __sub_group_size); auto __final_reduction_id = (__reduction_id < __subgroups_before_my_group) - ? __reduction_id - : __subgroups_before_my_group - 1; // dummy to avoid OOB + ? __reduction_id + : __subgroups_before_my_group - 1; // dummy to avoid OOB __value = __tmp_ptr[__final_reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>(__sub_group, __value, __reduce_op, @@ -573,14 +571,14 @@ struct __parallel_reduce_then_scan_scan_submitter< std::uint8_t __i = 0; for (; __i < __iters - 1; ++__i) { - __sub_group_partials[__carry_offset] = __reduce_op( - __carry_last.__v, __sub_group_partials[__carry_offset]); + __sub_group_partials[__carry_offset] = + __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset]); __carry_offset += __sub_group_size; } if (__i * __sub_group_size + __sub_group_local_id < __active_subgroups) { - __sub_group_partials[__carry_offset] = __reduce_op( - __carry_last.__v, __sub_group_partials[__carry_offset]); + __sub_group_partials[__carry_offset] = + __reduce_op(__carry_last.__v, __sub_group_partials[__carry_offset]); __carry_offset += __sub_group_size; } if (__sub_group_local_id == 0) @@ -654,8 +652,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } else // first group first block, no subgroup carry { @@ -663,8 +661,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, 
__start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -759,9 +757,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // These trivial end cases should be handled at a higher level. assert(__num_remaining > 0); const std::uint32_t __max_inputs_per_subgroup = __max_inputs_per_block / __num_sub_groups_global; - std::uint32_t __evenly_divided_remaining_elements = std::max(__sub_group_size, - oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); - auto __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_elements; + std::uint32_t __evenly_divided_remaining_elements = + std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + auto __inputs_per_sub_group = + __num_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_elements; auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; const auto __block_size = (__num_remaining < __max_inputs_per_block) ? __num_remaining : __max_inputs_per_block; const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); @@ -822,10 +821,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // We only need to resize these parameters prior to the last block as it is the only non-full case. if (__b + 2 == __num_blocks) { - __evenly_divided_remaining_elements = std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); - __inputs_per_sub_group = - __num_remaining >= __max_inputs_per_block - ? __max_inputs_per_subgroup : __evenly_divided_remaining_elements; + __evenly_divided_remaining_elements = std::max( + __sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup + : __evenly_divided_remaining_elements; __inputs_per_item = __inputs_per_sub_group / __sub_group_size; } } From 1b4f365191e9685b2ae9497e4dfb54c139ebbfd7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 15:00:48 -0400 Subject: [PATCH 68/88] removing extra space Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6609ccfd475..61b799ffc63 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -804,7 +804,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen { using _Type = typename _InitType::__value_type; // Reduce-then-scan is dependent on sycl::shift_group_right which requires the underlying type to be trivially - // copyable. If this is not met, then we must fallback to the multi pass scan implementation. The single + // copyable. 
If this is not met, then we must fallback to the multi pass scan implementation. The single // work-group implementation requires a fundamental type which must also be trivially copyable. if constexpr (std::is_trivially_copyable_v<_Type>) { From 0ca6f48fd2e05768c68834ead237446a045a79bb Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 15:33:26 -0400 Subject: [PATCH 69/88] rename variables for consistency Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 77613d221fa..0ddbc209285 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -306,10 +306,10 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu std::size_t __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); - std::size_t __max_elements_in_group = __inputs_per_sub_group * __num_sub_groups_local; - std::size_t __elements_in_group = std::min(__n - __group_start_id, __max_elements_in_group); + std::size_t __max_inputs_in_group = __inputs_per_sub_group * __num_sub_groups_local; + std::size_t __inputs_in_group = std::min(__n - __group_start_id, __max_inputs_in_group); std::uint32_t __active_subgroups = - oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); + oneapi::dpl::__internal::__dpl_ceiling_div(__inputs_in_group, __inputs_per_sub_group); std::size_t __subgroup_start_id = __group_start_id + (__sub_group_id * __inputs_per_sub_group); std::size_t __start_id = __subgroup_start_id + __sub_group_local_id; @@ -435,9 +435,9 @@ struct __parallel_reduce_then_scan_scan_submitter< const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, const std::size_t __block_num) const { - std::size_t __elements_in_block = std::min(__n - __block_num * __max_block_size, std::size_t(__max_block_size)); + std::size_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t(__max_block_size)); std::size_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( - __elements_in_block, __inputs_per_sub_group * __num_sub_groups_local); + __inputs_in_block, __inputs_per_sub_group * __num_sub_groups_local); using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { // We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results: @@ -463,10 +463,10 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); - std::size_t __max_elements_in_group = __inputs_per_sub_group * __num_sub_groups_local; - std::size_t __elements_in_group = std::min(__n - __group_start_id, __max_elements_in_group); + std::size_t __max_inputs_in_group = __inputs_per_sub_group * __num_sub_groups_local; + std::size_t __inputs_in_group = std::min(__n - __group_start_id, __max_inputs_in_group); std::uint32_t __active_subgroups = - oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_group, __inputs_per_sub_group); + oneapi::dpl::__internal::__dpl_ceiling_div(__inputs_in_group, __inputs_per_sub_group); 
oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; // propagate carry in from previous block @@ -751,19 +751,20 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::size_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; const std::size_t __n = __in_rng.size(); const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; - std::size_t __num_remaining = __n; + std::size_t __inputs_remaining = __n; // reduce_then_scan kernel is not built to handle "empty". // These trivial end cases should be handled at a higher level. - assert(__num_remaining > 0); + assert(__inputs_remaining > 0); const std::uint32_t __max_inputs_per_subgroup = __max_inputs_per_block / __num_sub_groups_global; - std::uint32_t __evenly_divided_remaining_elements = - std::max(__sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); + std::uint32_t __evenly_divided_remaining_inputs = std::max( + __sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__inputs_remaining) / __num_sub_groups_global); auto __inputs_per_sub_group = - __num_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_elements; + __inputs_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_inputs; auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __block_size = (__num_remaining < __max_inputs_per_block) ? __num_remaining : __max_inputs_per_block; - const auto __num_blocks = __num_remaining / __block_size + (__num_remaining % __block_size != 0); + const auto __block_size = + (__inputs_remaining < __max_inputs_per_block) ? __inputs_remaining : __max_inputs_per_block; + const auto __num_blocks = __inputs_remaining / __block_size + (__inputs_remaining % __block_size != 0); // We need temporary storage for reductions of each sub-group (__num_sub_groups_global). // Additionally, we need two elements for the block carry-out to prevent a race condition @@ -804,11 +805,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // with sufficiently large L2 / L3 caches. for (std::size_t __b = 0; __b < __num_blocks; ++__b) { - auto __elements_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( - std::min(__num_remaining, __max_inputs_per_block), __inputs_per_item); - auto __ele_in_block_round_up_workgroup = - oneapi::dpl::__internal::__dpl_ceiling_div(__elements_in_block, __work_group_size) * __work_group_size; - auto __global_range = sycl::range<1>(__ele_in_block_round_up_workgroup); + auto __inputs_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( + std::min(__inputs_remaining, __max_inputs_per_block), __inputs_per_item); + auto __inputs_in_block_round_up_workgroup = + oneapi::dpl::__internal::__dpl_ceiling_div(__inputs_in_block, __work_group_size) * __work_group_size; + auto __global_range = sycl::range<1>(__inputs_in_block_round_up_workgroup); auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. @@ -817,14 +818,15 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. 
__event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_per_sub_group, __inputs_per_item, __b); - __num_remaining -= std::min(__num_remaining, __block_size); + __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. if (__b + 2 == __num_blocks) { - __evenly_divided_remaining_elements = std::max( - __sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__num_remaining) / __num_sub_groups_global); - __inputs_per_sub_group = __num_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup - : __evenly_divided_remaining_elements; + __evenly_divided_remaining_inputs = + std::max(__sub_group_size, + oneapi::dpl::__internal::__dpl_bit_ceil(__inputs_remaining) / __num_sub_groups_global); + __inputs_per_sub_group = __inputs_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup + : __evenly_divided_remaining_inputs; __inputs_per_item = __inputs_per_sub_group / __sub_group_size; } } From df6a2237d0a4b37bfddb9e7b5add460bea23d5e0 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 15 Aug 2024 16:19:12 -0400 Subject: [PATCH 70/88] fixing misleading names Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 0ddbc209285..e4036acb04d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -805,11 +805,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // with sufficiently large L2 / L3 caches. for (std::size_t __b = 0; __b < __num_blocks; ++__b) { - auto __inputs_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( + auto __workitems_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( std::min(__inputs_remaining, __max_inputs_per_block), __inputs_per_item); - auto __inputs_in_block_round_up_workgroup = - oneapi::dpl::__internal::__dpl_ceiling_div(__inputs_in_block, __work_group_size) * __work_group_size; - auto __global_range = sycl::range<1>(__inputs_in_block_round_up_workgroup); + auto __workitems_in_block_round_up_workgroup = + oneapi::dpl::__internal::__dpl_ceiling_div(__workitems_in_block, __work_group_size) * __work_group_size; + auto __global_range = sycl::range<1>(__workitems_in_block_round_up_workgroup); auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. 
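The ND-range sizing in the hunk above follows simple ceiling-division arithmetic. Below is a rough standalone sketch (illustrative only — the plain names stand in for the __-prefixed oneDPL internals shown in the diffs, and the concrete values in the usage note are assumed example values, not taken from the patches):

    #include <algorithm>
    #include <cstddef>

    // Ceiling division (mirrors the role __dpl_ceiling_div plays in the hunk above).
    constexpr std::size_t ceiling_div(std::size_t n, std::size_t d) { return (n + d - 1) / d; }

    // Illustrative sizing of one reduce-then-scan block; placeholder names only.
    std::size_t workitems_for_block(std::size_t inputs_remaining, std::size_t max_inputs_per_block,
                                    std::size_t inputs_per_item, std::size_t work_group_size)
    {
        // Each work-item of the block processes inputs_per_item elements.
        const std::size_t inputs_in_block = std::min(inputs_remaining, max_inputs_per_block);
        const std::size_t workitems_in_block = ceiling_div(inputs_in_block, inputs_per_item);
        // Round up so the global range is a whole number of work-groups.
        return ceiling_div(workitems_in_block, work_group_size) * work_group_size;
    }

For instance, with 1,000,000 inputs remaining (below the per-block limit), 128 inputs per item, and an assumed work-group size of 1,024, this yields ceiling_div(1000000, 128) = 7813 work-items, rounded up to 8192, i.e. eight work-groups.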
From 6e470e5253da287868d54d39acd36b905e2e35cf Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 19 Aug 2024 13:49:21 -0400 Subject: [PATCH 71/88] Address reviewer feedback Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 2 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- .../parallel_backend_sycl_reduce_then_scan.h | 18 +++++++----------- .../hetero/dpcpp/parallel_backend_sycl_utils.h | 16 ++++++++-------- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 13848b4f859..635ed869132 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -332,7 +332,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform, 16384)) + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform, /*limit=*/16384)) { return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( oneapi::dpl::__internal::__device_backend_tag{}, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 61b799ffc63..c40babf7c53 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -810,7 +810,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen { // Next power of 2 greater than or equal to __n auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); - bool __pref_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); + bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); // TODO: Consider re-implementing single group scan to support types without known identities. This could also // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small. @@ -818,7 +818,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen if constexpr (__can_use_group_scan) { // Empirically found values for reduce-then-scan and multi pass scan implementation for single wg cutoff - std::size_t __single_group_upper_limit = __pref_reduce_then_scan ? 2048 : 16384; + std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 
2048 : 16384; if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) { return __parallel_transform_scan_single_group( @@ -826,7 +826,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } - if (__pref_reduce_then_scan) + if (__use_reduce_then_scan) { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; using _ScanInputTransform = oneapi::dpl::__internal::__no_op; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index e4036acb04d..d7ef4f9477e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -293,9 +293,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); - __cgh.parallel_for<_KernelName...>(__nd_range, [=, - *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( - __sub_group_size)]] { + __cgh.parallel_for<_KernelName...>( + __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { auto __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); auto __group_id = __ndi.get_group(0); auto __sub_group = __ndi.get_sub_group(); @@ -449,9 +448,8 @@ struct __parallel_reduce_then_scan_scan_submitter< auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); auto __res_acc = __scratch_container.__get_result_acc(__cgh); - __cgh.parallel_for<_KernelName...>(__nd_range, [=, - *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size( - __sub_group_size)]] { + __cgh.parallel_for<_KernelName...>( + __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { auto __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); auto __res_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); @@ -691,7 +689,6 @@ struct __parallel_reduce_then_scan_scan_submitter< const std::size_t __num_blocks; const std::size_t __n; - const _GenReduceInput __gen_reduce_input; const _ReduceOp __reduce_op; const _GenScanInput __gen_scan_input; const _ScanInputTransform __scan_input_transform; @@ -777,9 +774,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, _GenReduceInput, _ReduceOp, _InitType, _ReduceKernel>; using _ScanSubmitter = - __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, - _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, - _WriteOp, _InitType, _ScanKernel>; + __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, _ReduceOp, + _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, + _ScanKernel>; _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, @@ -794,7 +791,6 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __num_work_items, __num_blocks, __n, - __gen_reduce_input, __reduce_op, __gen_scan_input, 
__scan_input_transform, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 39faf4b6750..56e4fb4abf4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -95,6 +95,14 @@ __max_compute_units(const _ExecutionPolicy& __policy) return __policy.queue().get_device().template get_info(); } +template +bool +__supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size) +{ + const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); + return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != __subgroup_sizes.end(); +} + //----------------------------------------------------------------------------- // Kernel run-time information helpers //----------------------------------------------------------------------------- @@ -814,14 +822,6 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; -template -bool -__supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size) -{ - const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); - return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != __subgroup_sizes.end(); -} - } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From 528e04a58d593c323d548907db3edf391b263c71 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 19 Aug 2024 13:57:02 -0400 Subject: [PATCH 72/88] fix bugs from 6e470e5253] Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d7ef4f9477e..e855bb5438c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -400,17 +400,17 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu _InitType __init; }; -template +template struct __parallel_reduce_then_scan_scan_submitter; -template -struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> +template +struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, _ReduceOp, + _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, + __internal::__optional_kernel_name<_KernelName...>> { template @@ -702,7 +702,7 @@ template bool __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) { - const bool __dev_has_sg32 = __par_backend_hetero::__supports_sub_group_size(__exec, 32); + const bool __dev_has_sg32 = oneapi::dpl::__internal::__supports_sub_group_size(__exec, 32); return (__exec.queue().get_device().is_gpu() && __dev_has_sg32); } From 8104f1f4a98fba4382422b14f8ee14a721f2c51a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 19 Aug 2024 15:16:52 -0500 Subject: [PATCH 73/88] Simplify conversions in __gen_transform_input Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 13 +++++++------ 1 file changed, 7 
insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c40babf7c53..03a4cb48b4e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -768,14 +768,15 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(const InRng& __in_rng, std::size_t __idx) const + operator()(const _InRng& __in_rng, std::size_t __idx) const { - using _ValueType = oneapi::dpl::__internal::__value_t; - using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t< - typename std::invoke_result<_UnaryOp, _ValueType>::type>; - return _OutValueType{__unary_op(__in_rng[__idx])}; + // We explicitly convert __in_rng[__idx] to the value type of _InRng to properly handle the case where we + // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller + // from modifying the input range when altering the return of this functor. + using _ValueType = oneapi::dpl::__internal::__value_t<_InRng>; + return __unary_op(_ValueType{__in_rng[__idx]}); } _UnaryOp __unary_op; }; From a5367d1e3c3e4e76edeeb0faaa7cb349607c6198 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Mon, 19 Aug 2024 16:04:51 -0700 Subject: [PATCH 74/88] Move def of __n_uniform closer to its use --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 03a4cb48b4e..5cc82c2ab20 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -809,8 +809,6 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen // work-group implementation requires a fundamental type which must also be trivially copyable. if constexpr (std::is_trivially_copyable_v<_Type>) { - // Next power of 2 greater than or equal to __n - auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); // TODO: Consider re-implementing single group scan to support types without known identities. This could also @@ -818,6 +816,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value; if constexpr (__can_use_group_scan) { + // Next power of 2 greater than or equal to __n + auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); + // Empirically found values for reduce-then-scan and multi pass scan implementation for single wg cutoff std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 
2048 : 16384; if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) From 6096e7a2c80bf05acf6ebbd72e8a92930aeb50c0 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 21 Aug 2024 07:58:25 -0700 Subject: [PATCH 75/88] Add alias for __dpl_sycl::__sub_group and replace templates --- .../parallel_backend_sycl_reduce_then_scan.h | 24 +++++++++---------- .../oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 8 +++++++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index e855bb5438c..3b26418416a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -36,9 +36,9 @@ namespace __par_backend_hetero { template + typename _BinaryOp, typename _ValueType, typename _LazyValueType> void -__exclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, +__exclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) { std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); @@ -77,9 +77,9 @@ __exclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_f } template + typename _BinaryOp, typename _ValueType, typename _LazyValueType> void -__inclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, +__inclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) { std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); @@ -106,10 +106,10 @@ __inclusive_sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_f } template void -__sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, +__sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) { if constexpr (__is_inclusive) @@ -124,10 +124,10 @@ __sub_group_masked_scan(const _SubGroup& __sub_group, _MaskOp __mask_fn, _InitBr } } -template void -__sub_group_scan(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __binary_op, +__sub_group_scan(const __dpl_sycl::__sub_group& __sub_group, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) { auto __mask_fn = [](auto __sub_group_local_id, auto __offset) { return __sub_group_local_id >= __offset; }; @@ -136,10 +136,10 @@ __sub_group_scan(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __ __sub_group, __mask_fn, __init_broadcast_id, __value, __binary_op, __init_and_carry); } -template void -__sub_group_scan_partial(const _SubGroup& __sub_group, _ValueType& __value, _BinaryOp __binary_op, +__sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry, _SizeType __elements_to_process) { auto __mask_fn = [__elements_to_process](auto __sub_group_local_id, auto __offset) { @@ -151,10 +151,10 @@ 
__sub_group_scan_partial(const _SubGroup& __sub_group, _ValueType& __value, _Bin } template void -__scan_through_elements_helper(const _SubGroup& __sub_group, _GenInput __gen_input, +__scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, _LazyValueType& __sub_group_carry, _InRng __in_rng, _OutRng __out_rng, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index 97f206d57ff..83c44a8a07d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -59,6 +59,8 @@ // TODO: determine which compiler configurations provide subgroup load/store #define _ONEDPL_SYCL_SUB_GROUP_LOAD_STORE_PRESENT false +#define _ONEDPL_SYCL_SUB_GROUP_PRESENT (_ONEDPL_LIBSYCL_VERSION >= 50700) + // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within // SYCL kernels for determining SPIR-V compilation. Using this macro on the host may lead to incorrect behavior. #ifndef _ONEDPL_DETECT_SPIRV_COMPILATION // Check if overridden for testing @@ -140,6 +142,12 @@ template using __minimum = sycl::ONEAPI::minimum<_T>; #endif // _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT +#if _ONEDPL_SYCL_SUB_GROUP_PRESENT +using __sub_group = sycl::sub_group; +#else +using __sub_group = sycl::ONEAPI::sub_group; +#endif + template constexpr auto __get_buffer_size(const _Buffer& __buffer) From 60c851628b5b0fe55fdc6d6520cdb47d1ca69068 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 21 Aug 2024 12:27:49 -0400 Subject: [PATCH 76/88] auto -> real types and formatting Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- .../parallel_backend_sycl_reduce_then_scan.h | 177 +++++++++--------- .../dpcpp/parallel_backend_sycl_utils.h | 3 +- .../pstl/hetero/numeric_ranges_impl_hetero.h | 2 +- 4 files changed, 96 insertions(+), 88 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5cc82c2ab20..1dfc2fc7313 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -817,7 +817,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen if constexpr (__can_use_group_scan) { // Next power of 2 greater than or equal to __n - auto __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); + std::size_t __n_uniform = oneapi::dpl::__internal::__dpl_bit_ceil(__n); // Empirically found values for reduce-then-scan and multi pass scan implementation for single wg cutoff std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 
2048 : 16384; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 3b26418416a..8c1f22c281e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -36,16 +36,17 @@ namespace __par_backend_hetero { template + typename _BinaryOp, typename _ValueType, typename _LazyValueType> void -__exclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, - _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) +__exclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, + _InitBroadcastId __init_broadcast_id, _ValueType& __value, _BinaryOp __binary_op, + _LazyValueType& __init_and_carry) { std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); _ONEDPL_PRAGMA_UNROLL for (std::uint8_t __shift = 1; __shift <= __sub_group_size / 2; __shift <<= 1) { - auto __partial_carry_in = sycl::shift_group_right(__sub_group, __value, __shift); + _ValueType __partial_carry_in = sycl::shift_group_right(__sub_group, __value, __shift); if (__mask_fn(__sub_group_local_id, __shift)) { __value = __binary_op(__partial_carry_in, __value); @@ -77,16 +78,17 @@ __exclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _M } template + typename _BinaryOp, typename _ValueType, typename _LazyValueType> void -__inclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, - _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) +__inclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, + _InitBroadcastId __init_broadcast_id, _ValueType& __value, _BinaryOp __binary_op, + _LazyValueType& __init_and_carry) { std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); _ONEDPL_PRAGMA_UNROLL for (std::uint8_t __shift = 1; __shift <= __sub_group_size / 2; __shift <<= 1) { - auto __partial_carry_in = sycl::shift_group_right(__sub_group, __value, __shift); + _ValueType __partial_carry_in = sycl::shift_group_right(__sub_group, __value, __shift); if (__mask_fn(__sub_group_local_id, __shift)) { __value = __binary_op(__partial_carry_in, __value); @@ -106,11 +108,11 @@ __inclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _M } template + typename _InitBroadcastId, typename _BinaryOp, typename _ValueType, typename _LazyValueType> void -__sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, _InitBroadcastId __init_broadcast_id, - _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) +__sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __mask_fn, + _InitBroadcastId __init_broadcast_id, _ValueType& __value, _BinaryOp __binary_op, + _LazyValueType& __init_and_carry) { if constexpr (__is_inclusive) { @@ -124,20 +126,20 @@ __sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _MaskOp __ma } } -template +template void __sub_group_scan(const __dpl_sycl::__sub_group& __sub_group, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry) { auto __mask_fn = [](auto __sub_group_local_id, auto __offset) { return __sub_group_local_id >= __offset; }; - constexpr auto 
__init_broadcast_id = __sub_group_size - 1; + constexpr std::uint8_t __init_broadcast_id = __sub_group_size - 1; __sub_group_masked_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __mask_fn, __init_broadcast_id, __value, __binary_op, __init_and_carry); } -template +template void __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& __value, _BinaryOp __binary_op, _LazyValueType& __init_and_carry, _SizeType __elements_to_process) @@ -145,14 +147,14 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& auto __mask_fn = [__elements_to_process](auto __sub_group_local_id, auto __offset) { return __sub_group_local_id >= __offset && __sub_group_local_id < __elements_to_process; }; - auto __init_broadcast_id = __elements_to_process - 1; + std::uint8_t __init_broadcast_id = __elements_to_process - 1; __sub_group_masked_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __mask_fn, __init_broadcast_id, __value, __binary_op, __init_and_carry); } template + std::uint32_t __max_inputs_per_item, typename _GenInput, typename _ScanInputTransform, typename _BinaryOp, + typename _WriteOp, typename _LazyValueType, typename _InRng, typename _OutRng> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, @@ -161,11 +163,13 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, std::uint32_t __active_subgroups) { + using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; + bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - auto __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) @@ -209,12 +213,13 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // For partial thread, we need to handle the partial subgroup at the end of the range if (__sub_group_id < __active_subgroups) { - auto __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_id, __sub_group_size); + std::size_t __iters = + oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_id, __sub_group_size); if (__iters == 1) { - auto __local_id = (__start_id < __n) ? __start_id : __n - 1; - auto __v = __gen_input(__in_rng, __local_id); + std::size_t __local_id = (__start_id < __n) ? 
__start_id : __n - 1; + _GenInputType __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); @@ -226,7 +231,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI } else { - auto __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) @@ -236,7 +241,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { - auto __local_id = __start_id + __j * __sub_group_size; + std::size_t __local_id = __start_id + __j * __sub_group_size; __v = __gen_input(__in_rng, __local_id); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); @@ -246,8 +251,8 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI } } - auto __offset = __start_id + (__iters - 1) * __sub_group_size; - auto __local_id = (__offset < __n) ? __offset : __n - 1; + std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; + std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, @@ -281,7 +286,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer template - auto + sycl::event operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, @@ -295,11 +300,11 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh); __cgh.parallel_for<_KernelName...>( __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { - auto __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); - auto __group_id = __ndi.get_group(0); - auto __sub_group = __ndi.get_sub_group(); - auto __sub_group_id = __sub_group.get_group_linear_id(); - auto __sub_group_local_id = __sub_group.get_local_linear_id(); + _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); + std::size_t __group_id = __ndi.get_group(0); + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::size_t __sub_group_id = __sub_group.get_group_linear_id(); + std::size_t __sub_group_local_id = __sub_group.get_local_linear_id(); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_id = @@ -340,10 +345,10 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); if (__iters == 1) { - auto __load_id = (__sub_group_local_id < __active_subgroups) - ? 
__sub_group_local_id - : (__active_subgroups - 1); // else is unused dummy value - auto __v = __sub_group_partials[__load_id]; + std::size_t __load_id = (__sub_group_local_id < __active_subgroups) + ? __sub_group_local_id + : (__active_subgroups - 1); // else is unused dummy value + _InitValueType __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups); if (__sub_group_local_id < __active_subgroups) @@ -353,7 +358,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { std::uint32_t __reduction_scan_id = __sub_group_local_id; // need to pull out first iteration tp avoid identity - auto __v = __sub_group_partials[__reduction_scan_id]; + _InitValueType __v = __sub_group_partials[__reduction_scan_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry); __temp_ptr[__start_id + __reduction_scan_id] = __v; @@ -371,8 +376,9 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. // else is an unused dummy value - auto __load_id = (__reduction_scan_id < __num_sub_groups_local) ? __reduction_scan_id - : (__num_sub_groups_local - 1); + std::size_t __load_id = (__reduction_scan_id < __num_sub_groups_local) + ? __reduction_scan_id + : (__num_sub_groups_local - 1); __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( @@ -412,10 +418,10 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> { + using _InitValueType = typename _InitType::__value_type; - template - auto - __get_block_carry_in(const std::size_t __block_num, _TmpPtr __tmp_ptr) const + _InitValueType + __get_block_carry_in(const std::size_t __block_num, _InitValueType* __tmp_ptr) const { return __tmp_ptr[__num_sub_groups_global + (__block_num % 2)]; } @@ -428,7 +434,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs } template - auto + sycl::event operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, @@ -437,7 +443,6 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs std::size_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t(__max_block_size)); std::size_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( __inputs_in_block, __inputs_per_sub_group * __num_sub_groups_local); - using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { // We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results: // __num_sub_groups_local for each sub-group partial from the reduce kernel + @@ -450,15 +455,15 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs __cgh.parallel_for<_KernelName...>( __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { - auto __tmp_ptr = 
_TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); - auto __res_ptr = + _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); + _InitValueType* __res_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); - auto __group_id = __ndi.get_group(0); - auto __sub_group = __ndi.get_sub_group(); - auto __sub_group_id = __sub_group.get_group_linear_id(); - auto __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __group_id = __ndi.get_group(0); + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::size_t __sub_group_id = __sub_group.get_group_linear_id(); + std::size_t __sub_group_local_id = __sub_group.get_local_linear_id(); - auto __group_start_id = + std::size_t __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __max_inputs_in_group = __inputs_per_sub_group * __num_sub_groups_local; @@ -488,7 +493,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs // S: sum(T0 carry...TS carry) std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); - auto __subgroups_before_my_group = __group_id * __num_sub_groups_local; + std::size_t __subgroups_before_my_group = __group_id * __num_sub_groups_local; std::uint32_t __load_reduction_id = __sub_group_local_id; std::uint8_t __i = 0; for (; __i < __iters - 1; __i++) @@ -511,18 +516,18 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs if (__group_id > 0) { // only need the last element from each scan of num_sub_groups_local subgroup reductions - const auto __elements_to_process = __subgroups_before_my_group / __num_sub_groups_local; - const auto __pre_carry_iters = + const std::size_t __elements_to_process = __subgroups_before_my_group / __num_sub_groups_local; + const std::size_t __pre_carry_iters = oneapi::dpl::__internal::__dpl_ceiling_div(__elements_to_process, __sub_group_size); if (__pre_carry_iters == 1) { // single partial scan - auto __proposed_id = __num_sub_groups_local * __sub_group_local_id + __offset; - auto __remaining_elements = __elements_to_process; - auto __reduction_id = (__proposed_id < __subgroups_before_my_group) - ? __proposed_id - : __subgroups_before_my_group - 1; - auto __value = __tmp_ptr[__reduction_id]; + std::size_t __proposed_id = __num_sub_groups_local * __sub_group_local_id + __offset; + std::size_t __remaining_elements = __elements_to_process; + std::size_t __reduction_id = (__proposed_id < __subgroups_before_my_group) + ? 
__proposed_id + : __subgroups_before_my_group - 1; + _InitValueType __value = __tmp_ptr[__reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>(__sub_group, __value, __reduce_op, __carry_last, __remaining_elements); @@ -533,7 +538,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs // first 1 full std::uint32_t __reduction_id = __num_sub_groups_local * __sub_group_local_id + __offset; std::uint32_t __reduction_id_increment = __num_sub_groups_local * __sub_group_size; - auto __value = __tmp_ptr[__reduction_id]; + _InitValueType __value = __tmp_ptr[__reduction_id]; __sub_group_scan<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __value, __reduce_op, __carry_last); __reduction_id += __reduction_id_increment; @@ -548,11 +553,12 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs // final partial iteration - auto __remaining_elements = + std::size_t __remaining_elements = __elements_to_process - ((__pre_carry_iters - 1) * __sub_group_size); - auto __final_reduction_id = (__reduction_id < __subgroups_before_my_group) - ? __reduction_id - : __subgroups_before_my_group - 1; // dummy to avoid OOB + std::size_t __final_reduction_id = + (__reduction_id < __subgroups_before_my_group) + ? __reduction_id + : __subgroups_before_my_group - 1; // dummy to avoid OOB __value = __tmp_ptr[__final_reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>(__sub_group, __value, __reduce_op, @@ -561,7 +567,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs // steps 3+4) load global carry in from neighbor work-group // and apply to local sub-group prefix carries - auto __carry_offset = __sub_group_local_id; + std::size_t __carry_offset = __sub_group_local_id; std::uint8_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); @@ -593,15 +599,15 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs { if (__sub_group_id > 0) { - auto __value = __sub_group_id - 1 < __active_subgroups - ? __sub_group_partials[__sub_group_id - 1] - : __sub_group_partials[__active_subgroups - 1]; + _InitValueType __value = __sub_group_id - 1 < __active_subgroups + ? __sub_group_partials[__sub_group_id - 1] + : __sub_group_partials[__active_subgroups - 1]; oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } else if (__group_id > 0) { - auto __value = __sub_group_partials[__active_subgroups]; + _InitValueType __value = __sub_group_partials[__active_subgroups]; oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } @@ -624,9 +630,9 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs { if (__sub_group_id > 0) { - auto __value = __sub_group_id - 1 < __active_subgroups - ? __sub_group_partials[__sub_group_id - 1] - : __sub_group_partials[__active_subgroups - 1]; + _InitValueType __value = __sub_group_id - 1 < __active_subgroups + ? 
__sub_group_partials[__sub_group_id - 1] + : __sub_group_partials[__active_subgroups - 1]; __sub_group_carry.__setup(__reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __value)); } else if (__group_id > 0) @@ -733,7 +739,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __reduce_then_scan_scan_kernel<_CustomName>>; using _ValueType = typename _InitType::__value_type; - constexpr std::size_t __sub_group_size = 32; + constexpr std::uint32_t __sub_group_size = 32; // Empirically determined maximum. May be less for non-full blocks. constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; @@ -754,14 +760,15 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // These trivial end cases should be handled at a higher level. assert(__inputs_remaining > 0); const std::uint32_t __max_inputs_per_subgroup = __max_inputs_per_block / __num_sub_groups_global; - std::uint32_t __evenly_divided_remaining_inputs = std::max( - __sub_group_size, oneapi::dpl::__internal::__dpl_bit_ceil(__inputs_remaining) / __num_sub_groups_global); - auto __inputs_per_sub_group = + std::uint32_t __evenly_divided_remaining_inputs = + std::max(std::size_t{__sub_group_size}, + oneapi::dpl::__internal::__dpl_bit_ceil(__inputs_remaining) / __num_sub_groups_global); + std::uint32_t __inputs_per_sub_group = __inputs_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_inputs; - auto __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const auto __block_size = + std::uint32_t __inputs_per_item = __inputs_per_sub_group / __sub_group_size; + const std::size_t __block_size = (__inputs_remaining < __max_inputs_per_block) ? __inputs_remaining : __max_inputs_per_block; - const auto __num_blocks = __inputs_remaining / __block_size + (__inputs_remaining % __block_size != 0); + const std::size_t __num_blocks = __inputs_remaining / __block_size + (__inputs_remaining % __block_size != 0); // We need temporary storage for reductions of each sub-group (__num_sub_groups_global). // Additionally, we need two elements for the block carry-out to prevent a race condition @@ -801,9 +808,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // with sufficiently large L2 / L3 caches. for (std::size_t __b = 0; __b < __num_blocks; ++__b) { - auto __workitems_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( + std::size_t __workitems_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( std::min(__inputs_remaining, __max_inputs_per_block), __inputs_per_item); - auto __workitems_in_block_round_up_workgroup = + std::size_t __workitems_in_block_round_up_workgroup = oneapi::dpl::__internal::__dpl_ceiling_div(__workitems_in_block, __work_group_size) * __work_group_size; auto __global_range = sycl::range<1>(__workitems_in_block_round_up_workgroup); auto __local_range = sycl::range<1>(__work_group_size); @@ -819,7 +826,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ if (__b + 2 == __num_blocks) { __evenly_divided_remaining_inputs = - std::max(__sub_group_size, + std::max(std::size_t{__sub_group_size}, oneapi::dpl::__internal::__dpl_bit_ceil(__inputs_remaining) / __num_sub_groups_global); __inputs_per_sub_group = __inputs_remaining >= __max_inputs_per_block ? 
__max_inputs_per_subgroup : __evenly_divided_remaining_inputs; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 56e4fb4abf4..6800394196a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -99,7 +99,8 @@ template bool __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size) { - const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); + const std::vector __subgroup_sizes = + __exec.queue().get_device().template get_info(); return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != __subgroup_sizes.end(); } diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index 831b4fdf1f4..f69454ce7f0 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -91,7 +91,7 @@ oneapi::dpl::__internal::__difference_t<_Range2> __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { - auto __n = __rng1.size(); + std::size_t __n = __rng1.size(); if (__n == 0) return 0; From 8121d678e590b4fbdebebbbb3e3f60367bc5a726 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 21 Aug 2024 13:24:25 -0400 Subject: [PATCH 77/88] fixing type of subgroup id returns Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8c1f22c281e..0042b6fd443 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -303,8 +303,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); std::size_t __group_id = __ndi.get_group(0); __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::size_t __sub_group_id = __sub_group.get_group_linear_id(); - std::size_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_id = @@ -460,8 +460,8 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); std::size_t __group_id = __ndi.get_group(0); __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::size_t __sub_group_id = __sub_group.get_group_linear_id(); - std::size_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); std::size_t __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * 
__num_sub_groups_local); From 48724db450ed209ec4d037d8973f0f26eb5f5a23 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 21 Aug 2024 14:06:30 -0400 Subject: [PATCH 78/88] shrinking subgroup size id types Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 0042b6fd443..11e4c08f223 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -304,7 +304,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu std::size_t __group_id = __ndi.get_group(0); __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __sub_group_carry; std::size_t __group_start_id = @@ -461,7 +461,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs std::size_t __group_id = __ndi.get_group(0); __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); std::size_t __group_start_id = (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); @@ -739,7 +739,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __reduce_then_scan_scan_kernel<_CustomName>>; using _ValueType = typename _InitType::__value_type; - constexpr std::uint32_t __sub_group_size = 32; + constexpr std::uint8_t __sub_group_size = 32; // Empirically determined maximum. May be less for non-full blocks. 
constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; From 3cc61dbe2bf7e5cb676abda9973cf88f219ca572 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 21 Aug 2024 14:21:47 -0400 Subject: [PATCH 79/88] adjust type to depend on input range Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index f69454ce7f0..e00a89441f2 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -91,7 +91,7 @@ oneapi::dpl::__internal::__difference_t<_Range2> __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { - std::size_t __n = __rng1.size(); + oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng1.size(); if (__n == 0) return 0; From c2c7e355724d5e907c9a91d2ca605cf93497dcf7 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 22 Aug 2024 08:07:22 -0400 Subject: [PATCH 80/88] idx -> id Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1dfc2fc7313..5254d190d64 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -770,29 +770,29 @@ struct __gen_transform_input { template auto - operator()(const _InRng& __in_rng, std::size_t __idx) const + operator()(const _InRng& __in_rng, std::size_t __id) const { - // We explicitly convert __in_rng[__idx] to the value type of _InRng to properly handle the case where we + // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller // from modifying the input range when altering the return of this functor. using _ValueType = oneapi::dpl::__internal::__value_t<_InRng>; - return __unary_op(_ValueType{__in_rng[__idx]}); + return __unary_op(_ValueType{__in_rng[__id]}); } _UnaryOp __unary_op; }; -struct __simple_write_to_idx +struct __simple_write_to_id { template void - operator()(const _OutRng& __out_rng, std::size_t __idx, const ValueType& __v) const + operator()(const _OutRng& __out_rng, std::size_t __id, const ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type, - std::decay_t>::__type; - __out_rng[__idx] = static_cast<_ConvertedTupleType>(__v); + std::decay_t>::__type; + __out_rng[__id] = static_cast<_ConvertedTupleType>(__v); } }; @@ -832,7 +832,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; using _ScanInputTransform = oneapi::dpl::__internal::__no_op; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_idx; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; _GenInput __gen_transform{__unary_op}; From ff7b25673546ff3a6283c0335548482201f3cc30 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 22 Aug 2024 15:38:05 -0400 Subject: [PATCH 81/88] shrinking types, switch branch to min, remove double deref Signed-off-by: Dan Hoeflinger --- .../parallel_backend_sycl_reduce_then_scan.h | 78 +++++++++---------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 11e4c08f223..a04dc6665fa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -213,7 +213,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // For partial thread, we need to handle the partial subgroup at the end of the range if (__sub_group_id < __active_subgroups) { - std::size_t __iters = + std::uint32_t __iters = oneapi::dpl::__internal::__dpl_ceiling_div(__n - __subgroup_start_id, __sub_group_size); if (__iters == 1) @@ -289,7 +289,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu sycl::event operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, - const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, const std::size_t __block_num) const { using _InitValueType = typename _InitType::__value_type; @@ -311,7 +311,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __max_inputs_in_group = __inputs_per_sub_group * __num_sub_groups_local; - std::size_t __inputs_in_group = std::min(__n - __group_start_id, __max_inputs_in_group); + std::uint32_t __inputs_in_group = std::min(__n - __group_start_id, __max_inputs_in_group); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__inputs_in_group, __inputs_per_sub_group); std::size_t __subgroup_start_id = __group_start_id + (__sub_group_id * __inputs_per_sub_group); @@ -345,9 +345,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu oneapi::dpl::__internal::__dpl_ceiling_div(__active_subgroups, __sub_group_size); if (__iters == 1) { - std::size_t __load_id = (__sub_group_local_id < __active_subgroups) - ? 
__sub_group_local_id - : (__active_subgroups - 1); // else is unused dummy value + // fill with unused dummy values to avoid overruning input + std::uint32_t __load_id = std::min(std::uint32_t{__sub_group_local_id}, __active_subgroups - 1); _InitValueType __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/false>( __sub_group, __v, __reduce_op, __sub_group_carry, __active_subgroups); @@ -375,10 +374,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu // If we are past the input range, then the previous value of v is passed to the sub-group scan. // It does not affect the result as our sub_group_scan will use a mask to only process in-range elements. - // else is an unused dummy value - std::size_t __load_id = (__reduction_scan_id < __num_sub_groups_local) - ? __reduction_scan_id - : (__num_sub_groups_local - 1); + // fill with unused dummy values to avoid overruning input + std::uint32_t __load_id = std::min(__reduction_scan_id, __num_sub_groups_local - 1); __v = __sub_group_partials[__load_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>( @@ -395,10 +392,10 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu } // Constant parameters throughout all blocks - const std::size_t __max_block_size; - const std::size_t __num_sub_groups_local; - const std::size_t __num_sub_groups_global; - const std::size_t __num_work_items; + const std::uint32_t __max_block_size; + const std::uint32_t __num_sub_groups_local; + const std::uint32_t __num_sub_groups_global; + const std::uint32_t __num_work_items; const std::size_t __n; const _GenReduceInput __gen_reduce_input; @@ -406,12 +403,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu _InitType __init; }; -template struct __parallel_reduce_then_scan_scan_submitter; -template struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, _ReduceOp, @@ -426,9 +423,10 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs return __tmp_ptr[__num_sub_groups_global + (__block_num % 2)]; } - template + template void - __set_block_carry_out(const std::size_t __block_num, _TmpPtr __tmp_ptr, const _ValueType __block_carry_out) const + __set_block_carry_out(const std::size_t __block_num, _InitValueType* __tmp_ptr, + const _ValueType __block_carry_out) const { __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } @@ -437,11 +435,11 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs sycl::event operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, - const std::size_t __inputs_per_sub_group, const std::size_t __inputs_per_item, + const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, const std::size_t __block_num) const { - std::size_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t(__max_block_size)); - std::size_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( + std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); + std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( __inputs_in_block, __inputs_per_sub_group * __num_sub_groups_local); 
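        // Note on the two extra temporary elements used by __get_block_carry_in / __set_block_carry_out:
        // block __b reads its carry-in from slot __num_sub_groups_global + (__b % 2) and writes its
        // carry-out to slot __num_sub_groups_global + 1 - (__b % 2), so consecutive blocks alternate
        // slots: block __b consumes the value block __b - 1 produced while publishing its own
        // carry-out in the other slot.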
return __exec.queue().submit([&, this](sycl::handler& __cgh) { // We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results: @@ -458,7 +456,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); _InitValueType* __res_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__res_acc, __num_sub_groups_global + 2); - std::size_t __group_id = __ndi.get_group(0); + std::uint32_t __group_id = __ndi.get_group(0); __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); std::uint8_t __sub_group_local_id = __sub_group.get_local_linear_id(); @@ -467,7 +465,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs (__block_num * __max_block_size) + (__group_id * __inputs_per_sub_group * __num_sub_groups_local); std::size_t __max_inputs_in_group = __inputs_per_sub_group * __num_sub_groups_local; - std::size_t __inputs_in_group = std::min(__n - __group_start_id, __max_inputs_in_group); + std::uint32_t __inputs_in_group = std::min(__n - __group_start_id, __max_inputs_in_group); std::uint32_t __active_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__inputs_in_group, __inputs_per_sub_group); oneapi::dpl::__internal::__lazy_ctor_storage<_InitValueType> __carry_last; @@ -555,10 +553,9 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs std::size_t __remaining_elements = __elements_to_process - ((__pre_carry_iters - 1) * __sub_group_size); + // fill with unused dummy values to avoid overruning input std::size_t __final_reduction_id = - (__reduction_id < __subgroups_before_my_group) - ? __reduction_id - : __subgroups_before_my_group - 1; // dummy to avoid OOB + std::min(std::size_t{__reduction_id}, __subgroups_before_my_group - 1); __value = __tmp_ptr[__final_reduction_id]; __sub_group_scan_partial<__sub_group_size, /*__is_inclusive=*/true, /*__init_present=*/true>(__sub_group, __value, __reduce_op, @@ -688,10 +685,10 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs }); } - const std::size_t __max_block_size; - const std::size_t __num_sub_groups_local; - const std::size_t __num_sub_groups_global; - const std::size_t __num_work_items; + const std::uint32_t __max_block_size; + const std::uint32_t __num_sub_groups_local; + const std::uint32_t __num_sub_groups_global; + const std::uint32_t __num_work_items; const std::size_t __num_blocks; const std::size_t __n; @@ -744,16 +741,16 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ constexpr std::uint8_t __max_inputs_per_item = 128; constexpr bool __inclusive = _Inclusive::value; - const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + const std::uint32_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 8192); // TODO: Investigate potentially basing this on some scale of the number of compute units. 128 work-groups has been // found to be reasonable number for most devices. 
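    // The launch-geometry quantities below fit comfortably in std::uint32_t: with __num_work_groups == 128,
    // the work-group size capped at 8192, and at most a few hundred inputs per work-item, the largest
    // product (__max_inputs_per_block) stays well below 2^32.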
- const std::size_t __num_work_groups = 128; - const std::size_t __num_work_items = __num_work_groups * __work_group_size; - const std::size_t __num_sub_groups_local = __work_group_size / __sub_group_size; - const std::size_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; + constexpr std::uint32_t __num_work_groups = 128; + const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; + const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; + const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; const std::size_t __n = __in_rng.size(); - const std::size_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; + const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; // reduce_then_scan kernel is not built to handle "empty". @@ -766,8 +763,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ std::uint32_t __inputs_per_sub_group = __inputs_remaining >= __max_inputs_per_block ? __max_inputs_per_subgroup : __evenly_divided_remaining_inputs; std::uint32_t __inputs_per_item = __inputs_per_sub_group / __sub_group_size; - const std::size_t __block_size = - (__inputs_remaining < __max_inputs_per_block) ? __inputs_remaining : __max_inputs_per_block; + const std::size_t __block_size = std::min(__inputs_remaining, std::size_t{__max_inputs_per_block}); const std::size_t __num_blocks = __inputs_remaining / __block_size + (__inputs_remaining % __block_size != 0); // We need temporary storage for reductions of each sub-group (__num_sub_groups_global). @@ -808,9 +804,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // with sufficiently large L2 / L3 caches. 
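    // Each iteration below submits the reduce kernel followed by the scan kernel for one block; the scan
    // kernel of block __b picks up the carry-out that block __b - 1 stored in the alternating block-carry
    // slots, and successive submissions are ordered through the __prior_event dependency.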
for (std::size_t __b = 0; __b < __num_blocks; ++__b) { - std::size_t __workitems_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( - std::min(__inputs_remaining, __max_inputs_per_block), __inputs_per_item); - std::size_t __workitems_in_block_round_up_workgroup = + std::uint32_t __workitems_in_block = oneapi::dpl::__internal::__dpl_ceiling_div( + std::min(__inputs_remaining, std::size_t{__max_inputs_per_block}), __inputs_per_item); + std::uint32_t __workitems_in_block_round_up_workgroup = oneapi::dpl::__internal::__dpl_ceiling_div(__workitems_in_block, __work_group_size) * __work_group_size; auto __global_range = sycl::range<1>(__workitems_in_block_round_up_workgroup); auto __local_range = sycl::range<1>(__work_group_size); From 8a36d5a5d44c1607a0b307c2ecffd5f0e06ff7ed Mon Sep 17 00:00:00 2001 From: Adam Fidel <110841220+adamfidel@users.noreply.github.com> Date: Thu, 22 Aug 2024 17:05:28 -0500 Subject: [PATCH 82/88] Adjust block size for reduce-then-scan based on input type (#1782) --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index a04dc6665fa..678b63f3729 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,7 +153,7 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& } template void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, @@ -273,11 +273,11 @@ class __reduce_then_scan_reduce_kernel; template class __reduce_then_scan_scan_kernel; -template struct __parallel_reduce_then_scan_reduce_submitter; -template struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _InitType, @@ -403,12 +403,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu _InitType __init; }; -template struct __parallel_reduce_then_scan_scan_submitter; -template struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, _ReduceOp, @@ -737,8 +737,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ using _ValueType = typename _InitType::__value_type; constexpr std::uint8_t __sub_group_size = 32; + constexpr std::uint8_t __block_size_scale = std::max(1lu, sizeof(double) / sizeof(_ValueType)); // Empirically determined maximum. May be less for non-full blocks. 
- constexpr std::uint8_t __max_inputs_per_item = 128; + constexpr std::uint16_t __max_inputs_per_item = 64 * __block_size_scale; constexpr bool __inclusive = _Inclusive::value; const std::uint32_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 8192); From 9520f3c68fa4726d6d965da76a0bae9a54ff687f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 22 Aug 2024 18:13:03 -0400 Subject: [PATCH 83/88] shrinking missed types Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 678b63f3729..9d1cf3d9da1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -273,11 +273,11 @@ class __reduce_then_scan_reduce_kernel; template class __reduce_then_scan_scan_kernel; -template struct __parallel_reduce_then_scan_reduce_submitter; -template struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, _GenReduceInput, _ReduceOp, _InitType, From 5a928fdfa6e9ee060ab30eb7082c96a0263d8c52 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 22 Aug 2024 20:56:43 -0400 Subject: [PATCH 84/88] bugfix for windows Signed-off-by: Dan Hoeflinger --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 9d1cf3d9da1..f686619776a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -737,7 +737,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ using _ValueType = typename _InitType::__value_type; constexpr std::uint8_t __sub_group_size = 32; - constexpr std::uint8_t __block_size_scale = std::max(1lu, sizeof(double) / sizeof(_ValueType)); + constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType)); // Empirically determined maximum. May be less for non-full blocks. 
constexpr std::uint16_t __max_inputs_per_item = 64 * __block_size_scale; constexpr bool __inclusive = _Inclusive::value; From e57573f9e3f186719d4c8a073c85d75e144ed5c5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 27 Aug 2024 12:35:32 -0400 Subject: [PATCH 85/88] fixing range types Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5254d190d64..08df22f3cf3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -785,7 +785,7 @@ struct __simple_write_to_id { template void - operator()(const _OutRng& __out_rng, std::size_t __id, const ValueType& __v) const + operator()(_OutRng& __out_rng, std::size_t __id, const ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index f686619776a..0574af207e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -158,7 +158,7 @@ template ( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __in_rng, nullptr, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); From 93189b0e6dcd057cd54d9525491badf24212621d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 29 Aug 2024 12:16:50 -0400 Subject: [PATCH 86/88] minor comments from review + formatting Signed-off-by: Dan Hoeflinger --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 13 +++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 08df22f3cf3..0ccaff3fe62 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -783,9 +783,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(_OutRng& __out_rng, std::size_t __id, const ValueType& __v) const + operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
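The std::min clamping applied in the hunks below follows the same pattern as the earlier change that fills out-of-range lanes with dummy values to avoid overrunning the input: every lane performs a load, but lanes beyond the active range are clamped to the last valid index, and the masked scan ignores their contribution. A minimal standalone sketch of that idea (illustrative only, with made-up names and sizes, not oneDPL code):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    std::vector<int> partials{3, 1, 4, 1, 5};      // only 5 "active" sub-group partials
    const std::uint32_t active = partials.size();
    const std::uint32_t lanes = 8;                 // pretend sub-group width

    int carry = 0;
    for (std::uint32_t lane = 0; lane != lanes; ++lane)
    {
        // Clamp so out-of-range lanes load a dummy (the last valid element) instead of overrunning.
        const std::uint32_t load_id = std::min(lane, active - 1);
        const int v = partials[load_id];
        if (lane < active) // mask: only in-range lanes contribute to the carry
            carry += v;
    }
    std::cout << carry << '\n'; // prints 14 == 3 + 1 + 4 + 1 + 5
}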
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 0574af207e6..6023dfa7592 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -103,7 +103,6 @@ __inclusive_sub_group_masked_scan(const __dpl_sycl::__sub_group& __sub_group, _M { __init_and_carry.__setup(sycl::group_broadcast(__sub_group, __value, __init_broadcast_id)); } - //return by reference __value and __init_and_carry } @@ -485,7 +484,7 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs // and then write back the final values to memory if (__sub_group_id == 0) { - // step 1) load to Xe SLM the WG-local S prefix sums + // step 1) load to SLM the WG-local S prefix sums // on WG T-local carries // 0: T0 carry, 1: T0 + T1 carry, 2: T0 + T1 + T2 carry, ... // S: sum(T0 carry...TS carry) @@ -596,9 +595,8 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs { if (__sub_group_id > 0) { - _InitValueType __value = __sub_group_id - 1 < __active_subgroups - ? __sub_group_partials[__sub_group_id - 1] - : __sub_group_partials[__active_subgroups - 1]; + _InitValueType __value = + __sub_group_partials[std::min(__sub_group_id - 1, __active_subgroups - 1)]; oneapi::dpl::unseq_backend::__init_processing<_InitValueType>{}(__init, __value, __reduce_op); __sub_group_carry.__setup(__value); } @@ -627,9 +625,8 @@ struct __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs { if (__sub_group_id > 0) { - _InitValueType __value = __sub_group_id - 1 < __active_subgroups - ? __sub_group_partials[__sub_group_id - 1] - : __sub_group_partials[__active_subgroups - 1]; + _InitValueType __value = + __sub_group_partials[std::min(__sub_group_id - 1, __active_subgroups - 1)]; __sub_group_carry.__setup(__reduce_op(__get_block_carry_in(__block_num, __tmp_ptr), __value)); } else if (__group_id > 0) From 4e4568ec21e18464efe70734d6018e5820fc2402 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:13:11 -0400 Subject: [PATCH 87/88] Apply std:: suggestions Co-authored-by: Matthew Michel <106704043+mmichel11@users.noreply.github.com> --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 0ccaff3fe62..27ce403a018 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -824,8 +824,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) { return __parallel_transform_scan_single_group( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng), - ::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } if (__use_reduce_then_scan) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 6800394196a..9bd195a80a9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -99,7 +99,7 @@ template bool __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_size) { - const std::vector __subgroup_sizes = + const std::vector __subgroup_sizes = __exec.queue().get_device().template get_info(); return std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), __target_size) != __subgroup_sizes.end(); } From af82182a7987623c169fe6fa1ba28e43502331bf Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 29 Aug 2024 16:14:08 -0400 Subject: [PATCH 88/88] rounding workgroup size down to mult of subgroup size Signed-off-by: Dan Hoeflinger --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 6023dfa7592..ed979f77ba3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -739,7 +739,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ constexpr std::uint16_t __max_inputs_per_item = 64 * __block_size_scale; constexpr bool __inclusive = _Inclusive::value; - const std::uint32_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 8192); + const std::uint32_t __max_work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 8192); + // Round down to nearest multiple of the subgroup size + const std::uint32_t __work_group_size = (__max_work_group_size / __sub_group_size) * __sub_group_size; // TODO: Investigate potentially basing this on some scale of the number of compute units. 128 work-groups has been // found to be reasonable number for most devices.
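    // Worked example of the geometry above (illustrative numbers only: assume the device reports a
    // maximum work-group size of 1000 and _ValueType is a 4-byte type):
    //   __block_size_scale      = max(1, 8 / 4)         = 2
    //   __max_inputs_per_item   = 64 * 2                = 128
    //   __work_group_size       = (1000 / 32) * 32      = 992   (rounded down to a sub-group multiple)
    //   __num_sub_groups_local  = 992 / 32              = 31
    //   __num_sub_groups_global = 31 * 128 work-groups  = 3968
    //   __max_inputs_per_block  = 992 * 128 * 128       = 16,252,928 elements per block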