From c742e79cdfd3426fdbe73092091baad3b15abae9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 05:44:38 -0500 Subject: [PATCH 01/10] Use SequentialReductionKernel for tree-reduction as well 1. Renamed misspelled variable 2. If reduction_nelems is small, used SequentialReductionKernel for tree-reductions as it is done for atomic reduction 3. Tweak scaling down logic for moderately-sized number of elements to reduce. We should also use max_wg if the iter_nelems is very small (one), since choosing max_wg for large iter_nelems may lead to under- utilization of GPU. --- .../libtensor/include/kernels/reductions.hpp | 311 +++++++++++------- 1 file changed, 194 insertions(+), 117 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 40e5bd282d..7c40623048 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -477,11 +477,11 @@ sycl::event reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -619,11 +619,11 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( result_indexer}; ReductionIndexerT reduction_indexer{}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -718,11 +718,11 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -1090,15 +1090,44 @@ sycl::event reduction_over_group_temps_strided_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1113,7 +1142,10 @@ sycl::event reduction_over_group_temps_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1164,13 +1196,13 @@ sycl::event reduction_over_group_temps_strided_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1227,7 +1259,7 @@ sycl::event reduction_over_group_temps_strided_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1244,7 +1276,7 @@ sycl::event reduction_over_group_temps_strided_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ 
-1255,11 +1287,10 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1302,7 +1333,7 @@ sycl::event reduction_over_group_temps_strided_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1319,7 +1350,7 @@ sycl::event reduction_over_group_temps_strided_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -1440,15 +1471,47 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1466,7 +1529,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1518,13 +1584,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi 
* wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1575,7 +1641,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1592,7 +1658,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1603,11 +1669,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1650,7 +1715,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1667,7 +1732,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -1784,15 +1849,16 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1811,7 +1877,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = 
std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1863,13 +1932,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1920,7 +1989,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1937,7 +2006,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1948,11 +2017,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1995,7 +2063,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -2012,7 +2080,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -3881,15 +3949,16 @@ sycl::event search_over_group_temps_strided_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t preferred_reductions_per_wi = 4; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { 
cgh.depends_on(depends); @@ -3904,7 +3973,10 @@ sycl::event search_over_group_temps_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -3956,13 +4028,13 @@ sycl::event search_over_group_temps_strided_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4031,7 +4103,7 @@ sycl::event search_over_group_temps_strided_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4050,7 +4122,7 @@ sycl::event search_over_group_temps_strided_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4065,11 +4137,10 @@ sycl::event search_over_group_temps_strided_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4114,7 +4185,7 @@ sycl::event search_over_group_temps_strided_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4135,7 +4206,7 @@ sycl::event search_over_group_temps_strided_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -4278,15 +4349,16 @@ sycl::event search_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t 
reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -4304,7 +4376,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -4356,13 +4431,13 @@ sycl::event search_axis1_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4425,7 +4500,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4444,7 +4519,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4459,11 +4534,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4508,7 +4582,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4529,7 +4603,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -4657,15 +4731,16 @@ sycl::event search_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on 
CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -4684,7 +4759,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -4736,13 +4814,13 @@ sycl::event search_axis0_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4806,7 +4884,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4825,7 +4903,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4840,11 +4918,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4889,7 +4966,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4910,7 +4987,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); From 6a0b09c1157d7c68f68ffc20738f601e29784dfd 
Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 08:52:01 -0500 Subject: [PATCH 02/10] Apply SequentialReductionKernel to axis0 reduction --- .../libtensor/include/kernels/reductions.hpp | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7c40623048..36aad581a6 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -696,7 +696,41 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - { + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { sycl::event res_init_ev = exec_q.fill( res_tp, resTy(identity_val), iter_nelems, depends); @@ -1849,6 +1883,41 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; From f74eae0d86bbd1971661e9f022e948d4f9eb490b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 14:05:20 -0500 Subject: [PATCH 03/10] Split _tensor_impl into three extensions _tensor_impl continues holding constructors, where, clip _tensor_elementwise_impl holds elementwise functions _tensor_reductions_impl holds reduction functions. 
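Note on how the split surfaces at the Python level: after this patch, Python code is expected to import the
kernel bindings from the dedicated extension modules rather than from _tensor_impl alone. A minimal usage
sketch (the module and function names below follow patch 04 of this series; treat it as illustrative, not
as the full public surface):

    # illustrative sketch based on patch 04 of this series
    import dpctl.tensor._tensor_impl as ti                 # constructors, where, clip
    import dpctl.tensor._tensor_elementwise_impl as tei    # elementwise kernels
    import dpctl.tensor._tensor_reductions_impl as tri     # reduction kernels

    # reductions now resolve their kernels through tri, e.g.
    #   tri._sum_over_axis, tri._max_over_axis, tri._argmin_over_axis
    # elementwise helpers resolve through tei, e.g.
    #   tei._maximum, tei._minimum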
--- dpctl/tensor/CMakeLists.txt | 68 +++++++++++++------ .../{tensor_py.cpp => tensor_ctors.cpp} | 8 +-- .../libtensor/source/tensor_elementwise.cpp | 34 ++++++++++ .../libtensor/source/tensor_reductions.cpp | 37 ++++++++++ 4 files changed, 120 insertions(+), 27 deletions(-) rename dpctl/tensor/libtensor/source/{tensor_py.cpp => tensor_ctors.cpp} (98%) create mode 100644 dpctl/tensor/libtensor/source/tensor_elementwise.cpp create mode 100644 dpctl/tensor/libtensor/source/tensor_reductions.cpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 9c02a325bc..d1de208805 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -113,10 +113,13 @@ set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) +set(_boolean_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp +) set(_tensor_impl_sources - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp @@ -128,19 +131,39 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) -list(APPEND _tensor_impl_sources - ${_elementwise_sources} - ${_reduction_sources} +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_boolean_reduction_sources} + ${_reduction_sources} ) +set(_py_trgts) + set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -170,19 +193,22 @@ if (UNIX) ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp PROPERTIES 
COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES") endif() -target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) -target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) -if(UNIX) - # this option is supported on Linux only - target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) -endif() -target_include_directories(${python_module_name} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ -) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") -target_link_options(${python_module_name} PRIVATE ${_linker_options}) -add_dependencies(${python_module_name} _dpctl4pybind11_deps) -install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +foreach(python_module_name ${_py_trgts}) + target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) + if(UNIX) + # this option is supported on Linux only + target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) + endif() + target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + add_dependencies(${python_module_name} _dpctl4pybind11_deps) + install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +endforeach() diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp similarity index 98% rename from dpctl/tensor/libtensor/source/tensor_py.cpp rename to dpctl/tensor/libtensor/source/tensor_ctors.cpp index d07d5cf084..8db874cbe1 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -1,4 +1,5 @@ -//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===// +//===-- tensor_ctors.cpp - ---*-C++-*-/===// +// Implementation of _tensor_impl module // // Data Parallel Control (dpctl) // @@ -43,7 +44,6 @@ #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -#include "elementwise_functions/elementwise_common.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" @@ -454,8 +454,4 @@ PYBIND11_MODULE(_tensor_impl, m) "Returns a tuple of events: (hev, ev)", py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - - dpctl::tensor::py_internal::init_elementwise_functions(m); - dpctl::tensor::py_internal::init_boolean_reduction_functions(m); - dpctl::tensor::py_internal::init_reduction_functions(m); } diff --git a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp new file mode 100644 index 0000000000..1a86526893 --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp @@ -0,0 +1,34 @@ +//===-- tensor_elementwise.cpp ---*-C++-*-/===// +// Implementation of _tensor_elementwise_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "elementwise_functions/elementwise_common.hpp" +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_elementwise_impl, m) +{ + dpctl::tensor::py_internal::init_elementwise_functions(m); +} diff --git a/dpctl/tensor/libtensor/source/tensor_reductions.cpp b/dpctl/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 0000000000..138c31f3eb --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,37 @@ +//===-- tensor_reductions.cpp - --*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "boolean_reductions.hpp" +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_reduction_functions(m); +} From 421b2703e3cad211e2715a9a3e9cbf616b1e28c3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 14:07:02 -0500 Subject: [PATCH 04/10] Used new native extension modules --- dpctl/tensor/_clip.py | 5 +++-- dpctl/tensor/_elementwise_funcs.py | 2 +- dpctl/tensor/_reduction.py | 25 +++++++++++++------------ dpctl/tensor/_utility_functions.py | 5 +++-- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py index 5a3a96933f..eeed87b404 100644 --- a/dpctl/tensor/_clip.py +++ b/dpctl/tensor/_clip.py @@ -16,6 +16,7 @@ import dpctl import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei import dpctl.tensor._tensor_impl as ti from dpctl.tensor._copy_utils import ( _empty_like_orderK, @@ -429,9 +430,9 @@ def clip(x, min=None, max=None, out=None, order="K"): "only one of `min` and `max` is permitted to be `None`" ) elif max is None: - return _clip_none(x, min, out, order, ti._maximum) + return _clip_none(x, min, out, order, tei._maximum) elif min is None: - return _clip_none(x, max, out, order, ti._minimum) + return _clip_none(x, max, out, order, tei._minimum) else: q1, x_usm_type = x.sycl_queue, x.usm_type q2, min_usm_type = _get_queue_usm_type(min) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index aa5ba04b19..9879960999 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_elementwise_impl as ti from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc from ._type_utils import _acceptance_fn_divide diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 0edc9ac12b..0cd302cccc 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -19,6 +19,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri from ._type_utils import _to_device_supported_dtype @@ -220,8 +221,8 @@ def sum(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._sum_over_axis, - ti._sum_over_axis_dtype_supported, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, _default_reduction_dtype, _identity=0, ) @@ -281,8 +282,8 @@ def prod(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._prod_over_axis, - ti._prod_over_axis_dtype_supported, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, _default_reduction_dtype, _identity=1, ) @@ -335,8 +336,8 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._logsumexp_over_axis, - lambda inp_dt, res_dt, *_: ti._logsumexp_over_axis_dtype_supported( + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( inp_dt, res_dt ), _default_reduction_dtype_fp_types, @@ -391,8 +392,8 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._hypot_over_axis, - lambda inp_dt, res_dt, *_: ti._hypot_over_axis_dtype_supported( + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( inp_dt, res_dt ), _default_reduction_dtype_fp_types, @@ -468,7 +469,7 @@ def max(x, axis=None, keepdims=False): entire array, a zero-dimensional array is returned. The returned array has the same data type as `x`. """ - return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) + return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis) def min(x, axis=None, keepdims=False): @@ -496,7 +497,7 @@ def min(x, axis=None, keepdims=False): entire array, a zero-dimensional array is returned. The returned array has the same data type as `x`. """ - return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) + return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis) def _search_over_axis(x, axis, keepdims, _reduction_fn): @@ -577,7 +578,7 @@ def argmax(x, axis=None, keepdims=False): zero-dimensional array is returned. The returned array has the default array index data type for the device of `x`. """ - return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) + return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis) def argmin(x, axis=None, keepdims=False): @@ -609,4 +610,4 @@ def argmin(x, axis=None, keepdims=False): zero-dimensional array is returned. The returned array has the default array index data type for the device of `x`. 
""" - return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) + return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis) diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py index 500c997e8f..69a1a200df 100644 --- a/dpctl/tensor/_utility_functions.py +++ b/dpctl/tensor/_utility_functions.py @@ -3,6 +3,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri def _boolean_reduction(x, axis, keepdims, func): @@ -94,7 +95,7 @@ def all(x, axis=None, keepdims=False): An array with a data type of `bool` containing the results of the logical AND reduction. """ - return _boolean_reduction(x, axis, keepdims, ti._all) + return _boolean_reduction(x, axis, keepdims, tri._all) def any(x, axis=None, keepdims=False): @@ -122,4 +123,4 @@ def any(x, axis=None, keepdims=False): An array with a data type of `bool` containing the results of the logical OR reduction. """ - return _boolean_reduction(x, axis, keepdims, ti._any) + return _boolean_reduction(x, axis, keepdims, tri._any) From 41ec378f44dedc6c093f4d3fb1361285cac91811 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 02:12:45 -0500 Subject: [PATCH 05/10] Added docstrings and getter methods for ElementwiseFunc classes Added stable API to retrieve implementation functions in each elementwise function class instance to allow `dpnp` to access that information using stable API. --- dpctl/tensor/_elementwise_common.py | 150 ++++++++++++++++++ .../elementwise/test_elementwise_classes.py | 80 ++++++++++ 2 files changed, 230 insertions(+) create mode 100644 dpctl/tests/elementwise/test_elementwise_classes.py diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index baaac078b5..fe2a77a9d5 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -39,6 +39,31 @@ class UnaryElementwiseFunc: """ Class that implements unary element-wise functions. + + Args: + name (str): + Name of the unary function + result_type_resovler_fn (callable): + Function that takes dtype of the input and + returns the dtype of the result if the + implementation functions supports it, or + returns `None` otherwise. + unary_dp_impl_fn (callable): + Data-parallel implementation function with signature + `impl_fn(src: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src` is the argument array, `dst` is the + array to be populated with function values, effectively + evaluating `dst = func(src)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including lifetime management of argument Python objects to ensure + that their associated USM allocation is not freed before offloaded + computational tasks complete execution, while the second event + corresponds to computational tasks associated with function + evaluation. + docs (str): + Documentation string for the unary function. """ def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): @@ -55,8 +80,31 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + def get_implementation_function(self): + """Returns the implementation function for + this elementwise unary function. + + """ + return self.unary_fn_ + + def get_type_result_resolver_function(self): + """Returns the type resolver function for this + elementwise unary function. 
+ """ + return self.result_type_resolver_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.sin.types + # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D'] + """ types = self.types_ if not types: types = [] @@ -363,6 +411,56 @@ def _get_shape(o): class BinaryElementwiseFunc: """ Class that implements binary element-wise functions. + + Args: + name (str): + Name of the unary function + result_type_resovle_fn (callable): + Function that takes dtypes of the input and + returns the dtype of the result if the + implementation functions supports it, or + returns `None` otherwise. + binary_dp_impl_fn (callable): + Data-parallel umplementation function with signature + `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src1` and `src2` are the argument arrays, `dst` is the + array to be populated with function values, + i.e. `dst=func(src1, src2)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including lifetime management of argument Python objects to ensure + that their associated USM allocation is not freed before offloaded + computational tasks complete execution, while the second event + corresponds to computational tasks associated with function + evaluation. + docs (str): + Documentation string for the unary function. + binary_inplace_fn (callable, optional): + Data-parallel omplementation function with signature + `impl_fn(src: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src` is the argument array, `dst` is the + array to be populated with function values, + i.e. `dst=func(dst, src)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including async lifetime management of Python arguments, + while the second event corresponds to computational tasks + associated with function evaluation. + acceptance_fn (callable, optional): + Function to influence type promotion behavior of this binary + function. The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - The :class:`dpctl.SyclDevice` where the function + evaluation is carried out. + The function is only called when both arguments of the binary + function require casting, e.g. both arguments of + `dpctl.tensor.logaddexp` are arrays with integral data type. """ def __init__( @@ -392,8 +490,60 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + def get_implementation_function(self): + """Returns the out-of-place implementation + function for this elementwise binary function. + + """ + return self.binary_fn_ + + def get_implementation_inplace_function(self): + """Returns the in-place implementation + function for this elementwise binary function. + + """ + return self.binary_inplace_fn_ + + def get_type_result_resolver_function(self): + """Returns the type resolver function for this + elementwise binary function. 
+ """ + return self.result_type_resolver_fn_ + + def get_type_promotion_path_acceptance_function(self): + """Returns the acceptance function for this + elementwise binary function. + + Acceptance function influences the type promotion + behavior of this binary function. + The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation + is carried out. + + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ types = self.types_ if not types: types = [] diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py new file mode 100644 index 0000000000..b7f1d26d6e --- /dev/null +++ b/dpctl/tests/elementwise/test_elementwise_classes.py @@ -0,0 +1,80 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dpctl.tensor as dpt + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r From 645044a6b6e7f4d5b9be8535bff4d612335d4e74 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 05:50:20 -0500 Subject: [PATCH 06/10] Instantiate atomic reduction templates for min/max ops for double/float types Added entries for float and double types to TypePairSupportDataForCompReductionAtomic as spotted by @ndgrigorian in the PR review. Also moved comments around. 
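With the float and double entries added to TypePairSupportDataForCompReductionAtomic, same-dtype min/max
reductions over floating-point inputs may now dispatch to the atomic reduction kernels as well. A small,
hypothetical check of the user-visible behavior (array values and shapes chosen only for illustration):

    # illustrative sketch; dpt.max may take the atomic reduction path
    # for float32/float64 inputs after this change
    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(1024, dtype=dpt.float32), (32, 32))
    m = dpt.max(x, axis=0)
    print(dpt.asnumpy(m))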
--- .../libtensor/include/kernels/reductions.hpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 36aad581a6..f3754e8820 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -2247,11 +2247,10 @@ template struct TypePairSupportDataForCompReductionAtomic { - /* value if true a kernel for must be instantiated, false + /* value is true if a kernel for must be instantiated, false * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< // input int32 td_ns::TypePairDefinedEntry, // input uint32 @@ -2260,6 +2259,10 @@ struct TypePairSupportDataForCompReductionAtomic td_ns::TypePairDefinedEntry, // input uint64 td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, // fall-through td_ns::NotDefinedEntry>::is_defined; }; @@ -2268,19 +2271,17 @@ template struct TypePairSupportDataForCompReductionTemps { - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input bool td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, - // input uint8_t td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, - // input uint16_t td_ns::TypePairDefinedEntry, From 097ecf5c6af6264b56aa7b8465e4775906d0bebd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 05:54:16 -0500 Subject: [PATCH 07/10] Modified sycl_timer example to use dpctl.tensor function This removes use of dpnp.matmul from the example, making this example self-contained. --- examples/python/sycl_timer.py | 41 +++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/python/sycl_timer.py b/examples/python/sycl_timer.py index f4b1416784..8ae49fd60d 100644 --- a/examples/python/sycl_timer.py +++ b/examples/python/sycl_timer.py @@ -15,14 +15,27 @@ # limitations under the License. 
-import dpnp import numpy as np import dpctl import dpctl.tensor as dpt from dpctl import SyclTimer -n = 4000 + +def matmul(m1, m2): + """Naive matrix multiplication implementation""" + assert m1.ndim == 2 + assert m2.ndim == 2 + assert m1.shape[1] == m2.shape[0] + m1 = m1[:, dpt.newaxis, :] + m2 = dpt.permute_dims(m2, (1, 0))[dpt.newaxis, :, :] + # form m_prod[i, j, k] = m1[i,k] * m2[k, j] + m_prods = m1 * m2 + # sum over k + return dpt.sum(m_prods, axis=-1) + + +n = 500 try: q = dpctl.SyclQueue(property="enable_profiling") @@ -33,32 +46,36 @@ ) exit(0) -a = dpt.reshape(dpt.arange(n * n, dtype=np.float32, sycl_queue=q), (n, n)) -b = dpt.reshape( - dpt.asarray(np.random.random(n * n), dtype=np.float32, sycl_queue=q), (n, n) -) +a_flat = dpt.arange(n * n, dtype=dpt.float32, sycl_queue=q) +a = dpt.reshape(a_flat, (n, n)) -timer = SyclTimer(time_scale=1) +b_rand = np.random.random(n * n).astype(np.float32) +b_flat = dpt.asarray(b_rand, dtype=dpt.float32, sycl_queue=q) +b = dpt.reshape(b_flat, (n, n)) wall_times = [] device_times = [] + print( - f"Performing matrix multiplication of two {n} by {n} matrices " + f"Computing naive matrix multiplication of two {n} by {n} matrices " f"on {q.sycl_device.name}, repeating 5 times." ) +print() for _ in range(5): + timer = SyclTimer(time_scale=1) with timer(q): - a_matmul_b = dpnp.matmul(a, b) + a_matmul_b = matmul(a, b) host_time, device_time = timer.dt wall_times.append(host_time) device_times.append(device_time) -c = dpnp.asnumpy(a_matmul_b) -cc = np.dot(dpnp.asnumpy(a), dpnp.asnumpy(b)) +c = dpt.asnumpy(a_matmul_b) +cc = np.dot(dpt.asnumpy(a), dpt.asnumpy(b)) print("Wall time: ", wall_times, "\nDevice time: ", device_times) +print() print( "Accuracy test: passed." if np.allclose(c, cc) - else (f"Accuracy test: failed. Discrepancy {np.max(np.abs(c-cc))}") + else (f"Accuracy test: FAILED. 
\n Discrepancy = {np.max(np.abs(c-cc))}") ) From af28d981d513729a31349fe19cb1da10515ab849 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Fri, 3 Nov 2023 13:19:41 -0700 Subject: [PATCH 08/10] Improves performance of search reductions for small numbers of elements (#1464) * Adds SequentialSearchReduction functor to search reductions * Search reductions use correct branch for float16 constexpr branch logic accounted for floating point types but not sycl::half, which meant NaNs were not propagating for float16 data --- .../libtensor/include/kernels/reductions.hpp | 251 +++++++++++++++++- 1 file changed, 248 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index f3754e8820..6651483c6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -3401,6 +3401,129 @@ struct LogSumExpOverAxis0TempsContigFactory // Argmax and Argmin +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if 
(greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + /* = Search reduction using reduce_over_group*/ template ) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { if (val < local_red_val || std::isnan(val)) { local_red_val = val; if constexpr (!First) { @@ -3714,7 +3839,9 @@ struct CustomSearchReduction } } } - else if constexpr (std::is_floating_point_v) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { if (val > local_red_val || std::isnan(val)) { local_red_val = val; if constexpr (!First) { @@ -3757,7 +3884,9 @@ struct CustomSearchReduction ? local_idx : idx_identity_; } - else if constexpr (std::is_floating_point_v) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { // equality does not hold for NaNs, so check here local_idx = (red_val_over_wg == local_red_val || std::isnan(local_red_val)) @@ -3799,6 +3928,14 @@ typedef sycl::event (*search_strided_impl_fn_ptr)( py::ssize_t, const std::vector &); +template +class search_seq_strided_krn; + template class custom_search_over_group_temps_strided_krn; +template +class search_seq_contig_krn; + template (); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 4; // max_max_wg prevents running out of resources on CPU size_t max_wg = @@ -4419,6 +4594,39 @@ sycl::event search_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return 
comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = @@ -4801,6 +5009,43 @@ sycl::event search_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = From eb21e50cc1ee29cf7c7e6e442d0075c3425b16e9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 11:01:43 -0500 Subject: [PATCH 09/10] Remove superfluous includes in tensor_ctors.cpp per PR review --- dpctl/tensor/libtensor/source/tensor_ctors.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index 8db874cbe1..4720f6baa1 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -37,7 +37,6 @@ #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -#include "boolean_reductions.hpp" #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" @@ -48,7 +47,6 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" -#include "reductions/reduction_common.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" From d4d499267ef3cc7a2d0c73f65d287efc589f8dfb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 11:03:17 -0500 Subject: [PATCH 10/10] Fixed misspelled words --- dpctl/tensor/_elementwise_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index fe2a77a9d5..4a0d1c451f 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -421,7 +421,7 @@ class BinaryElementwiseFunc: implementation functions supports it, or returns `None` otherwise. binary_dp_impl_fn (callable): - Data-parallel umplementation function with signature + Data-parallel implementation function with signature `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray, sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` where the `src1` and `src2` are the argument arrays, `dst` is the @@ -437,7 +437,7 @@ class BinaryElementwiseFunc: docs (str): Documentation string for the unary function. 
binary_inplace_fn (callable, optional): - Data-parallel omplementation function with signature + Data-parallel implementation function with signature `impl_fn(src: usm_ndarray, dst: usm_ndarray, sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` where the `src` is the argument array, `dst` is the