From 294b2f2749f2311817250801b1c945c6f1fbd5df Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 5 Nov 2024 00:31:13 -0800
Subject: [PATCH 01/18] fixes

---
 .../vertex_partitioning_default_kernel.hpp    |   8 +-
 .../algo/louvain/backend/cpu/louvain_data.hpp |   6 +-
 .../vertex_partitioning_default_kernel.hpp    |   2 +-
 .../objective_function/test/fixture.hpp       |   8 +-
 .../objective_function/test/spmd_fixture.hpp  |   6 +-
 .../optimizers/test/cg_solver_dpc.cpp         |   8 +-
 .../primitives/optimizers/test/fixture.hpp    |  10 +-
 .../optimizers/test/newton_cg_dpc.cpp         |  14 +-
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp |  25 ++
 .../dal/backend/primitives/rng/rng_cpu.hpp    | 105 ++++++
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 172 ++++++++++
 .../primitives/rng/rng_engine_collection.hpp  |  77 ++---
 .../dal/backend/primitives/rng/rng_gpu.hpp    | 220 +++++++++++++
 .../backend/primitives/rng/test/rng_dpc.cpp   | 300 ++++++++++++++++++
 14 files changed, 873 insertions(+), 88 deletions(-)
 create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng.hpp
 create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
 create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
 create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
 create mode 100644 cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp

diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
index 4da1866e277..218f7da46bc 100644
--- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -24,7 +24,7 @@
 #include "oneapi/dal/backend/memory.hpp"
 #include "oneapi/dal/backend/interop/common.hpp"
 #include "oneapi/dal/table/homogen.hpp"
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 #include "oneapi/dal/detail/threading.hpp"
 
 namespace oneapi::dal::preview::connected_components::backend {
@@ -90,9 +90,9 @@ std::int32_t most_frequent_element(const std::atomic<std::int32_t> *components,
                                    const std::int64_t &samples_count = 1024) {
     std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count);
 
-    dal::backend::primitives::engine eng;
-    dal::backend::primitives::rng<std::int32_t> rn_gen;
-    rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_state(), 0, vertex_count);
+    dal::backend::primitives::daal_engine eng;
+    dal::backend::primitives::daal_rng<std::int32_t> rn_gen;
+    rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_cpu_engine_state(), 0, vertex_count);
 
     std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count);
 
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
index d21de8c9627..b016a5bf6e9 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "oneapi/dal/backend/memory.hpp"
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 
 namespace oneapi::dal::preview::louvain::backend {
 using namespace oneapi::dal::preview::detail;
@@ -123,8 +123,8 @@ struct louvain_data {
     // Total link weight in the network
     value_type m;
 
-    engine eng;
-    rng<std::int32_t> rn_gen;
+    daal_engine<engine_list_cpu::mt2203> eng;
+    daal_rng<std::int32_t> rn_gen;
 
     const std::int64_t vertex_count;
     const std::int64_t edge_count;
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
index 79e294e9f47..7b277d88283 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology<IndexType>& t,
         ld.random_order[index] = index;
     }
     // random shuffle
-    ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_state(), 0, t._vertex_count);
+    ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_cpu_engine_state(), 0, t._vertex_count);
     for (std::int64_t index = 0; index < t._vertex_count; ++index) {
         std::swap(ld.random_order[index], ld.random_order[ld.index[index]]);
     }
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index fabe919b34e..d673470b042 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -25,7 +25,7 @@
 #include "oneapi/dal/table/csr_accessor.hpp"
 #include "oneapi/dal/detail/debug.hpp"
 
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 
 namespace oneapi::dal::backend::primitives::test {
 
@@ -572,13 +572,13 @@ class logloss_test : public te::float_algo_fixture<std::tuple_element_t<0, Param
         const std::int64_t p = hessian_host.get_dimension(0) - 1;
         const std::int64_t dim = fit_intercept ? p + 1 : p;
 
-        primitives::rng<float_t> rn_gen;
+        primitives::daal_rng<float_t> rn_gen;
         auto vec_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host);
 
         for (std::int32_t ij = 0; ij < num_checks; ++ij) {
-            primitives::engine eng(2007 + dim * num_checks + ij);
-            rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0);
+            primitives::daal_engine eng(2007 + dim * num_checks + ij);
+            rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
             auto vec_gpu = vec_host.to_device(this->get_queue());
             auto out_vector =
                 ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
index e902dd452e1..e2a611c2c98 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
@@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test<Param> {
         std::int64_t num_checks = 5;
 
         std::vector<ndarray<float_t, 1>> vecs_host(num_checks), vecs_gpu(num_checks);
-        rng<float_t> rn_gen;
+        daal_rng<float_t> rn_gen;
         for (std::int64_t ij = 0; ij < num_checks; ++ij) {
-            engine eng(2007 + dim * num_checks + ij);
+            daal_engine eng(2007 + dim * num_checks + ij);
             vecs_host[ij] =
                 (ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host));
-            rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng.get_state(), -1.0, 1.0);
+            rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
             vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue());
         }
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
index ea320f690a2..36e20f03c11 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
@@ -20,7 +20,7 @@
 #include "oneapi/dal/test/engine/common.hpp"
 #include "oneapi/dal/test/engine/fixtures.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 #include <math.h>
 
 namespace oneapi::dal::backend::primitives::test {
@@ -43,9 +43,9 @@ class cg_solver_test : public te::float_algo_fixture<Param> {
         x_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         b_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
-        primitives::rng<float_t> rn_gen;
-        primitives::engine eng(4014 + n_);
-        rn_gen.uniform(n_, x_host_.get_mutable_data(), eng.get_state(), -1.0, 1.0);
+        primitives::daal_rng<float_t> rn_gen;
+        primitives::daal_engine eng(4014 + n_);
+        rn_gen.uniform(n_, x_host_.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host_);
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
index a6b87b2dcc1..777c0ee68e2 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
@@ -21,7 +21,7 @@
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include "oneapi/dal/test/engine/common.hpp"
 #include "oneapi/dal/test/engine/fixtures.hpp"
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 #include "oneapi/dal/backend/primitives/blas/gemv.hpp"
 #include "oneapi/dal/backend/primitives/element_wise.hpp"
 
@@ -133,11 +133,11 @@ void create_stable_matrix(sycl::queue& queue,
     ONEDAL_ASSERT(A.get_dimension(1) == n);
     auto J = ndarray<Float, 2>::empty(queue, { n, n }, sycl::usm::alloc::host);
     auto eigen_values = ndarray<Float, 1>::empty(queue, { n }, sycl::usm::alloc::host);
-    primitives::rng<Float> rn_gen;
-    primitives::engine eng(2007 + n);
+    primitives::daal_rng<Float> rn_gen;
+    primitives::daal_engine eng(2007 + n);
 
-    rn_gen.uniform(n * n, J.get_mutable_data(), eng.get_state(), -1.0, 1.0);
-    rn_gen.uniform(n, eigen_values.get_mutable_data(), eng.get_state(), bottom_eig, top_eig);
+    rn_gen.uniform(n * n, J.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
+    rn_gen.uniform(n, eigen_values.get_mutable_data(), eng.get_cpu_engine_state(), bottom_eig, top_eig);
 
     // orthogonalize matrix J
     gram_schmidt(J);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index f473dddf1f7..d4f5ea55fb9 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -22,7 +22,7 @@
 #include "oneapi/dal/test/engine/common.hpp"
 #include "oneapi/dal/test/engine/fixtures.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 #include <math.h>
 
 #include "oneapi/dal/backend/primitives/objective_function.hpp"
@@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<std::int32_t, 1>::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host);
         auto params_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host);
-        primitives::rng<float_t> rn_gen;
-        primitives::engine eng(2007 + n);
-        rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng.get_state(), -10.0, 10.0);
-        rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng.get_state(), -5.0, 5.0);
+        primitives::daal_rng<float_t> rn_gen;
+        primitives::daal_engine eng(2007 + n);
+        rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng.get_cpu_engine_state(), -10.0, 10.0);
+        rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng.get_cpu_engine_state(), -5.0, 5.0);
         for (std::int64_t i = 0; i < n_; ++i) {
             float_t val = 0;
             for (std::int64_t j = 0; j < p_; ++j) {
@@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto b_host = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         primitives::rng<float_t> rn_gen;
         primitives::engine eng(4014 + n_);
-        rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_state(), -1.0, 1.0);
+        rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0));
 
@@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto buffer = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         for (std::int32_t test_num = 0; test_num < 5; ++test_num) {
-            rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_state(), -1.0, 1.0);
+            rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
             auto x_gpu = x_host.to_device(this->get_queue());
             auto compute_event_vec = func_->update_x(x_gpu, true, {});
             wait_or_pass(compute_event_vec).wait_and_throw();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
new file mode 100644
index 00000000000..a89ca3d4505
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -0,0 +1,25 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp"
+
+#ifdef ONEDAL_DATA_PARALLEL
+
+#include "oneapi/dal/backend/primitives/rng/rng_gpu.hpp"
+
+#endif
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
new file mode 100644
index 00000000000..a692070551e
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
@@ -0,0 +1,105 @@
+/*******************************************************************************
+* Copyright 2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include <daal/include/algorithms/engines/mt2203/mt2203.h>
+#include <daal/include/algorithms/engines/mcg59/mcg59.h>
+#include <daal/include/algorithms/engines/mt19937/mt19937.h>
+#include "oneapi/dal/backend/primitives/rng/utils.hpp"
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
+namespace oneapi::dal::backend::primitives {
+/// Engine kinds that can be selected as the `EngineType` parameter of `daal_engine`.
+enum class engine_list_cpu { mt2203, mcg59, mt19937 };
+
+template <engine_list_cpu EngineType = engine_list_cpu::mt2203>
+class daal_engine {
+public:
+    /// Builds the DAAL engine from `seed`; throws std::domain_error if it lacks `BatchBaseImpl`.
+    explicit daal_engine(std::int64_t seed = 777) : daal_engine_(initialize_daal_engine(seed)) {
+        using impl_t = daal::algorithms::engines::internal::BatchBaseImpl;
+        impl_ = dynamic_cast<impl_t*>(daal_engine_.get());
+        if (impl_ == nullptr) {
+            throw std::domain_error("RNG engine is not supported");
+        }
+    }
+
+    virtual ~daal_engine() = default;
+
+    /// Opaque state pointer obtained from the underlying engine implementation.
+    void* get_cpu_engine_state() const {
+        return impl_->getState();
+    }
+
+    /// Mutable access to the owned DAAL engine pointer.
+    auto& get_cpu_engine() {
+        return daal_engine_;
+    }
+
+private:
+    daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) {
+        namespace engines = daal::algorithms::engines;
+        switch (EngineType) {
+            case engine_list_cpu::mt2203: return engines::mt2203::Batch<>::create(seed);
+            case engine_list_cpu::mcg59: return engines::mcg59::Batch<>::create(seed);
+            case engine_list_cpu::mt19937: return engines::mt19937::Batch<>::create(seed);
+            default: throw std::invalid_argument("Unsupported engine type");
+        }
+    }
+
+    daal::algorithms::engines::EnginePtr daal_engine_;
+    daal::algorithms::engines::internal::BatchBaseImpl* impl_;
+};
+
+/// Host-side random generation helpers that operate on an opaque engine state pointer.
+template <typename Type, typename Size = std::int64_t>
+class daal_rng {
+public:
+    daal_rng() = default;
+    ~daal_rng() = default;
+
+    /// Forwards to `uniform_by_cpu`: `count` values written to `dst` for bounds `a`, `b`.
+    void uniform(Size count, Type* dst, void* state, Type a, Type b) {
+        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+    }
+
+    /// Forwards all arguments unchanged to `uniform_without_replacement_by_cpu`.
+    void uniform_without_replacement_cpu(Size count,
+                                         Type* dst,
+                                         Type* buffer,
+                                         void* state,
+                                         Type a,
+                                         Type b) {
+        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state,
+                                                                     a, b);
+    }
+
+    /// Enabled when `T` (defaulting to `Type`) is integral: performs `count` swaps of `dst`
+    /// entries whose two indices are drawn per step via `uniform_by_cpu` with bounds 0, `count`.
+    template <typename T = Type, typename = std::enable_if_t<std::is_integral_v<T>>>
+    void shuffle(Size count, Type* dst, void* state) {
+        Type picked[2];
+        for (Size step = 0; step < count; ++step) {
+            uniform_dispatcher::uniform_by_cpu<Type>(2, picked, state, 0, count);
+            std::swap(dst[picked[0]], dst[picked[1]]);
+        }
+    }
+};
+
+} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
new file mode 100644
index 00000000000..79b5418d9d8
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -0,0 +1,172 @@
+/*******************************************************************************
+* Copyright 2022 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <oneapi/mkl.hpp>
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
+#include "oneapi/dal/backend/primitives/ndarray.hpp"
+
+namespace oneapi::dal::backend::primitives {
+
+namespace bk = oneapi::dal::backend;
+
+template <typename Type, typename Size>
+template <engine_list EngineType>
+void oneapi_rng<Type, Size>::uniform_gpu(sycl::queue& queue,
+                                         Size count,
+                                         Type* dst,
+                                         oneapi_engine<EngineType>& engine_,
+                                         Type a,
+                                         Type b,
+                                         const event_vector& deps) {
+    oneapi::mkl::rng::uniform<Type> distr(a, b);
+    oneapi::mkl::rng::generate(distr, engine_.get_gpu_engine(), count, dst, { deps })
+        .wait_and_throw(); // block until the generation event has completed
+    engine_.skip_ahead_cpu(count); // presumably keeps the host engine in step - confirm
+}
+
+template <typename Type, typename Size>
+template <engine_list EngineType>
+void oneapi_rng<Type, Size>::uniform_cpu(Size count,
+                                         Type* dst,
+                                         oneapi_engine<EngineType>& engine_,
+                                         Type a,
+                                         Type b) {
+    void* const host_state = engine_.get_cpu_engine_state();
+    engine_.skip_ahead_gpu(count); // device engine is skipped before host generation runs
+    uniform_dispatcher::uniform_by_cpu<Type>(count, dst, host_state, a, b);
+}
+
+template <typename Type, typename Size>
+template <engine_list EngineType>
+void oneapi_rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
+                                                             Size count,
+                                                             Type* dst,
+                                                             Type* buffer,
+                                                             oneapi_engine<EngineType>& engine_,
+                                                             Type a,
+                                                             Type b,
+                                                             const event_vector& deps) {
+    void* const state = engine_.get_cpu_engine_state(); // generation runs via the CPU dispatcher
+    engine_.skip_ahead_gpu(count); // NOTE(review): `queue` and `deps` are not used here
+    uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
+}
+
+template <typename Type, typename Size>
+template <engine_list EngineType>
+void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
+                                         Size count,
+                                         Type* dst,
+                                         oneapi_engine<EngineType>& engine_,
+                                         const event_vector& deps) {
+    // Performs `count` swaps of `dst` entries at index pairs drawn through the CPU dispatcher.
+    void* const host_state = engine_.get_cpu_engine_state();
+    // NOTE(review): two values are drawn per iteration below, yet the skip is `count` - confirm.
+    engine_.skip_ahead_gpu(count);
+    Type picked[2];
+    for (Size step = 0; step < count; ++step) {
+        uniform_dispatcher::uniform_by_cpu<Type>(2, picked, host_state, 0, count);
+        std::swap(dst[picked[0]], dst[picked[1]]);
+    }
+}
+// Explicit instantiation of oneapi_rng<F, Size>::uniform_gpu for one engine type.
+#define INSTANTIATE_(F, Size, EngineType)                         \
+    template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_gpu( \
+        sycl::queue& queue,                                       \
+        Size count_,                                              \
+        F* dst,                                                   \
+        oneapi_engine<EngineType>& engine_,                       \
+        F a,                                                      \
+        F b,                                                      \
+        const event_vector& deps);
+// Expands INSTANTIATE_ for float, double and int over all three engine_list values.
+#define INSTANTIATE_FLOAT_(Size)                     \
+    INSTANTIATE_(float, Size, engine_list::mt2203)   \
+    INSTANTIATE_(float, Size, engine_list::mcg59)    \
+    INSTANTIATE_(float, Size, engine_list::mt19937)  \
+    INSTANTIATE_(double, Size, engine_list::mt2203)  \
+    INSTANTIATE_(double, Size, engine_list::mcg59)   \
+    INSTANTIATE_(double, Size, engine_list::mt19937) \
+    INSTANTIATE_(int, Size, engine_list::mt2203)     \
+    INSTANTIATE_(int, Size, engine_list::mcg59)      \
+    INSTANTIATE_(int, Size, engine_list::mt19937)
+
+INSTANTIATE_FLOAT_(std::int64_t);
+INSTANTIATE_FLOAT_(std::int32_t);
+// Explicit instantiation of oneapi_rng<F, Size>::uniform_cpu for one engine type.
+#define INSTANTIATE_CPU(F, Size, EngineType)                      \
+    template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_cpu( \
+        Size count_,                                              \
+        F* dst,                                                   \
+        oneapi_engine<EngineType>& engine_,                       \
+        F a,                                                      \
+        F b);
+// Expands INSTANTIATE_CPU for float, double and int over all three engine_list values.
+#define INSTANTIATE_FLOAT_CPU(Size)                     \
+    INSTANTIATE_CPU(float, Size, engine_list::mt2203)   \
+    INSTANTIATE_CPU(float, Size, engine_list::mcg59)    \
+    INSTANTIATE_CPU(float, Size, engine_list::mt19937)  \
+    INSTANTIATE_CPU(double, Size, engine_list::mt2203)  \
+    INSTANTIATE_CPU(double, Size, engine_list::mcg59)   \
+    INSTANTIATE_CPU(double, Size, engine_list::mt19937) \
+    INSTANTIATE_CPU(int, Size, engine_list::mt2203)     \
+    INSTANTIATE_CPU(int, Size, engine_list::mcg59)      \
+    INSTANTIATE_CPU(int, Size, engine_list::mt19937)
+
+INSTANTIATE_FLOAT_CPU(std::int64_t);
+INSTANTIATE_FLOAT_CPU(std::int32_t);
+// Explicit instantiation of oneapi_rng<F, Size>::uniform_without_replacement_gpu.
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType)              \
+    template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_without_replacement_gpu( \
+        sycl::queue& queue,                                                           \
+        Size count_,                                                                  \
+        F* dst,                                                                       \
+        F* buff,                                                                      \
+        oneapi_engine<EngineType>& engine_,                                           \
+        F a,                                                                          \
+        F b,                                                                          \
+        const event_vector& deps);
+// Expands the macro above for float, double and int over all three engine_list values.
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                     \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)   \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)    \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)  \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)  \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)   \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)     \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)      \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937)
+
+INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
+INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
+// Explicit instantiation of oneapi_rng<F, Size>::shuffle_gpu for one engine type.
+#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                  \
+    template ONEDAL_EXPORT void oneapi_rng<F, Size>::shuffle_gpu( \
+        sycl::queue& queue,                                       \
+        Size count_,                                              \
+        F* dst,                                                   \
+        oneapi_engine<EngineType>& engine_,                       \
+        const event_vector& deps);
+// Despite the _FLOAT suffix, only `int` element types are instantiated for shuffle_gpu.
+#define INSTANTIATE_SHUFFLE_FLOAT(Size)                 \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)  \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937)
+
+INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
+INSTANTIATE_SHUFFLE_FLOAT(std::int32_t);
+
+} // namespace oneapi::dal::backend::primitives
\ No newline at end of file
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
index 09a5a589141..81ce6bf852b 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
@@ -16,78 +16,41 @@
 
 #pragma once
 
-#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp"
-
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
+#include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include <vector>
 
+#include <daal/include/algorithms/engines/mt2203/mt2203.h>
+#include <daal/include/algorithms/engines/mcg59/mcg59.h>
+#include <daal/include/algorithms/engines/mt19937/mt19937.h>
+#include "oneapi/dal/backend/primitives/rng/utils.hpp"
+#include "oneapi/dal/table/common.hpp"
+
 namespace oneapi::dal::backend::primitives {
 
-template <typename Size = std::int64_t>
+#ifdef ONEDAL_DATA_PARALLEL
+
+template <typename Size = std::int64_t, engine_list EngineType = engine_list::mt2203>
 class engine_collection {
 public:
-    explicit engine_collection(Size count, std::int64_t seed = 777)
+    engine_collection(sycl::queue& queue, Size count, std::int64_t seed = 777)
             : count_(count),
-              engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)),
-              params_(count),
-              technique_(daal::algorithms::engines::internal::family),
-              daal_engine_list_(count) {}
-
-    template <typename Op>
-    std::vector<engine> operator()(Op&& op) {
-        daal::services::Status status;
-        for (Size i = 0; i < count_; ++i) {
-            op(i, params_.nSkip[i]);
-        }
-        select_parallelization_technique(technique_);
-        daal::algorithms::engines::internal::EnginesCollection<daal::sse2> engine_collection(
-            engine_,
-            technique_,
-            params_,
-            daal_engine_list_,
-            &status);
-        if (!status) {
-            dal::backend::interop::status_to_exception(status);
-        }
-
-        std::vector<engine> engine_list(count_);
+              seed_(seed) { // NOTE(review): every engine below is built from this same seed
+        engines_.reserve(count_); // storage for `count_` engines is reserved up front
         for (Size i = 0; i < count_; ++i) {
-            engine_list[i] = daal_engine_list_[i];
+            engines_.push_back(oneapi_engine<EngineType>(queue, seed_));
         }
-
-        //copy elision
-        return engine_list;
     }
 
-private:
-    void select_parallelization_technique(
-        daal::algorithms::engines::internal::ParallelizationTechnique& technique) {
-        auto daal_engine_impl =
-            dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(engine_.get());
-
-        daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = {
-            daal::algorithms::engines::internal::family,
-            daal::algorithms::engines::internal::leapfrog,
-            daal::algorithms::engines::internal::skipahead
-        };
-
-        for (auto& techn : techniques) {
-            if (daal_engine_impl->hasSupport(techn)) {
-                technique = techn;
-                return;
-            }
-        }
-
-        throw domain_error(
-            dal::detail::error_messages::rng_engine_does_not_support_parallelization_techniques());
+    std::vector<oneapi_engine<EngineType>> get_engines() const { // returns the vector by value
+        return engines_;
     }
 
 private:
     Size count_;
-    daal::algorithms::engines::EnginePtr engine_;
-    daal::algorithms::engines::internal::Params<daal::sse2> params_;
-    daal::algorithms::engines::internal::ParallelizationTechnique technique_;
-    daal::services::internal::TArray<daal::algorithms::engines::EnginePtr, daal::sse2>
-        daal_engine_list_;
+    std::int64_t seed_;
+    std::vector<oneapi_engine<EngineType>> engines_;
 };
 
+#endif
 } // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
new file mode 100644
index 00000000000..6463534caad
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
@@ -0,0 +1,220 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include <daal/include/algorithms/engines/mt2203/mt2203.h>
+#include <daal/include/algorithms/engines/mcg59/mcg59.h>
+#include <daal/include/algorithms/engines/mt19937/mt19937.h>
+#include "oneapi/dal/backend/primitives/rng/utils.hpp"
+#include <oneapi/mkl.hpp>
+namespace mkl = oneapi::mkl;
+namespace oneapi::dal::backend::primitives {
+
+#ifdef ONEDAL_DATA_PARALLEL
+
+enum class engine_list { mt2203, mcg59, mt19937 };
+
+template <engine_list EngineType>
+struct oneapi_engine_type;
+
+template <>
+struct oneapi_engine_type<engine_list::mt2203> {
+    using type = oneapi::mkl::rng::mt2203;
+};
+
+template <>
+struct oneapi_engine_type<engine_list::mcg59> {
+    using type = oneapi::mkl::rng::mcg59;
+};
+
+template <>
+struct oneapi_engine_type<engine_list::mt19937> {
+    using type = oneapi::mkl::rng::mt19937;
+};
+
+template <engine_list EngineType = engine_list::mt2203>
+class oneapi_engine {
+public:
+    using onedal_engine_t = typename oneapi_engine_type<EngineType>::type;
+
+    explicit oneapi_engine(sycl::queue& queue, std::int64_t seed = 777)
+            : q(queue),
+              daal_engine_(initialize_daal_engine(seed)),
+              onedal_engine_(initialize_oneapi_engine(queue, seed)),
+              impl_(dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(
+                  daal_engine_.get())) {
+        if (!impl_) {
+            throw std::domain_error("RNG engine is not supported");
+        }
+    }
+
+    virtual ~oneapi_engine() = default;
+
+    void* get_cpu_engine_state() const {
+        return impl_->getState();
+    }
+
+    auto& get_cpu_engine() {
+        return daal_engine_;
+    }
+
+    auto& get_gpu_engine() {
+        return onedal_engine_;
+    }
+
+    void skip_ahead_cpu(size_t nSkip) {
+        daal_engine_->skipAhead(nSkip);
+    }
+
+    void skip_ahead_gpu(size_t nSkip) {
+        if constexpr (EngineType == engine_list::mt2203) {
+        }
+        else {
+            skip_ahead(onedal_engine_, nSkip);
+        }
+    }
+
+private:
+    daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) {
+        switch (EngineType) {
+            case engine_list::mt2203:
+                return daal::algorithms::engines::mt2203::Batch<>::create(seed);
+            case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
+            case engine_list::mt19937:
+                return daal::algorithms::engines::mt19937::Batch<>::create(seed);
+            default: throw std::invalid_argument("Unsupported engine type");
+        }
+    }
+
+    onedal_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) {
+        if constexpr (EngineType == engine_list::mt2203) {
+            return onedal_engine_t(queue, seed,
+                                   0); // Aligns CPU and GPU results for mt2203
+        }
+        else {
+            return onedal_engine_t(queue, seed);
+        }
+    }
+    sycl::queue q;
+    daal::algorithms::engines::EnginePtr daal_engine_;
+    onedal_engine_t onedal_engine_;
+    daal::algorithms::engines::internal::BatchBaseImpl* impl_;
+};
+
+template <typename Type, typename Size = std::int64_t>
+class oneapi_rng {
+public:
+    oneapi_rng() = default;
+    ~oneapi_rng() = default;
+
+    template <engine_list EngineType>
+    void uniform(sycl::queue& queue,
+                 Size count,
+                 Type* dst,
+                 oneapi_engine<EngineType>& engine_,
+                 Type a,
+                 Type b,
+                 bool distr_mode = false,
+                 const event_vector& deps = {});
+
+    template <engine_list EngineType>
+    void uniform_gpu(sycl::queue& queue,
+                     Size count,
+                     Type* dst,
+                     oneapi_engine<EngineType>& engine_,
+                     Type a,
+                     Type b,
+                     const event_vector& deps = {});
+
+    template <engine_list EngineType>
+    void uniform_cpu(Size count, Type* dst, oneapi_engine<EngineType>& engine_, Type a, Type b);
+    template <engine_list EngineType>
+    void uniform_without_replacement(sycl::queue& queue,
+                                     Size count,
+                                     Type* dst,
+                                     oneapi_engine<EngineType>& engine_,
+                                     Type a,
+                                     Type b,
+                                     const event_vector& deps = {}) {}
+
+    template <engine_list EngineType>
+    void uniform_without_replacement_gpu(sycl::queue& queue,
+                                         Size count,
+                                         Type* dst,
+                                         Type* buff,
+                                         oneapi_engine<EngineType>& engine_,
+                                         Type a,
+                                         Type b,
+                                         const event_vector& deps = {});
+
+    template <engine_list EngineType>
+    void uniform_without_replacement_cpu(Size count,
+                                         Type* dst,
+                                         Type* buffer,
+                                         oneapi_engine<EngineType>& engine_,
+                                         Type a,
+                                         Type b) {
+        void* state = engine_.get_cpu_engine_state();
+        engine_.skip_ahead_gpu(count);
+        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
+                                                                     dst,
+                                                                     buffer,
+                                                                     state,
+                                                                     a,
+                                                                     b);
+    }
+
+    template <engine_list EngineType,
+              typename T = Type,
+              typename = std::enable_if_t<std::is_integral_v<T>>>
+    void shuffle(Size count, Type* dst, oneapi_engine<EngineType>& engine_) {
+        Type idx[2];
+
+        void* state = engine_.get_cpu_engine_state();
+        engine_.skip_ahead_gpu(count);
+
+        for (Size i = 0; i < count; ++i) {
+            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
+            std::swap(dst[idx[0]], dst[idx[1]]);
+        }
+    }
+
+    template <engine_list EngineType>
+    void shuffle_gpu(sycl::queue& queue,
+                     Size count,
+                     Type* dst,
+                     oneapi_engine<EngineType>& engine_,
+                     const event_vector& deps);
+
+    template <engine_list EngineType,
+              typename T = Type,
+              typename = std::enable_if_t<std::is_integral_v<T>>>
+    void shuffle_cpu(Size count, Type* dst, oneapi_engine<EngineType>& engine_) {
+        Type idx[2];
+
+        void* state = engine_.get_cpu_engine_state();
+        engine_.skip_ahead_gpu(count);
+
+        for (Size i = 0; i < count; ++i) {
+            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
+            std::swap(dst[idx[0]], dst[idx[1]]);
+        }
+    }
+};
+
+#endif
+} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
new file mode 100644
index 00000000000..8a69f109162
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -0,0 +1,300 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "oneapi/dal/test/engine/common.hpp"
+#include "oneapi/dal/test/engine/fixtures.hpp"
+#include "oneapi/dal/test/engine/dataframe.hpp"
+
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp"
+namespace oneapi::dal::backend::primitives::test {
+
+namespace te = dal::test::engine;
+
+class mt2203 {};
+class mcg59 {};
+class mt19937 {};
+
+template <typename engine_type>
+struct engine_map {};
+
+template <>
+struct engine_map<mt2203> {
+    constexpr static auto value = engine_list::mt2203;
+};
+
+template <>
+struct engine_map<mcg59> {
+    constexpr static auto value = engine_list::mcg59;
+};
+
+template <>
+struct engine_map<mt19937> {
+    constexpr static auto value = engine_list::mt19937;
+};
+
+template <typename engine_type>
+constexpr auto engine_v = engine_map<engine_type>::value;
+
+template <typename TestType>
+class rng_test : public te::policy_fixture {
+public:
+    using Index = std::tuple_element_t<0, TestType>;
+    using EngineType = std::tuple_element_t<1, TestType>;
+    static constexpr auto engine_qq = engine_v<EngineType>;
+
+    auto get_rng() const {
+        oneapi_rng<Index> rn_gen;
+        return rn_gen;
+    }
+
+    auto get_engine(std::int64_t seed) {
+        auto rng_engine = oneapi_engine<engine_qq>(this->get_queue(), seed);
+        return rng_engine;
+    }
+
+    auto allocate_arrays(std::int64_t elem_count) {
+        auto& q = this->get_queue();
+        auto val_gpu = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
+        auto val_host = ndarray<Index, 1>::empty({ elem_count });
+
+        return std::make_tuple(val_gpu, val_host);
+    }
+
+    auto allocate_arrays_shared(std::int64_t elem_count) {
+        auto& q = this->get_queue();
+        auto val_gpu = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::shared);
+        auto val_host = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::shared);
+
+        return std::make_tuple(val_gpu, val_host);
+    }
+
+    auto allocate_arrays_device(std::int64_t elem_count) {
+        auto& q = this->get_queue();
+        auto val_gpu_1 = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
+        auto val_gpu_2 = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
+
+        return std::make_tuple(val_gpu_1, val_gpu_2);
+    }
+
+    auto allocate_arrays_host(std::int64_t elem_count) {
+        auto val_host_1 = ndarray<Index, 1>::empty({ elem_count });
+        auto val_host_2 = ndarray<Index, 1>::empty({ elem_count });
+
+        return std::make_tuple(val_host_1, val_host_2);
+    }
+
+    void check_results_host(const ndarray<Index, 1>& val_host_1,
+                            const ndarray<Index, 1>& val_host_2) {
+        const Index* val_host_1_ptr = val_host_1.get_data();
+
+        const Index* val_host_2_ptr = val_host_2.get_data();
+
+        for (std::int64_t el = 0; el < val_host_1.get_count(); el++) {
+            REQUIRE(val_host_1_ptr[el] == val_host_2_ptr[el]);
+        }
+    }
+
+    void check_results_device(const ndarray<Index, 1>& val_gpu_1,
+                              const ndarray<Index, 1>& val_gpu_2) {
+        const auto val_gpu_host_1 = val_gpu_1.to_host(this->get_queue());
+        const Index* val_gpu_host_1_ptr = val_gpu_host_1.get_data();
+
+        const auto val_gpu_host_2 = val_gpu_2.to_host(this->get_queue());
+        const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data();
+
+        for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) {
+            REQUIRE(val_gpu_host_2_ptr[el] == val_gpu_host_1_ptr[el]);
+        }
+    }
+
+    void check_results(const ndarray<Index, 1>& val_gpu, const ndarray<Index, 1>& val_host) {
+        const Index* val_host_ptr = val_host.get_data();
+
+        const auto val_gpu_host = val_gpu.to_host(this->get_queue());
+        const Index* val_gpu_host_ptr = val_gpu_host.get_data();
+
+        for (std::int64_t el = 0; el < val_host.get_count(); el++) {
+            REQUIRE(val_gpu_host_ptr[el] == val_host_ptr[el]);
+        }
+    }
+};
+
+using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59));
+
+TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
+    SKIP_IF(this->get_policy().is_cpu());
+    std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000);
+    std::int64_t seed = GENERATE_COPY(777, 999);
+
+    auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+    auto arr_gpu_ptr = arr_gpu.get_mutable_data();
+    auto arr_host_ptr = arr_host.get_mutable_data();
+
+    auto rn_gen = this->get_rng();
+    auto rng_engine = this->get_engine(seed);
+    auto rng_engine_ = this->get_engine(seed);
+
+    rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
+    rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
+
+    this->check_results(arr_gpu, arr_host);
+}
+
+using rng_types_skip = COMBINE_TYPES((float), (mcg59));
+
+// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) {
+//     SKIP_IF(this->get_policy().is_cpu());
+//     std::int64_t elem_count =
+//         GENERATE_COPY(10, 1000, 300000, 15000, 1000000, 100000000, 6100000000, 1LL * 64 * 1000000);
+//     std::int64_t seed = GENERATE_COPY(777);
+
+//     auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+//     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
+//     auto arr_host_ptr = arr_host.get_mutable_data();
+
+//     auto rn_gen = this->get_rng();
+//     auto rng_engine = this->get_engine(seed);
+//     auto rng_engine_ = this->get_engine(seed);
+
+//     BENCHMARK("Uniform dispatcher HOST arr" + std::to_string(elem_count)) {
+//         rn_gen.uniform(this->get_queue(), elem_count, arr_host_ptr, rng_engine, 0, elem_count);
+//     };
+//     BENCHMARK("Uniform dispatcher GPU arr" + std::to_string(elem_count)) {
+//         rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
+//     };
+
+//     auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count);
+//     auto arr_gpu_ptr_ = arr_gpu_.get_mutable_data();
+//     auto arr_host_ptr_ = arr_host_.get_mutable_data();
+
+//     auto rn_gen_ = this->get_rng();
+//     auto rng_engine_1 = this->get_engine(seed);
+//     auto rng_engine_2 = this->get_engine(seed);
+//     BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
+//         rn_gen_.uniform_gpu(this->get_queue(),
+//                                      elem_count,
+//                                      arr_gpu_ptr_,
+//                                      rng_engine_1,
+//                                      0,
+//                                      elem_count);
+//     };
+
+//     BENCHMARK("Uniform HOST arr" + std::to_string(elem_count)) {
+//         rn_gen_.uniform(elem_count, arr_host_ptr_, rng_engine_2, 0, elem_count);
+//     };
+// }
+
+TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) {
+    SKIP_IF(this->get_policy().is_cpu());
+    std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000);
+    std::int64_t seed = GENERATE_COPY(777, 999);
+
+    auto [arr_host_init_1, arr_host_init_2] = this->allocate_arrays_host(elem_count);
+    auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+    auto arr_host_init_1_ptr = arr_host_init_1.get_mutable_data();
+    auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data();
+    auto arr_gpu_ptr = arr_gpu.get_mutable_data();
+    auto arr_host_ptr = arr_host.get_mutable_data();
+
+    auto rn_gen = this->get_rng();
+    auto rng_engine = this->get_engine(seed);
+    auto rng_engine_2 = this->get_engine(seed);
+
+    rn_gen.uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
+    rn_gen.uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
+
+    rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+
+    this->check_results_host(arr_host_init_1, arr_host_init_2);
+    this->check_results(arr_gpu, arr_host);
+}
+
+TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) {
+    SKIP_IF(this->get_policy().is_cpu());
+    std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
+    std::int64_t seed = GENERATE_COPY(1, 777, 999);
+
+    auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_device(elem_count);
+    auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+    auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data();
+    auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data();
+    auto arr_gpu_ptr = arr_gpu.get_mutable_data();
+    auto arr_host_ptr = arr_host.get_mutable_data();
+
+    auto rn_gen = this->get_rng();
+    auto rng_engine = this->get_engine(seed);
+    auto rng_engine_2 = this->get_engine(seed);
+
+    rn_gen.uniform_gpu(this->get_queue(),
+                       elem_count,
+                       arr_device_init_1_ptr,
+                       rng_engine,
+                       0,
+                       elem_count);
+    rn_gen.uniform_gpu(this->get_queue(),
+                       elem_count,
+                       arr_device_init_2_ptr,
+                       rng_engine_2,
+                       0,
+                       elem_count);
+
+    rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+
+    this->check_results_device(arr_device_init_1, arr_device_init_2);
+    this->check_results(arr_gpu, arr_host);
+}
+
+// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) {
+//     SKIP_IF(this->get_policy().is_cpu());
+//     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
+//     std::int64_t seed = GENERATE_COPY(1, 777, 999);
+
+//     engine_collection<std::int64_t,engine_list::mcg59> collection(this->get_queue(), 2, seed);
+
+//     auto engine_arr = collection.get_engines();
+
+//     auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count);
+
+//     auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data();
+//     auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data();
+
+//     auto rn_gen = this->get_rng();
+
+//     rn_gen.uniform(this->get_queue(),
+//                    elem_count,
+//                    arr_device_init_1_ptr,
+//                    engine_arr[0],
+//                    0,
+//                    elem_count);
+
+//     rn_gen.uniform(this->get_queue(),
+//                    elem_count,
+//                    arr_device_init_2_ptr,
+//                    engine_arr[1],
+//                    0,
+//                    elem_count);
+
+//     // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, engine_arr[0], 0, elem_count);
+//     // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count);
+
+//     //this->check_results_device(arr_device_init_1, arr_device_init_2);
+//     this->check_results(arr_device_init_1, arr_device_init_2);
+// }
+
+} // namespace oneapi::dal::backend::primitives::test

From 81d7dfe7100a714152fe3203d5c193796ed1a68f Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 5 Nov 2024 07:12:50 -0800
Subject: [PATCH 02/18] minor fixes

---
 .../backend/gpu/train_kernel_hist_impl.hpp    |  2 +-
 .../gpu/train_kernel_hist_impl_dpc.cpp        | 23 +++---
 .../dal/backend/primitives/rng/rng_cpu.hpp    | 17 +++++
 .../primitives/rng/rng_engine_collection.hpp  | 72 ++++++++++++++++++-
 4 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
index 6d1c4362309..84e1d8f620f 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
@@ -50,7 +50,7 @@ class train_kernel_hist_impl {
     using model_manager_t = train_model_manager<Float, Index, Task>;
     using train_context_t = train_context<Float, Index, Task>;
     using imp_data_t = impurity_data<Float, Index, Task>;
-    using rng_engine_t = pr::engine;
+    using rng_engine_t = pr::daal_engine<pr::engine_list_cpu::mt2203>;
     using rng_engine_list_t = std::vector<rng_engine_t>;
     using msg = dal::detail::error_messages;
     using comm_t = bk::communicator<spmd::device_memory_access::usm>;
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index 9fac38d25b0..10197bf0c43 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -396,12 +396,12 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
         Index* const node_list_ptr = node_list_host.get_mutable_data();
 
         for (Index node_idx = 0; node_idx < node_count; ++node_idx) {
-            pr::rng<Index> rn_gen;
+            pr::daal_rng<Index> rn_gen;
             Index* gen_row_idx_global_ptr =
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
             rn_gen.uniform(ctx.selected_row_total_count_,
                            gen_row_idx_global_ptr,
-                           rng_engine_list[engine_offset + node_idx].get_state(),
+                           rng_engine_list[engine_offset + node_idx].get_cpu_engine_state(),
                            0,
                            ctx.row_total_count_);
 
@@ -483,15 +483,15 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
 
     auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_);
 
-    pr::rng<Index> rn_gen;
+    pr::daal_rng<Index> rn_gen;
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
     if (ctx.selected_ftr_count_ != ctx.column_count_) {
         for (Index node = 0; node < node_count; ++node) {
-            rn_gen.uniform_without_replacement(
+            rn_gen.uniform_without_replacement_cpu(
                 ctx.selected_ftr_count_,
                 selected_features_host_ptr + node * ctx.selected_ftr_count_,
                 selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_,
-                rng_engine_list[tree_map_ptr[node]].get_state(),
+                rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(),
                 0,
                 ctx.column_count_);
         }
@@ -524,7 +524,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
 
     auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_);
 
-    pr::rng<Float> rn_gen;
+    pr::daal_rng<Float> rn_gen;
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
 
     // Create arrays for random generated bins
@@ -539,7 +539,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
     for (Index node = 0; node < node_count; ++node) {
         rn_gen.uniform(ctx.selected_ftr_count_,
                        random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                       rng_engine_list[tree_map_ptr[node]].get_state(),
+                       rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(),
                        0.0f,
                        1.0f);
     }
@@ -1660,12 +1660,13 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
 
             const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1);
 
-            pr::rng<Index> rn_gen;
+            pr::daal_rng<Index> rn_gen;
 
             for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) {
-                rn_gen.shuffle(oob_row_count,
-                               permutation_ptr,
-                               engine_arr[built_tree_count + tree_idx_in_block].get_state());
+                rn_gen.shuffle(
+                    oob_row_count,
+                    permutation_ptr,
+                    engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state());
                 const Float oob_err_perm = compute_oob_error_perm(ctx,
                                                                   model_manager,
                                                                   data_host,
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
index a692070551e..7ea7ae9266d 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
@@ -40,6 +40,23 @@ class daal_engine {
         }
     }
 
+    explicit daal_engine(const daal::algorithms::engines::EnginePtr& eng) : daal_engine_(eng) {
+        impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(eng.get());
+        if (!impl_) {
+            throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
+        }
+    }
+
+    daal_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) {
+        daal_engine_ = eng;
+        impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(eng.get());
+        if (!impl_) {
+            throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
+        }
+
+        return *this;
+    }
+
     virtual ~daal_engine() = default;
 
     void* get_cpu_engine_state() const {
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
index 81ce6bf852b..1d058be6025 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
@@ -30,10 +30,78 @@ namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-template <typename Size = std::int64_t, engine_list EngineType = engine_list::mt2203>
+template <typename Size = std::int64_t>
 class engine_collection {
 public:
-    engine_collection(sycl::queue& queue, Size count, std::int64_t seed = 777)
+    explicit engine_collection(Size count, std::int64_t seed = 777)
+            : count_(count),
+              engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)),
+              params_(count),
+              technique_(daal::algorithms::engines::internal::family),
+              daal_engine_list_(count) {}
+
+    template <typename Op>
+    std::vector<daal_engine<engine_list_cpu::mt2203>> operator()(Op&& op) {
+        daal::services::Status status;
+        for (Size i = 0; i < count_; ++i) {
+            op(i, params_.nSkip[i]);
+        }
+        select_parallelization_technique(technique_);
+        daal::algorithms::engines::internal::EnginesCollection<daal::sse2> engine_collection(
+            engine_,
+            technique_,
+            params_,
+            daal_engine_list_,
+            &status);
+        if (!status) {
+            dal::backend::interop::status_to_exception(status);
+        }
+
+        std::vector<daal_engine<engine_list_cpu::mt2203>> engine_list(count_);
+        for (Size i = 0; i < count_; ++i) {
+            engine_list[i] = daal_engine_list_[i];
+        }
+
+        //copy elision
+        return engine_list;
+    }
+
+private:
+    void select_parallelization_technique(
+        daal::algorithms::engines::internal::ParallelizationTechnique& technique) {
+        auto daal_engine_impl =
+            dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(engine_.get());
+
+        daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = {
+            daal::algorithms::engines::internal::family,
+            daal::algorithms::engines::internal::leapfrog,
+            daal::algorithms::engines::internal::skipahead
+        };
+
+        for (auto& techn : techniques) {
+            if (daal_engine_impl->hasSupport(techn)) {
+                technique = techn;
+                return;
+            }
+        }
+
+        throw domain_error(
+            dal::detail::error_messages::rng_engine_does_not_support_parallelization_techniques());
+    }
+
+private:
+    Size count_;
+    daal::algorithms::engines::EnginePtr engine_;
+    daal::algorithms::engines::internal::Params<daal::sse2> params_;
+    daal::algorithms::engines::internal::ParallelizationTechnique technique_;
+    daal::services::internal::TArray<daal::algorithms::engines::EnginePtr, daal::sse2>
+        daal_engine_list_;
+};
+
+template <typename Size = std::int64_t, engine_list EngineType = engine_list::mt2203>
+class engine_collection_oneapi {
+public:
+    engine_collection_oneapi(sycl::queue& queue, Size count, std::int64_t seed = 777)
             : count_(count),
               seed_(seed) {
         engines_.reserve(count_);

From acb6e4cb237c7e0b4d6d9034a7267640679bc170 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Wed, 13 Nov 2024 04:40:47 -0800
Subject: [PATCH 03/18] adding mrg32k3a engine

---
 .../algorithms/engines/mrg32k3a/mrg32k3a.h    | 182 ++++++++++++++++++
 .../engines/mrg32k3a/mrg32k3a_types.h         |  64 ++++++
 cpp/daal/include/daal.h                       |   2 +
 cpp/daal/include/daal_win.h                   |   2 +
 .../algorithms/engines/mrg32k3a/mrg32k3a.cpp  |  58 ++++++
 .../mrg32k3a/mrg32k3a_batch_container.h       |  68 +++++++
 .../engines/mrg32k3a/mrg32k3a_batch_impl.h    | 116 +++++++++++
 .../mrg32k3a_dense_default_batch_fpt_cpu.cpp  |  47 +++++
 ...k3a_dense_default_batch_fpt_dispatcher.cpp |  30 +++
 .../engines/mrg32k3a/mrg32k3a_impl.i          |  49 +++++
 .../engines/mrg32k3a/mrg32k3a_kernel.h        |  58 ++++++
 .../algorithms/engines/mt2203/mt2203_kernel.h |   4 +-
 cpp/daal/src/externals/service_rng_mkl.h      |   1 +
 cpp/daal/src/externals/service_rng_openrng.h  |   1 +
 cpp/daal/src/externals/service_rng_ref.h      |   3 +-
 .../dal/backend/primitives/rng/rng_cpu.hpp    |   5 +-
 .../dal/backend/primitives/rng/rng_dpc.cpp    |  10 +
 .../primitives/rng/rng_engine_collection.hpp  |   1 +
 .../dal/backend/primitives/rng/rng_gpu.hpp    |   9 +-
 .../backend/primitives/rng/test/rng_dpc.cpp   |  10 +-
 docs/source/daal/algorithms/engines/index.rst |   1 +
 .../daal/algorithms/engines/mrg32k3a.rst      |  63 ++++++
 makefile.lst                                  |   4 +-
 23 files changed, 780 insertions(+), 8 deletions(-)
 create mode 100644 cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
 create mode 100644 cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
 create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
 create mode 100644 docs/source/daal/algorithms/engines/mrg32k3a.rst

diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
new file mode 100644
index 00000000000..df6c1edf414
--- /dev/null
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
@@ -0,0 +1,182 @@
+/* file: mrg32k3a.h */
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of the mrg32k3a engine in the batch processing mode
+//--
+*/
+
+#ifndef __MRG32K3A_H__
+#define __MRG32K3A_H__
+
+#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
+#include "algorithms/engines/engine.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+/**
+ * @defgroup engines_mrg32k3a_batch Batch
+ * @ingroup engines_mrg32k3a
+ * @{
+ */
+namespace interface1
+{
+/**
+ * <a name="DAAL-CLASS-ALGORITHMS__ENGINES__mrg32k3a__BATCHCONTAINER"></a>
+ * \brief Provides methods to run implementations of the mrg32k3a engine.
+ *        This class is associated with the \ref mrg32k3a::interface1::Batch "mrg32k3a::Batch" class
+ *        and supports the method of mrg32k3a engine computation in the batch processing mode
+ *
+ * \tparam algorithmFPType  Data type to use in intermediate computations of mrg32k3a engine, double or float
+ * \tparam method           Computation method of the engine, mrg32k3a::Method
+ * \tparam cpu              Version of the cpu-specific implementation of the engine, daal::CpuType
+ */
+template <typename algorithmFPType, Method method, CpuType cpu>
+class BatchContainer : public daal::algorithms::AnalysisContainerIface<batch>
+{
+public:
+    /**
+     * Constructs a container for the mrg32k3a engine with a specified environment
+     * in the batch processing mode
+     * \param[in] daalEnv   Environment object
+     */
+    BatchContainer(daal::services::Environment::env * daalEnv);
+    ~BatchContainer();
+    /**
+     * Computes the result of the mrg32k3a engine in the batch processing mode
+     *
+     * \return Status of computations
+     */
+    services::Status compute() DAAL_C11_OVERRIDE;
+};
+
+/**
+ * <a name="DAAL-CLASS-ALGORITHMS__ENGINES__mrg32k3a__BATCH"></a>
+ * \brief Provides methods for mrg32k3a engine computations in the batch processing mode
+ *
+ * \tparam algorithmFPType  Data type to use in intermediate computations of mrg32k3a engine, double or float
+ * \tparam method           Computation method of the engine, mrg32k3a::Method
+ *
+ * \par Enumerations
+ *      - mrg32k3a::Method          Computation methods for the mrg32k3a engine
+ *
+ * \par References
+ *      - \ref engines::interface1::Input  "engines::Input" class
+ *      - \ref engines::interface1::Result "engines::Result" class
+ */
+template <typename algorithmFPType = DAAL_ALGORITHM_FP_TYPE, Method method = defaultDense>
+class DAAL_EXPORT Batch : public engines::BatchBase
+{
+public:
+    typedef engines::BatchBase super;
+
+    typedef typename super::InputType InputType;
+    typedef typename super::ResultType ResultType;
+
+    /**
+     * Creates mrg32k3a engine
+     * \param[in] seed  Initial condition for mrg32k3a engine
+     *
+     * \return Pointer to mrg32k3a engine
+     */
+    static services::SharedPtr<Batch<algorithmFPType, method> > create(size_t seed = 777);
+
+    /**
+     * Returns method of the engine
+     * \return Method of the engine
+     */
+    virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; }
+
+    /**
+     * Returns the structure that contains results of mrg32k3a engine
+     * \return Structure that contains results of mrg32k3a engine
+     */
+    ResultPtr getResult() { return _result; }
+
+    /**
+     * Registers user-allocated memory to store results of mrg32k3a engine
+     * \param[in] result  Structure to store results of mrg32k3a engine
+     *
+     * \return Status of computations
+     */
+    services::Status setResult(const ResultPtr & result)
+    {
+        DAAL_CHECK(result, services::ErrorNullResult)
+        _result = result;
+        _res    = _result.get();
+        return services::Status();
+    }
+
+    /**
+     * Returns a pointer to the newly allocated mrg32k3a engine
+     * with a copy of input objects and parameters of this mrg32k3a engine
+     * \return Pointer to the newly allocated engine
+     */
+    services::SharedPtr<Batch<algorithmFPType, method> > clone() const { return services::SharedPtr<Batch<algorithmFPType, method> >(cloneImpl()); }
+
+    /**
+     * Allocates memory to store the result of the mrg32k3a engine
+     *
+     * \return Status of computations
+     */
+    virtual services::Status allocateResult() DAAL_C11_OVERRIDE
+    {
+        services::Status s = this->_result->template allocate<algorithmFPType>(&(this->input), NULL, (int)method);
+        this->_res         = this->_result.get();
+        return s;
+    }
+
+protected:
+    Batch(size_t seed = 777) { initialize(); }
+
+    Batch(const Batch<algorithmFPType, method> & other) : super(other) { initialize(); }
+
+    virtual Batch<algorithmFPType, method> * cloneImpl() const DAAL_C11_OVERRIDE { return new Batch<algorithmFPType, method>(*this); }
+
+    void initialize()
+    {
+        Analysis<batch>::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env);
+        _in                  = &input;
+        _result.reset(new ResultType());
+    }
+
+private:
+    ResultPtr _result;
+
+    Batch & operator=(const Batch &);
+};
+typedef services::SharedPtr<Batch<> > mrg32k3aPtr;
+typedef services::SharedPtr<const Batch<> > mrg32k3aConstPtr;
+
+} // namespace interface1
+using interface1::BatchContainer;
+using interface1::Batch;
+using interface1::mrg32k3aPtr;
+using interface1::mrg32k3aConstPtr;
+/** @} */
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+#endif
diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
new file mode 100644
index 00000000000..77ca9656418
--- /dev/null
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
@@ -0,0 +1,64 @@
+/* file: mrg32k3a_types.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of mrg32k3a engine.
+//--
+*/
+
+#ifndef __MRG32K3A_TYPES_H__
+#define __MRG32K3A_TYPES_H__
+
+#include "algorithms/algorithm.h"
+#include "services/daal_defines.h"
+#include "data_management/data/numeric_table.h"
+#include "data_management/data/homogen_numeric_table.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+/**
+ * @defgroup engines_mrg32k3a mrg32k3a Engine
+ * \copydoc daal::algorithms::engines::mrg32k3a
+ * @ingroup engines
+ * @{
+ */
+/**
+ * \brief Contains classes for mrg32k3a engine
+ */
+namespace mrg32k3a
+{
+/**
+ * <a name="DAAL-ENUM-ALGORITHMS__ENGINES__mrg32k3a__METHOD"></a>
+ * Available methods to compute mrg32k3a engine
+ */
+enum Method
+{
+    defaultDense = 0 /*!< Default: performance-oriented method. */
+};
+
+} // namespace mrg32k3a
+/** @} */
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h
index 881a6c39fbe..443f237e051 100755
--- a/cpp/daal/include/daal.h
+++ b/cpp/daal/include/daal.h
@@ -301,6 +301,8 @@
 #include "algorithms/engines/mt19937/mt19937_types.h"
 #include "algorithms/engines/mcg59/mcg59.h"
 #include "algorithms/engines/mcg59/mcg59_types.h"
+#include "algorithms/engines/mrg32k3a/mrg32k3a.h"
+#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
 #include "algorithms/engines/engine_family.h"
 #include "algorithms/engines/mt2203/mt2203.h"
 #include "algorithms/engines/mt2203/mt2203_types.h"
diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h
index e17eff16796..3e64c38660f 100755
--- a/cpp/daal/include/daal_win.h
+++ b/cpp/daal/include/daal_win.h
@@ -313,6 +313,8 @@
 #include "algorithms/engines/mt19937/mt19937_types.h"
 #include "algorithms/engines/mcg59/mcg59.h"
 #include "algorithms/engines/mcg59/mcg59_types.h"
+#include "algorithms/engines/mrg32k3a/mrg32k3a.h"
+#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
 #include "algorithms/engines/engine_family.h"
 #include "algorithms/engines/mt2203/mt2203.h"
 #include "algorithms/engines/mt2203/mt2203_types.h"
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
new file mode 100644
index 00000000000..288cb0506ee
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
@@ -0,0 +1,58 @@
+/* file: mrg32k3a.cpp */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Implementation of mrg32k3a engine
+//--
+
+#include "algorithms/engines/mrg32k3a/mrg32k3a.h"
+#include "src/externals/service_dispatch.h"
+#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+namespace interface1
+{
+using namespace daal::services;
+using namespace mrg32k3a::internal;
+
+template <typename algorithmFPType, Method method>
+SharedPtr<Batch<algorithmFPType, method> > Batch<algorithmFPType, method>::create(size_t seed)
+{
+    SharedPtr<Batch<algorithmFPType, method> > engPtr;
+#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) engPtr.reset(new BatchImpl<cpuId, algorithmFPType, method>(__VA_ARGS__));
+
+    DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed);
+
+#undef DAAL_CREATE_ENGINE_CPU
+    return engPtr;
+}
+
+template class Batch<double, defaultDense>;
+template class Batch<float, defaultDense>;
+
+} // namespace interface1
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
new file mode 100644
index 00000000000..1fb8f9ca991
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
@@ -0,0 +1,68 @@
+/* file: mrg32k3a_batch_container.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of mrg32k3a calculation algorithm container.
+//--
+*/
+
+#ifndef __mrg32k3a_BATCH_CONTAINER_H__
+#define __mrg32k3a_BATCH_CONTAINER_H__
+
+#include "algorithms/engines/mrg32k3a/mrg32k3a.h"
+#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+namespace interface1
+{
+template <typename algorithmFPType, Method method, CpuType cpu>
+BatchContainer<algorithmFPType, method, cpu>::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface<batch>(daalEnv)
+{
+    __DAAL_INITIALIZE_KERNELS(internal::mrg32k3aKernel, algorithmFPType, method);
+}
+
+template <typename algorithmFPType, Method method, CpuType cpu>
+BatchContainer<algorithmFPType, method, cpu>::~BatchContainer()
+{
+    __DAAL_DEINITIALIZE_KERNELS();
+}
+
+template <typename algorithmFPType, Method method, CpuType cpu>
+services::Status BatchContainer<algorithmFPType, method, cpu>::compute()
+{
+    daal::services::Environment::env & env = *_env;
+    engines::Result * result               = static_cast<engines::Result *>(_res);
+    NumericTable * resultTable             = result->get(engines::randomNumbers).get();
+
+    __DAAL_CALL_KERNEL(env, internal::mrg32k3aKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable);
+}
+
+} // namespace interface1
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
new file mode 100644
index 00000000000..07dc07b9b3a
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
@@ -0,0 +1,116 @@
+/* file: mrg32k3a_batch_impl.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of the class defining the mrg32k3a engine
+//--
+*/
+
+#include "algorithms/engines/mrg32k3a/mrg32k3a.h"
+#include "src/algorithms/engines/engine_batch_impl.h"
+#include "src/externals/service_rng.h"
+#include "src/data_management/service_numeric_table.h"
+
+static const int leapfrogMethodErrcode  = -1002;
+static const int skipAheadMethodErrcode = -1003;
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+namespace internal
+{
+template <CpuType cpu, typename algorithmFPType = DAAL_ALGORITHM_FP_TYPE, Method method = defaultDense>
+class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch<algorithmFPType, method>, public algorithms::engines::internal::BatchBaseImpl
+{
+public:
+    typedef algorithms::engines::mrg32k3a::interface1::Batch<algorithmFPType, method> super1;
+    typedef algorithms::engines::internal::BatchBaseImpl super2;
+    BatchImpl(size_t seed = 777) : super2(seed), baseRng(seed, __DAAL_BRNG_MRG32K3A) {}
+
+    void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); }
+
+    int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); }
+
+    services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE
+    {
+        DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator);
+        return services::Status();
+    }
+
+    services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE
+    {
+        DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator);
+        return services::Status();
+    }
+
+    services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE
+    {
+        int errcode = baseRng.leapfrog(threadNum, nThreads);
+        services::Status s;
+        if (errcode == leapfrogMethodErrcode)
+            s.add(ErrorLeapfrogUnsupported);
+        else if (errcode)
+            s.add(ErrorIncorrectErrorcodeFromGenerator);
+        return s;
+    }
+
+    services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE
+    {
+        int errcode = baseRng.skipAhead(nSkip);
+        services::Status s;
+        if (errcode == skipAheadMethodErrcode)
+            s.add(ErrorSkipAheadUnsupported);
+        else if (errcode)
+            s.add(ErrorIncorrectErrorcodeFromGenerator);
+        return s;
+    }
+
+    virtual BatchImpl<cpu, algorithmFPType, method> * cloneImpl() const DAAL_C11_OVERRIDE
+    {
+        return new BatchImpl<cpu, algorithmFPType, method>(*this);
+    }
+
+    bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE
+    {
+        switch (technique)
+        {
+        case engines::internal::family: return false;
+        case engines::internal::skipahead: return true;
+        case engines::internal::leapfrog: return true;
+        }
+        return false;
+    }
+
+    ~BatchImpl() {}
+
+protected:
+    BatchImpl(const BatchImpl<cpu, algorithmFPType, method> & other) : super1(other), super2(other), baseRng(other.baseRng) {}
+
+    daal::internal::BaseRNGsInst<cpu> baseRng;
+};
+
+} // namespace internal
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
new file mode 100644
index 00000000000..2af52dd0443
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
@@ -0,0 +1,47 @@
+/* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Implementation of mrg32k3a calculation functions.
+//--
+
+#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h"
+#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h"
+#include "src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+namespace interface1
+{
+template class BatchContainer<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
+} // namespace interface1
+
+namespace internal
+{
+template class mrg32k3aKernel<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
+} // namespace internal
+
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
new file mode 100644
index 00000000000..482486e243f
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
@@ -0,0 +1,30 @@
+/* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Implementation of mrg32k3a calculation algorithm dispatcher.
+//--
+
+#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h"
+
+namespace daal
+{
+namespace algorithms
+{
+__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::mrg32k3a::BatchContainer, batch, DAAL_FPTYPE, engines::mrg32k3a::defaultDense)
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
new file mode 100644
index 00000000000..5e359ecaaa3
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
@@ -0,0 +1,49 @@
+/* file: mrg32k3a_impl.i */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of mrg32k3a algorithm
+//--
+*/
+
+#ifndef __mrg32k3a_IMPL_I__
+#define __mrg32k3a_IMPL_I__
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+namespace internal
+{
+template <typename algorithmFPType, Method method, CpuType cpu>
+Status mrg32k3aKernel<algorithmFPType, method, cpu>::compute(NumericTable * resultTensor)
+{
+    return Status();
+}
+
+} // namespace internal
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
new file mode 100644
index 00000000000..3959576ccbe
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
@@ -0,0 +1,58 @@
+/* file: mrg32k3a_kernel.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Declaration of the template kernel class for the mrg32k3a engine.
+//--
+
+#ifndef __mrg32k3a_KERNEL_H__
+#define __mrg32k3a_KERNEL_H__
+
+#include "algorithms/engines/mrg32k3a/mrg32k3a.h"
+#include "src/algorithms/kernel.h"
+#include "data_management/data/numeric_table.h"
+
+using namespace daal::services;
+using namespace daal::data_management;
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace mrg32k3a
+{
+namespace internal
+{
+/**
+ *  \brief Kernel for mrg32k3a calculation
+ */
+template <typename algorithmFPType, Method method, CpuType cpu>
+class mrg32k3aKernel : public Kernel
+{
+public:
+    Status compute(NumericTable * resultTable);
+};
+
+} // namespace internal
+} // namespace mrg32k3a
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h
index b7de119367f..e588a02c8fb 100644
--- a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h
+++ b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h
@@ -19,8 +19,8 @@
 //  Declaration of template function that calculate mt2203s.
 //--
 
-#ifndef __MCG59_KERNEL_H__
-#define __MCG59_KERNEL_H__
+#ifndef __MT2203_KERNEL_H__
+#define __MT2203_KERNEL_H__
 
 #include "algorithms/engines/mt2203/mt2203.h"
 #include "src/algorithms/kernel.h"
diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h
index b2dcd81b78b..a911b2e5d8d 100644
--- a/cpp/daal/src/externals/service_rng_mkl.h
+++ b/cpp/daal/src/externals/service_rng_mkl.h
@@ -32,6 +32,7 @@
 #define __DAAL_BRNG_MT2203                    VSL_BRNG_MT2203
 #define __DAAL_BRNG_MT19937                   VSL_BRNG_MT19937
 #define __DAAL_BRNG_MCG59                     VSL_BRNG_MCG59
+#define __DAAL_BRNG_MRG32K3A                  VSL_BRNG_MRG32K3A
 #define __DAAL_RNG_METHOD_UNIFORM_STD         VSL_RNG_METHOD_UNIFORM_STD
 #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   0
 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h
index dd70c644606..96c567b7366 100644
--- a/cpp/daal/src/externals/service_rng_openrng.h
+++ b/cpp/daal/src/externals/service_rng_openrng.h
@@ -25,6 +25,7 @@
 #define __DAAL_BRNG_MT2203                    VSL_BRNG_MT2203
 #define __DAAL_BRNG_MT19937                   VSL_BRNG_MT19937
 #define __DAAL_BRNG_MCG59                     VSL_BRNG_MCG59
+#define __DAAL_BRNG_MRG32K3A                  VSL_BRNG_MRG32K3A
 #define __DAAL_RNG_METHOD_UNIFORM_STD         VSL_RNG_METHOD_UNIFORM_STD
 #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   0
 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h
index fc56fcf6205..7eafa70fb43 100644
--- a/cpp/daal/src/externals/service_rng_ref.h
+++ b/cpp/daal/src/externals/service_rng_ref.h
@@ -39,7 +39,8 @@
     #define __DAAL_BRNG_MT2203  (1 << 20) * 9 //VSL_BRNG_MT2203
     #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937
     #define __DAAL_BRNG_MCG59   (1 << 20) * 4 //VSL_BRNG_MCG59
-
+    //tmp
+    #define __DAAL_BRNG_MRG32K3A  (1 << 20) * 3 //VSL_BRNG_MRG32K3A
     #define __DAAL_RNG_METHOD_UNIFORM_STD         0 //VSL_RNG_METHOD_UNIFORM_STD
     #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   4
     #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      0 //VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
index 7ea7ae9266d..b9488da808b 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
@@ -19,6 +19,7 @@
 #include <daal/include/algorithms/engines/mt2203/mt2203.h>
 #include <daal/include/algorithms/engines/mcg59/mcg59.h>
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
+#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include <stdexcept>
 #include <type_traits>
@@ -26,7 +27,7 @@
 #include "oneapi/dal/backend/primitives/rng/rng.hpp"
 namespace oneapi::dal::backend::primitives {
 
-enum class engine_list_cpu { mt2203, mcg59, mt19937 };
+enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a };
 
 template <engine_list_cpu EngineType = engine_list_cpu::mt2203>
 class daal_engine {
@@ -74,6 +75,8 @@ class daal_engine {
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
             case engine_list_cpu::mcg59:
                 return daal::algorithms::engines::mcg59::Batch<>::create(seed);
+            case engine_list_cpu::mrg32k3a:
+                return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
             case engine_list_cpu::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 79b5418d9d8..82ff48edab9 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -95,12 +95,15 @@ void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
 #define INSTANTIATE_FLOAT_(Size)                     \
     INSTANTIATE_(float, Size, engine_list::mt2203)   \
     INSTANTIATE_(float, Size, engine_list::mcg59)    \
+    INSTANTIATE_(float, Size, engine_list::mrg32k3a)    \
     INSTANTIATE_(float, Size, engine_list::mt19937)  \
     INSTANTIATE_(double, Size, engine_list::mt2203)  \
     INSTANTIATE_(double, Size, engine_list::mcg59)   \
+    INSTANTIATE_(double, Size, engine_list::mrg32k3a)   \
     INSTANTIATE_(double, Size, engine_list::mt19937) \
     INSTANTIATE_(int, Size, engine_list::mt2203)     \
     INSTANTIATE_(int, Size, engine_list::mcg59)      \
+    INSTANTIATE_(int, Size, engine_list::mrg32k3a)      \
     INSTANTIATE_(int, Size, engine_list::mt19937)
 
 INSTANTIATE_FLOAT_(std::int64_t);
@@ -117,12 +120,15 @@ INSTANTIATE_FLOAT_(std::int32_t);
 #define INSTANTIATE_FLOAT_CPU(Size)                     \
     INSTANTIATE_CPU(float, Size, engine_list::mt2203)   \
     INSTANTIATE_CPU(float, Size, engine_list::mcg59)    \
+    INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a)    \
     INSTANTIATE_CPU(float, Size, engine_list::mt19937)  \
     INSTANTIATE_CPU(double, Size, engine_list::mt2203)  \
     INSTANTIATE_CPU(double, Size, engine_list::mcg59)   \
+    INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a)   \
     INSTANTIATE_CPU(double, Size, engine_list::mt19937) \
     INSTANTIATE_CPU(int, Size, engine_list::mt2203)     \
     INSTANTIATE_CPU(int, Size, engine_list::mcg59)      \
+    INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a)      \
     INSTANTIATE_CPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_FLOAT_CPU(std::int64_t);
@@ -142,12 +148,15 @@ INSTANTIATE_FLOAT_CPU(std::int32_t);
 #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                     \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)   \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)    \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)    \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)  \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)  \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)   \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)    \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)     \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)      \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)    \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
@@ -164,6 +173,7 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
 #define INSTANTIATE_SHUFFLE_FLOAT(Size)                 \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)  \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a)  \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937)
 
 INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
index 1d058be6025..dd7bffd68d7 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
@@ -22,6 +22,7 @@
 
 #include <daal/include/algorithms/engines/mt2203/mt2203.h>
 #include <daal/include/algorithms/engines/mcg59/mcg59.h>
+#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include "oneapi/dal/table/common.hpp"
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
index 6463534caad..9cf27a2f4ee 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
@@ -18,6 +18,7 @@
 
 #include <daal/include/algorithms/engines/mt2203/mt2203.h>
 #include <daal/include/algorithms/engines/mcg59/mcg59.h>
+#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include <oneapi/mkl.hpp>
@@ -26,7 +27,7 @@ namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-enum class engine_list { mt2203, mcg59, mt19937 };
+enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a};
 
 template <engine_list EngineType>
 struct oneapi_engine_type;
@@ -46,6 +47,11 @@ struct oneapi_engine_type<engine_list::mt19937> {
     using type = oneapi::mkl::rng::mt19937;
 };
 
+template <>
+struct oneapi_engine_type<engine_list::mrg32k3a> {
+    using type = oneapi::mkl::rng::mrg32k3a;
+};
+
 template <engine_list EngineType = engine_list::mt2203>
 class oneapi_engine {
 public:
@@ -94,6 +100,7 @@ class oneapi_engine {
             case engine_list::mt2203:
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
             case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
+            case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
             case engine_list::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 8a69f109162..6219b8c32fe 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -26,6 +26,7 @@ namespace te = dal::test::engine;
 
 class mt2203 {};
 class mcg59 {};
+class mrg32k3a {};
 class mt19937 {};
 
 template <typename engine_type>
@@ -41,6 +42,11 @@ struct engine_map<mcg59> {
     constexpr static auto value = engine_list::mcg59;
 };
 
+template <>
+struct engine_map<mrg32k3a> {
+    constexpr static auto value = engine_list::mrg32k3a;
+};
+
 template <>
 struct engine_map<mt19937> {
     constexpr static auto value = engine_list::mt19937;
@@ -133,7 +139,7 @@ class rng_test : public te::policy_fixture {
     }
 };
 
-using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59));
+using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a));
 
 TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     SKIP_IF(this->get_policy().is_cpu());
@@ -154,7 +160,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     this->check_results(arr_gpu, arr_host);
 }
 
-using rng_types_skip = COMBINE_TYPES((float), (mcg59));
+using rng_types_skip = COMBINE_TYPES((float), (mcg59, mrg32k3a));
 
 // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) {
 //     SKIP_IF(this->get_policy().is_cpu());
diff --git a/docs/source/daal/algorithms/engines/index.rst b/docs/source/daal/algorithms/engines/index.rst
index e73aef6d991..1c476178dc9 100644
--- a/docs/source/daal/algorithms/engines/index.rst
+++ b/docs/source/daal/algorithms/engines/index.rst
@@ -113,4 +113,5 @@ These methods are represented with member functions of classes that represent fu
 
     mt19937.rst
     mcg59.rst
+    mrg32k3a.rst
     mt2203.rst
diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst
new file mode 100644
index 00000000000..ce8ca0ec0cc
--- /dev/null
+++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst
@@ -0,0 +1,63 @@
+.. ******************************************************************************
+.. * Copyright 2020 Intel Corporation
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+mrg32k3a
+========
+
+The engine is based on the 32-bit combined multiple recursive generator with two components of order 3.
+
+.. rubric:: Subsequence selection methods support
+
+skipAhead (nskip)
+    Supported
+leapfrog (threadIdx, nThreads)
+    Supported
+
+Batch Processing
+****************
+
+mrg32k3a engine needs the initial condition (``seed``) for state initialization.
+The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors.
+
+.. rubric:: Algorithm Parameters
+
+mrg32k3a engine has the following parameters:
+
+.. tabularcolumns::  |\Y{0.2}|\Y{0.2}|\Y{0.6}|
+
+.. list-table:: Algorithm Parameters for mrg32k3a engine (Batch Processing)
+   :header-rows: 1
+   :widths: 10 20 30
+   :align: left
+   :class: longtable
+
+   * - Parameter
+     - Default Value
+     - Description
+   * - ``algorithmFPType``
+     - ``float``
+     - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``.
+   * - ``method``
+     - ``defaultDense``
+     - Performance-oriented computation method; the only method supported by the algorithm.
+   * - ``seed``
+     -
+       - :math:`777` for a scalar seed
+       - NA for a vector seed
+     - Initial condition for state initialization, scalar or vector:
+
+       - Scalar, value of ``size_t`` type
+       - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p`
diff --git a/makefile.lst b/makefile.lst
index 92dc52ff521..db26829caef 100755
--- a/makefile.lst
+++ b/makefile.lst
@@ -65,7 +65,7 @@ multiclassclassifier += classifier
 k_nearest_neighbors += engines classifier
 logistic_regression += classifier optimization_solver objective_function engines
 implicit_als += engines distributions
-engines += engines/mt19937 engines/mcg59 engines/mt2203
+engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/mt2203
 distributions += distributions/bernoulli distributions/normal distributions/uniform
 tsne +=
 
@@ -95,6 +95,7 @@ CORE.ALGORITHMS.FULL :=                                                       \
     elastic_net                                                               \
     engines                                                                   \
     engines/mcg59                                                             \
+    engines/mrg32k3a                                                          \
     engines/mt19937                                                           \
     engines/mt2203                                                            \
     em                                                                        \
@@ -309,6 +310,7 @@ JJ.ALGORITHMS       := adaboost
                        elastic_net/prediction                                    \
                        engines                                                   \
                        engines/mcg59                                             \
+                       engines/mrg32k3a                                          \
                        engines/mt19937                                           \
                        engines/mt2203                                            \
                        em_gmm                                                    \

From 58d98b093e61cf4dcff4a4f205603234a485870d Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Wed, 13 Nov 2024 07:22:26 -0800
Subject: [PATCH 04/18] fix for mrg32k

---
 .../engines/mrg32k3a/mrg32k3a_batch_impl.h    |  2 +-
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 14 ++++-----
 .../backend/primitives/rng/test/rng_dpc.cpp   | 30 ++++---------------
 3 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
index 07dc07b9b3a..bbe3cf2dcf9 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
@@ -45,7 +45,7 @@ class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch<algori
 public:
     typedef algorithms::engines::mrg32k3a::interface1::Batch<algorithmFPType, method> super1;
     typedef algorithms::engines::internal::BatchBaseImpl super2;
-    BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_mrg32k3a), super2(seed) {}
+    BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_MRG32K3A), super2(seed) {}
 
     void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); }
 
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 82ff48edab9..029fec3896c 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -148,15 +148,15 @@ INSTANTIATE_FLOAT_CPU(std::int32_t);
 #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                     \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)   \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)    \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)    \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)  \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)  \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)   \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)    \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)     \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)      \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)    \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
@@ -170,10 +170,10 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
         oneapi_engine<EngineType>& engine_,                       \
         const event_vector& deps);
 
-#define INSTANTIATE_SHUFFLE_FLOAT(Size)                 \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)  \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a)  \
+#define INSTANTIATE_SHUFFLE_FLOAT(Size)                   \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203)   \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)    \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937)
 
 INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 6219b8c32fe..dd16eb3d3dc 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -110,7 +110,7 @@ class rng_test : public te::policy_fixture {
         const Index* val_host_2_ptr = val_host_2.get_data();
 
         for (std::int64_t el = 0; el < val_host_1.get_count(); el++) {
-            REQUIRE(val_host_1_ptr[el] == val_host_2_ptr[el]);
+            REQUIRE(abs(val_host_1_ptr[el] - val_host_2_ptr[el]) < 1);
         }
     }
 
@@ -123,7 +123,7 @@ class rng_test : public te::policy_fixture {
         const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data();
 
         for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) {
-            REQUIRE(val_gpu_host_2_ptr[el] == val_gpu_host_1_ptr[el]);
+            REQUIRE(abs(val_gpu_host_2_ptr[el] - val_gpu_host_1_ptr[el]) < 1);
         }
     }
 
@@ -134,7 +134,7 @@ class rng_test : public te::policy_fixture {
         const Index* val_gpu_host_ptr = val_gpu_host.get_data();
 
         for (std::int64_t el = 0; el < val_host.get_count(); el++) {
-            REQUIRE(val_gpu_host_ptr[el] == val_host_ptr[el]);
+            REQUIRE(abs(val_gpu_host_ptr[el] - val_host_ptr[el]) < 1);
         }
     }
 };
@@ -160,36 +160,21 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     this->check_results(arr_gpu, arr_host);
 }
 
-using rng_types_skip = COMBINE_TYPES((float), (mcg59, mrg32k3a));
+using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a));
 
 // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) {
 //     SKIP_IF(this->get_policy().is_cpu());
 //     std::int64_t elem_count =
-//         GENERATE_COPY(10, 1000, 300000, 15000, 1000000, 100000000, 6100000000, 1LL * 64 * 1000000);
+//         GENERATE_COPY(6100000000, 1LL * 64 * 1000000);
 //     std::int64_t seed = GENERATE_COPY(777);
 
-//     auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
-//     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
-//     auto arr_host_ptr = arr_host.get_mutable_data();
-
-//     auto rn_gen = this->get_rng();
-//     auto rng_engine = this->get_engine(seed);
-//     auto rng_engine_ = this->get_engine(seed);
-
-//     BENCHMARK("Uniform dispatcher HOST arr" + std::to_string(elem_count)) {
-//         rn_gen.uniform(this->get_queue(), elem_count, arr_host_ptr, rng_engine, 0, elem_count);
-//     };
-//     BENCHMARK("Uniform dispatcher GPU arr" + std::to_string(elem_count)) {
-//         rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
-//     };
 
 //     auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count);
 //     auto arr_gpu_ptr_ = arr_gpu_.get_mutable_data();
-//     auto arr_host_ptr_ = arr_host_.get_mutable_data();
 
 //     auto rn_gen_ = this->get_rng();
 //     auto rng_engine_1 = this->get_engine(seed);
-//     auto rng_engine_2 = this->get_engine(seed);
+
 //     BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
 //         rn_gen_.uniform_gpu(this->get_queue(),
 //                                      elem_count,
@@ -199,9 +184,6 @@ using rng_types_skip = COMBINE_TYPES((float), (mcg59, mrg32k3a));
 //                                      elem_count);
 //     };
 
-//     BENCHMARK("Uniform HOST arr" + std::to_string(elem_count)) {
-//         rn_gen_.uniform(elem_count, arr_host_ptr_, rng_engine_2, 0, elem_count);
-//     };
 // }
 
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) {

From 67ed2f6cb77434291e7537cfcccf128405850578 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Thu, 14 Nov 2024 06:10:52 -0800
Subject: [PATCH 05/18] add philox

---
 .../engines/philox4x32x10/philox4x32x10.h     | 182 ++++++++++++++++++
 .../philox4x32x10/philox4x32x10_types.h       |  64 ++++++
 cpp/daal/include/daal.h                       |   2 +
 cpp/daal/include/daal_win.h                   |   2 +
 .../engines/philox4x32x10/philox4x32x10.cpp   |  58 ++++++
 .../philox4x32x10_batch_container.h           |  68 +++++++
 .../philox4x32x10/philox4x32x10_batch_impl.h  | 116 +++++++++++
 ...lox4x32x10_dense_default_batch_fpt_cpu.cpp |  47 +++++
 ...x10_dense_default_batch_fpt_dispatcher.cpp |  30 +++
 .../philox4x32x10/philox4x32x10_impl.i        |  49 +++++
 .../philox4x32x10/philox4x32x10_kernel.h      |  58 ++++++
 cpp/daal/src/externals/service_rng_mkl.h      |   1 +
 cpp/daal/src/externals/service_rng_openrng.h  |   1 +
 cpp/daal/src/externals/service_rng_ref.h      |   1 +
 .../dal/backend/primitives/rng/rng_cpu.hpp    |   5 +-
 .../dal/backend/primitives/rng/rng_dpc.cpp    |  12 +-
 .../primitives/rng/rng_engine_collection.hpp  |   1 +
 .../dal/backend/primitives/rng/rng_gpu.hpp    |   9 +-
 .../backend/primitives/rng/test/rng_dpc.cpp   |  10 +-
 docs/source/daal/algorithms/engines/index.rst |   1 +
 makefile.lst                                  |   4 +-
 21 files changed, 715 insertions(+), 6 deletions(-)
 create mode 100644 cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
 create mode 100644 cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
 create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h

diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
new file mode 100644
index 00000000000..09eae5a7cd8
--- /dev/null
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
@@ -0,0 +1,182 @@
+/* file: philox4x32x10.h */
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of the philox4x32x10 engine in the batch processing mode
+//--
+*/
+
+#ifndef __PHILOX4X32X10_H__
+#define __PHILOX4X32X10_H__
+
+#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h"
+#include "algorithms/engines/engine.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+/**
+ * @defgroup engines_philox4x32x10_batch Batch
+ * @ingroup engines_philox4x32x10
+ * @{
+ */
+namespace interface1
+{
+/**
+ * <a name="DAAL-CLASS-ALGORITHMS__ENGINES__philox4x32x10__BATCHCONTAINER"></a>
+ * \brief Provides methods to run implementations of the philox4x32x10 engine.
+ *        This class is associated with the \ref philox4x32x10::interface1::Batch "philox4x32x10::Batch" class
+ *        and supports the method of philox4x32x10 engine computation in the batch processing mode
+ *
+ * \tparam algorithmFPType  Data type to use in intermediate computations of philox4x32x10 engine, double or float
+ * \tparam method           Computation method of the engine, philox4x32x10::Method
+ * \tparam cpu              Version of the cpu-specific implementation of the engine, daal::CpuType
+ */
+template <typename algorithmFPType, Method method, CpuType cpu>
+class BatchContainer : public daal::algorithms::AnalysisContainerIface<batch>
+{
+public:
+    /**
+     * Constructs a container for the philox4x32x10 engine with a specified environment
+     * in the batch processing mode
+     * \param[in] daalEnv   Environment object
+     */
+    BatchContainer(daal::services::Environment::env * daalEnv);
+    ~BatchContainer();
+    /**
+     * Computes the result of the philox4x32x10 engine in the batch processing mode
+     *
+     * \return Status of computations
+     */
+    services::Status compute() DAAL_C11_OVERRIDE;
+};
+
+/**
+ * <a name="DAAL-CLASS-ALGORITHMS__ENGINES__philox4x32x10__BATCH"></a>
+ * \brief Provides methods for philox4x32x10 engine computations in the batch processing mode
+ *
+ * \tparam algorithmFPType  Data type to use in intermediate computations of philox4x32x10 engine, double or float
+ * \tparam method           Computation method of the engine, philox4x32x10::Method
+ *
+ * \par Enumerations
+ *      - philox4x32x10::Method          Computation methods for the philox4x32x10 engine
+ *
+ * \par References
+ *      - \ref engines::interface1::Input  "engines::Input" class
+ *      - \ref engines::interface1::Result "engines::Result" class
+ */
+template <typename algorithmFPType = DAAL_ALGORITHM_FP_TYPE, Method method = defaultDense>
+class DAAL_EXPORT Batch : public engines::BatchBase
+{
+public:
+    typedef engines::BatchBase super;
+
+    typedef typename super::InputType InputType;
+    typedef typename super::ResultType ResultType;
+
+    /**
+     * Creates philox4x32x10 engine
+     * \param[in] seed  Initial condition for philox4x32x10 engine
+     *
+     * \return Pointer to philox4x32x10 engine
+     */
+    static services::SharedPtr<Batch<algorithmFPType, method> > create(size_t seed = 777);
+
+    /**
+     * Returns method of the engine
+     * \return Method of the engine
+     */
+    virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; }
+
+    /**
+     * Returns the structure that contains results of philox4x32x10 engine
+     * \return Structure that contains results of philox4x32x10 engine
+     */
+    ResultPtr getResult() { return _result; }
+
+    /**
+     * Registers user-allocated memory to store results of philox4x32x10 engine
+     * \param[in] result  Structure to store results of philox4x32x10 engine
+     *
+     * \return Status of computations
+     */
+    services::Status setResult(const ResultPtr & result)
+    {
+        DAAL_CHECK(result, services::ErrorNullResult)
+        _result = result;
+        _res    = _result.get();
+        return services::Status();
+    }
+
+    /**
+     * Returns a pointer to the newly allocated philox4x32x10 engine
+     * with a copy of input objects and parameters of this philox4x32x10 engine
+     * \return Pointer to the newly allocated engine
+     */
+    services::SharedPtr<Batch<algorithmFPType, method> > clone() const { return services::SharedPtr<Batch<algorithmFPType, method> >(cloneImpl()); }
+
+    /**
+     * Allocates memory to store the result of the philox4x32x10 engine
+     *
+     * \return Status of computations
+     */
+    virtual services::Status allocateResult() DAAL_C11_OVERRIDE
+    {
+        services::Status s = this->_result->template allocate<algorithmFPType>(&(this->input), NULL, (int)method);
+        this->_res         = this->_result.get();
+        return s;
+    }
+
+protected:
+    Batch(size_t seed = 777) { initialize(); }
+
+    Batch(const Batch<algorithmFPType, method> & other) : super(other) { initialize(); }
+
+    virtual Batch<algorithmFPType, method> * cloneImpl() const DAAL_C11_OVERRIDE { return new Batch<algorithmFPType, method>(*this); }
+
+    void initialize()
+    {
+        Analysis<batch>::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env);
+        _in                  = &input;
+        _result.reset(new ResultType());
+    }
+
+private:
+    ResultPtr _result;
+
+    Batch & operator=(const Batch &);
+};
+typedef services::SharedPtr<Batch<> > philox4x32x10Ptr;
+typedef services::SharedPtr<const Batch<> > philox4x32x10ConstPtr;
+
+} // namespace interface1
+using interface1::BatchContainer;
+using interface1::Batch;
+using interface1::philox4x32x10Ptr;
+using interface1::philox4x32x10ConstPtr;
+/** @} */
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+#endif
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
new file mode 100644
index 00000000000..d3da7ff32a9
--- /dev/null
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
@@ -0,0 +1,64 @@
+/* file: philox4x32x10_types.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of philox4x32x10 engine.
+//--
+*/
+
+#ifndef __PHILOX4X32X10_TYPES_H__
+#define __PHILOX4X32X10_TYPES_H__
+
+#include "algorithms/algorithm.h"
+#include "services/daal_defines.h"
+#include "data_management/data/numeric_table.h"
+#include "data_management/data/homogen_numeric_table.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+/**
+ * @defgroup engines_philox4x32x10 philox4x32x10 Engine
+ * \copydoc daal::algorithms::engines::philox4x32x10
+ * @ingroup engines
+ * @{
+ */
+/**
+ * \brief Contains classes for philox4x32x10 engine
+ */
+namespace philox4x32x10
+{
+/**
+ * <a name="DAAL-ENUM-ALGORITHMS__ENGINES__philox4x32x10__METHOD"></a>
+ * Available methods to compute philox4x32x10 engine
+ */
+enum Method
+{
+    defaultDense = 0 /*!< Default: performance-oriented method. */
+};
+
+} // namespace philox4x32x10
+/** @} */
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h
index 443f237e051..375d9c0b3b8 100755
--- a/cpp/daal/include/daal.h
+++ b/cpp/daal/include/daal.h
@@ -303,6 +303,8 @@
 #include "algorithms/engines/mcg59/mcg59_types.h"
 #include "algorithms/engines/mrg32k3a/mrg32k3a.h"
 #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
+#include "algorithms/engines/philox4x32x10/philox4x32x10.h"
+#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h"
 #include "algorithms/engines/engine_family.h"
 #include "algorithms/engines/mt2203/mt2203.h"
 #include "algorithms/engines/mt2203/mt2203_types.h"
diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h
index 3e64c38660f..87b1155a0a4 100755
--- a/cpp/daal/include/daal_win.h
+++ b/cpp/daal/include/daal_win.h
@@ -315,6 +315,8 @@
 #include "algorithms/engines/mcg59/mcg59_types.h"
 #include "algorithms/engines/mrg32k3a/mrg32k3a.h"
 #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
+#include "algorithms/engines/philox4x32x10/philox4x32x10.h"
+#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h"
 #include "algorithms/engines/engine_family.h"
 #include "algorithms/engines/mt2203/mt2203.h"
 #include "algorithms/engines/mt2203/mt2203_types.h"
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
new file mode 100644
index 00000000000..78b1014663a
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
@@ -0,0 +1,58 @@
+/* file: philox4x32x10.cpp */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Implementation of philox4x32x10 engine
+//--
+
+#include "algorithms/engines/philox4x32x10/philox4x32x10.h"
+#include "src/externals/service_dispatch.h"
+#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+namespace interface1
+{
+using namespace daal::services;
+using namespace philox4x32x10::internal;
+
+template <typename algorithmFPType, Method method>
+SharedPtr<Batch<algorithmFPType, method> > Batch<algorithmFPType, method>::create(size_t seed)
+{
+    SharedPtr<Batch<algorithmFPType, method> > engPtr;
+#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) engPtr.reset(new BatchImpl<cpuId, algorithmFPType, method>(__VA_ARGS__));
+
+    DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed);
+
+#undef DAAL_CREATE_ENGINE_CPU
+    return engPtr;
+}
+
+template class Batch<double, defaultDense>;
+template class Batch<float, defaultDense>;
+
+} // namespace interface1
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
new file mode 100644
index 00000000000..fcffa11e0d7
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
@@ -0,0 +1,68 @@
+/* file: philox4x32x10_batch_container.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of philox4x32x10 calculation algorithm container.
+//--
+*/
+
+#ifndef __philox4x32x10_BATCH_CONTAINER_H__
+#define __philox4x32x10_BATCH_CONTAINER_H__
+
+#include "algorithms/engines/philox4x32x10/philox4x32x10.h"
+#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+namespace interface1
+{
+template <typename algorithmFPType, Method method, CpuType cpu>
+BatchContainer<algorithmFPType, method, cpu>::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface<batch>(daalEnv)
+{
+    __DAAL_INITIALIZE_KERNELS(internal::philox4x32x10Kernel, algorithmFPType, method);
+}
+
+template <typename algorithmFPType, Method method, CpuType cpu>
+BatchContainer<algorithmFPType, method, cpu>::~BatchContainer()
+{
+    __DAAL_DEINITIALIZE_KERNELS();
+}
+
+template <typename algorithmFPType, Method method, CpuType cpu>
+services::Status BatchContainer<algorithmFPType, method, cpu>::compute()
+{
+    daal::services::Environment::env & env = *_env;
+    engines::Result * result               = static_cast<engines::Result *>(_res);
+    NumericTable * resultTable             = result->get(engines::randomNumbers).get();
+
+    __DAAL_CALL_KERNEL(env, internal::philox4x32x10Kernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable);
+}
+
+} // namespace interface1
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
new file mode 100644
index 00000000000..8495fb3b883
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
@@ -0,0 +1,116 @@
+/* file: philox4x32x10_batch_impl.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of the class defining the philox4x32x10 engine
+//--
+*/
+
+#include "algorithms/engines/philox4x32x10/philox4x32x10.h"
+#include "src/algorithms/engines/engine_batch_impl.h"
+#include "src/externals/service_rng.h"
+#include "src/data_management/service_numeric_table.h"
+
+static const int leapfrogMethodErrcode  = -1002;
+static const int skipAheadMethodErrcode = -1003;
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+namespace internal
+{
+template <CpuType cpu, typename algorithmFPType = DAAL_ALGORITHM_FP_TYPE, Method method = defaultDense>
+class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch<algorithmFPType, method>, public algorithms::engines::internal::BatchBaseImpl
+{
+public:
+    typedef algorithms::engines::philox4x32x10::interface1::Batch<algorithmFPType, method> super1;
+    typedef algorithms::engines::internal::BatchBaseImpl super2;
+    BatchImpl(size_t seed = 777) : super2(seed), baseRng(seed, __DAAL_BRNG_PHILOX4X32X10) {} // listed in actual init order: base, then member
+
+    void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); }
+
+    int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); }
+
+    services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE
+    {
+        DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator);
+        return services::Status();
+    }
+
+    services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE
+    {
+        DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator);
+        return services::Status();
+    }
+
+    services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE
+    {
+        int errcode = baseRng.leapfrog(threadNum, nThreads);
+        services::Status s;
+        if (errcode == leapfrogMethodErrcode)
+            s.add(ErrorLeapfrogUnsupported);
+        else if (errcode)
+            s.add(ErrorIncorrectErrorcodeFromGenerator);
+        return s;
+    }
+
+    services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE
+    {
+        int errcode = baseRng.skipAhead(nSkip);
+        services::Status s;
+        if (errcode == skipAheadMethodErrcode)
+            s.add(ErrorSkipAheadUnsupported);
+        else if (errcode)
+            s.add(ErrorIncorrectErrorcodeFromGenerator);
+        return s;
+    }
+
+    virtual BatchImpl<cpu, algorithmFPType, method> * cloneImpl() const DAAL_C11_OVERRIDE
+    {
+        return new BatchImpl<cpu, algorithmFPType, method>(*this);
+    }
+
+    bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE
+    {
+        switch (technique)
+        {
+        case engines::internal::family: return false;
+        case engines::internal::skipahead: return true;
+        case engines::internal::leapfrog: return true;
+        }
+        return false;
+    }
+
+    ~BatchImpl() {}
+
+protected:
+    BatchImpl(const BatchImpl<cpu, algorithmFPType, method> & other) : super1(other), super2(other), baseRng(other.baseRng) {}
+
+    daal::internal::BaseRNGsInst<cpu> baseRng;
+};
+
+} // namespace internal
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
new file mode 100644
index 00000000000..712bd3f7300
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
@@ -0,0 +1,47 @@
+/* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Explicit instantiations of the philox4x32x10 batch container and kernel.
+//--
+
+#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h"
+#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h"
+#include "src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i"
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+namespace interface1
+{
+template class BatchContainer<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
+} // namespace interface1
+
+namespace internal
+{
+template class philox4x32x10Kernel<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
+} // namespace internal
+
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
new file mode 100644
index 00000000000..225d9f02da1
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
@@ -0,0 +1,30 @@
+/* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Instantiation of the philox4x32x10 batch container dispatcher.
+//--
+
+#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h"
+
+namespace daal
+{
+namespace algorithms
+{
+__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::philox4x32x10::BatchContainer, batch, DAAL_FPTYPE, engines::philox4x32x10::defaultDense)
+} // namespace algorithms
+} // namespace daal
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
new file mode 100644
index 00000000000..6c113d179c8
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
@@ -0,0 +1,49 @@
+/* file: philox4x32x10_impl.i */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  Implementation of the philox4x32x10 kernel compute() method
+//--
+*/
+
+#ifndef __philox4x32x10_IMPL_I__
+#define __philox4x32x10_IMPL_I__
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+namespace internal
+{
+template <typename algorithmFPType, Method method, CpuType cpu>
+Status philox4x32x10Kernel<algorithmFPType, method, cpu>::compute(NumericTable * resultTable)
+{
+    return Status(); // resultTable is not touched here; a default-constructed Status is returned
+}
+
+} // namespace internal
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
new file mode 100644
index 00000000000..28b689a9ab8
--- /dev/null
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
@@ -0,0 +1,58 @@
+/* file: philox4x32x10_kernel.h */
+/*******************************************************************************
+* Copyright 2014 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+//++
+//  Declaration of the template kernel class for the philox4x32x10 engine.
+//--
+
+#ifndef __philox4x32x10_KERNEL_H__
+#define __philox4x32x10_KERNEL_H__
+
+#include "algorithms/engines/philox4x32x10/philox4x32x10.h"
+#include "src/algorithms/kernel.h"
+#include "data_management/data/numeric_table.h"
+
+using namespace daal::services;
+using namespace daal::data_management;
+
+namespace daal
+{
+namespace algorithms
+{
+namespace engines
+{
+namespace philox4x32x10
+{
+namespace internal
+{
+/**
+ *  \brief Kernel for philox4x32x10 calculation
+ */
+template <typename algorithmFPType, Method method, CpuType cpu>
+class philox4x32x10Kernel : public Kernel
+{
+public:
+    Status compute(NumericTable * resultTable);
+};
+
+} // namespace internal
+} // namespace philox4x32x10
+} // namespace engines
+} // namespace algorithms
+} // namespace daal
+
+#endif
diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h
index a911b2e5d8d..83edae913e2 100644
--- a/cpp/daal/src/externals/service_rng_mkl.h
+++ b/cpp/daal/src/externals/service_rng_mkl.h
@@ -33,6 +33,7 @@
 #define __DAAL_BRNG_MT19937                   VSL_BRNG_MT19937
 #define __DAAL_BRNG_MCG59                     VSL_BRNG_MCG59
 #define __DAAL_BRNG_MRG32K3A                  VSL_BRNG_MRG32K3A
+#define __DAAL_BRNG_PHILOX4X32X10             VSL_BRNG_PHILOX4X32X10
 #define __DAAL_RNG_METHOD_UNIFORM_STD         VSL_RNG_METHOD_UNIFORM_STD
 #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   0
 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h
index 96c567b7366..3d1b9833a52 100644
--- a/cpp/daal/src/externals/service_rng_openrng.h
+++ b/cpp/daal/src/externals/service_rng_openrng.h
@@ -26,6 +26,7 @@
 #define __DAAL_BRNG_MT19937                   VSL_BRNG_MT19937
 #define __DAAL_BRNG_MCG59                     VSL_BRNG_MCG59
 #define __DAAL_BRNG_MRG32K3A                  VSL_BRNG_MRG32K3A
+#define __DAAL_BRNG_PHILOX4X32X10             VSL_BRNG_PHILOX4X32X10
 #define __DAAL_RNG_METHOD_UNIFORM_STD         VSL_RNG_METHOD_UNIFORM_STD
 #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   0
 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h
index 7eafa70fb43..3890d0bdf2b 100644
--- a/cpp/daal/src/externals/service_rng_ref.h
+++ b/cpp/daal/src/externals/service_rng_ref.h
@@ -41,6 +41,7 @@
     #define __DAAL_BRNG_MCG59   (1 << 20) * 4 //VSL_BRNG_MCG59
     //tmp
     #define __DAAL_BRNG_MRG32K3A  (1 << 20) * 4 //VSL_BRNG_MRG32K3A
+    #define __DAAL_BRNG_PHILOX4X32X10  (1 << 20) * 4 //VSL_BRNG_MRG32K3A
     #define __DAAL_RNG_METHOD_UNIFORM_STD         0 //VSL_RNG_METHOD_UNIFORM_STD
     #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   4
     #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      0 //VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
index b9488da808b..6b517b73c17 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
@@ -20,6 +20,7 @@
 #include <daal/include/algorithms/engines/mcg59/mcg59.h>
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
+#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include <stdexcept>
 #include <type_traits>
@@ -27,7 +28,7 @@
 #include "oneapi/dal/backend/primitives/rng/rng.hpp"
 namespace oneapi::dal::backend::primitives {
 
-enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a };
+enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 };
 
 template <engine_list_cpu EngineType = engine_list_cpu::mt2203>
 class daal_engine {
@@ -77,6 +78,8 @@ class daal_engine {
                 return daal::algorithms::engines::mcg59::Batch<>::create(seed);
             case engine_list_cpu::mrg32k3a:
                 return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
+            case engine_list_cpu::philox4x32x10:
+                return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
             case engine_list_cpu::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 029fec3896c..7e8a69eba98 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -95,15 +95,18 @@ void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
 #define INSTANTIATE_FLOAT_(Size)                     \
     INSTANTIATE_(float, Size, engine_list::mt2203)   \
     INSTANTIATE_(float, Size, engine_list::mcg59)    \
-    INSTANTIATE_(float, Size, engine_list::mrg32k3a)    \
+    INSTANTIATE_(float, Size, engine_list::mrg32k3a) \
+    INSTANTIATE_(float, Size, engine_list::philox4x32x10) \
     INSTANTIATE_(float, Size, engine_list::mt19937)  \
     INSTANTIATE_(double, Size, engine_list::mt2203)  \
     INSTANTIATE_(double, Size, engine_list::mcg59)   \
     INSTANTIATE_(double, Size, engine_list::mrg32k3a)   \
+    INSTANTIATE_(double, Size, engine_list::philox4x32x10)   \
     INSTANTIATE_(double, Size, engine_list::mt19937) \
     INSTANTIATE_(int, Size, engine_list::mt2203)     \
     INSTANTIATE_(int, Size, engine_list::mcg59)      \
     INSTANTIATE_(int, Size, engine_list::mrg32k3a)      \
+    INSTANTIATE_(int, Size, engine_list::philox4x32x10)      \
     INSTANTIATE_(int, Size, engine_list::mt19937)
 
 INSTANTIATE_FLOAT_(std::int64_t);
@@ -121,14 +124,17 @@ INSTANTIATE_FLOAT_(std::int32_t);
     INSTANTIATE_CPU(float, Size, engine_list::mt2203)   \
     INSTANTIATE_CPU(float, Size, engine_list::mcg59)    \
     INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a)    \
+    INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10)    \
     INSTANTIATE_CPU(float, Size, engine_list::mt19937)  \
     INSTANTIATE_CPU(double, Size, engine_list::mt2203)  \
     INSTANTIATE_CPU(double, Size, engine_list::mcg59)   \
     INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a)   \
+    INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10)   \
     INSTANTIATE_CPU(double, Size, engine_list::mt19937) \
     INSTANTIATE_CPU(int, Size, engine_list::mt2203)     \
     INSTANTIATE_CPU(int, Size, engine_list::mcg59)      \
     INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a)      \
+    INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10)      \
     INSTANTIATE_CPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_FLOAT_CPU(std::int64_t);
@@ -149,14 +155,17 @@ INSTANTIATE_FLOAT_CPU(std::int32_t);
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)   \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)    \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)  \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)  \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)   \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::philox4x32x10) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)     \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)      \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10) \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
@@ -174,6 +183,7 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203)   \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)    \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::philox4x32x10) \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937)
 
 INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
index dd7bffd68d7..1f0f5c65225 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
@@ -23,6 +23,7 @@
 #include <daal/include/algorithms/engines/mt2203/mt2203.h>
 #include <daal/include/algorithms/engines/mcg59/mcg59.h>
 #include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
+#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include "oneapi/dal/table/common.hpp"
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
index 9cf27a2f4ee..a90b66c49a7 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
@@ -19,6 +19,7 @@
 #include <daal/include/algorithms/engines/mt2203/mt2203.h>
 #include <daal/include/algorithms/engines/mcg59/mcg59.h>
 #include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
+#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include <oneapi/mkl.hpp>
@@ -27,7 +28,7 @@ namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a};
+enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10};
 
 template <engine_list EngineType>
 struct oneapi_engine_type;
@@ -52,6 +53,11 @@ struct oneapi_engine_type<engine_list::mrg32k3a> {
     using type = oneapi::mkl::rng::mrg32k3a;
 };
 
+template <>
+struct oneapi_engine_type<engine_list::philox4x32x10> {
+    using type = oneapi::mkl::rng::philox4x32x10;
+};
+
 template <engine_list EngineType = engine_list::mt2203>
 class oneapi_engine {
 public:
@@ -101,6 +107,7 @@ class oneapi_engine {
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
             case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
             case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
+            case engine_list::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
             case engine_list::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index dd16eb3d3dc..bdefc472a58 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -28,6 +28,7 @@ class mt2203 {};
 class mcg59 {};
 class mrg32k3a {};
 class mt19937 {};
+class philox4x32x10 {};
 
 template <typename engine_type>
 struct engine_map {};
@@ -47,6 +48,11 @@ struct engine_map<mrg32k3a> {
     constexpr static auto value = engine_list::mrg32k3a;
 };
 
+template <>
+struct engine_map<philox4x32x10> {
+    constexpr static auto value = engine_list::philox4x32x10;
+};
+
 template <>
 struct engine_map<mt19937> {
     constexpr static auto value = engine_list::mt19937;
@@ -139,7 +145,7 @@ class rng_test : public te::policy_fixture {
     }
 };
 
-using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a));
+using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10));
 
 TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     SKIP_IF(this->get_policy().is_cpu());
@@ -160,7 +166,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     this->check_results(arr_gpu, arr_host);
 }
 
-using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a));
+using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10));
 
 // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) {
 //     SKIP_IF(this->get_policy().is_cpu());
diff --git a/docs/source/daal/algorithms/engines/index.rst b/docs/source/daal/algorithms/engines/index.rst
index 1c476178dc9..34113a6dd38 100644
--- a/docs/source/daal/algorithms/engines/index.rst
+++ b/docs/source/daal/algorithms/engines/index.rst
@@ -114,4 +114,5 @@ These methods are represented with member functions of classes that represent fu
     mt19937.rst
     mcg59.rst
     mrg32k3a.rst
+    philox4x32x10.rst
     mt2203.rst
diff --git a/makefile.lst b/makefile.lst
index db26829caef..b042ede80a7 100755
--- a/makefile.lst
+++ b/makefile.lst
@@ -65,7 +65,7 @@ multiclassclassifier += classifier
 k_nearest_neighbors += engines classifier
 logistic_regression += classifier optimization_solver objective_function engines
 implicit_als += engines distributions
-engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/mt2203
+engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/philox4x32x10 engines/mt2203
 distributions += distributions/bernoulli distributions/normal distributions/uniform
 tsne +=
 
@@ -96,6 +96,7 @@ CORE.ALGORITHMS.FULL :=                                                       \
     engines                                                                   \
     engines/mcg59                                                             \
     engines/mrg32k3a                                                          \
+    engines/philox4x32x10                                                     \
     engines/mt19937                                                           \
     engines/mt2203                                                            \
     em                                                                        \
@@ -311,6 +312,7 @@ JJ.ALGORITHMS       := adaboost
                        engines                                                   \
                        engines/mcg59                                             \
                        engines/mrg32k3a                                          \
+                       engines/philox4x32x10                                     \
                        engines/mt19937                                           \
                        engines/mt2203                                            \
                        em_gmm                                                    \

From 4941b236ff57f3922a80551118e7ffa0e2969cef Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Fri, 15 Nov 2024 02:20:22 -0800
Subject: [PATCH 06/18] a lot of fixes with rng

---
 .../algorithms/engines/mrg32k3a/mrg32k3a.h    |   3 +-
 .../philox4x32x10/philox4x32x10_batch_impl.h  |   2 +-
 cpp/daal/src/externals/service_rng_ref.h      |   5 +-
 .../vertex_partitioning_default_kernel.hpp    |   2 +-
 .../backend/gpu/train_kernel_hist_impl.hpp    |   2 +-
 .../gpu/train_kernel_hist_impl_dpc.cpp        |   8 +-
 .../algo/louvain/backend/cpu/louvain_data.hpp |   2 +-
 .../vertex_partitioning_default_kernel.hpp    |   2 +-
 .../optimizers/test/newton_cg_dpc.cpp         |   4 +-
 .../dal/backend/primitives/rng/rng_cpu.hpp    |  34 +++---
 .../dal/backend/primitives/rng/rng_dpc.cpp    |  16 +--
 .../dal/backend/primitives/rng/rng_engine.hpp | 101 -----------------
 .../primitives/rng/rng_engine_collection.hpp  |  16 +--
 .../dal/backend/primitives/rng/rng_gpu.hpp    |  54 +++++----
 .../dal/backend/primitives/rng/rng_types.hpp  |  27 +++++
 .../backend/primitives/rng/test/rng_dpc.cpp   | 104 +++++++-----------
 16 files changed, 134 insertions(+), 248 deletions(-)
 delete mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp
 create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp

diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
index df6c1edf414..c35eb6a9f09 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
@@ -17,7 +17,8 @@
 
 /*
 //++
-//  Implementation of the Mersenne Twister engine in the batch processing mode
+//  Implementation of the 32-bit combined multiple recursive generator with two components of order 3
+//  in the batch processing mode.
 //--
 */
 
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
index 8495fb3b883..45e7f759729 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
@@ -96,7 +96,7 @@ class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch<a
         {
         case engines::internal::family: return false;
         case engines::internal::skipahead: return true;
-        case engines::internal::leapfrog: return true;
+        case engines::internal::leapfrog: return false;
         }
         return false;
     }
diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h
index 3890d0bdf2b..0445614b3e3 100644
--- a/cpp/daal/src/externals/service_rng_ref.h
+++ b/cpp/daal/src/externals/service_rng_ref.h
@@ -39,9 +39,8 @@
     #define __DAAL_BRNG_MT2203  (1 << 20) * 9 //VSL_BRNG_MT2203
     #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937
     #define __DAAL_BRNG_MCG59   (1 << 20) * 4 //VSL_BRNG_MCG59
-    //tmp
-    #define __DAAL_BRNG_MRG32K3A  (1 << 20) * 4 //VSL_BRNG_MRG32K3A
-    #define __DAAL_BRNG_PHILOX4X32X10  (1 << 20) * 4 //VSL_BRNG_MRG32K3A
+    #define __DAAL_BRNG_MRG32K3A  (1 << 20) * 3 //VSL_BRNG_MRG32K3A
+    #define __DAAL_BRNG_PHILOX4X32X10  (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10
     #define __DAAL_RNG_METHOD_UNIFORM_STD         0 //VSL_RNG_METHOD_UNIFORM_STD
     #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   4
     #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      0 //VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
index 218f7da46bc..55087df26af 100644
--- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -92,7 +92,7 @@ std::int32_t most_frequent_element(const std::atomic<std::int32_t> *components,
 
     dal::backend::primitives::daal_engine eng;
     dal::backend::primitives::daal_rng<std::int32_t> rn_gen;
-    rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_cpu_engine_state(), 0, vertex_count);
+    rn_gen.uniform(samples_count, rnd_vertex_ids, eng, 0, vertex_count);
 
     std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count);
 
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
index 84e1d8f620f..1db5e078773 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
@@ -50,7 +50,7 @@ class train_kernel_hist_impl {
     using model_manager_t = train_model_manager<Float, Index, Task>;
     using train_context_t = train_context<Float, Index, Task>;
     using imp_data_t = impurity_data<Float, Index, Task>;
-    using rng_engine_t = pr::daal_engine<pr::engine_list_cpu::mt2203>;
+    using rng_engine_t = pr::daal_engine<pr::engine_list::mt2203>;
     using rng_engine_list_t = std::vector<rng_engine_t>;
     using msg = dal::detail::error_messages;
     using comm_t = bk::communicator<spmd::device_memory_access::usm>;
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index 10197bf0c43..42355b6caf5 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -401,7 +401,7 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
             rn_gen.uniform(ctx.selected_row_total_count_,
                            gen_row_idx_global_ptr,
-                           rng_engine_list[engine_offset + node_idx].get_cpu_engine_state(),
+                           rng_engine_list[engine_offset + node_idx],
                            0,
                            ctx.row_total_count_);
 
@@ -491,7 +491,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
                 ctx.selected_ftr_count_,
                 selected_features_host_ptr + node * ctx.selected_ftr_count_,
                 selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_,
-                rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(),
+                rng_engine_list[tree_map_ptr[node]],
                 0,
                 ctx.column_count_);
         }
@@ -539,7 +539,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
     for (Index node = 0; node < node_count; ++node) {
         rn_gen.uniform(ctx.selected_ftr_count_,
                        random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                       rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(),
+                       rng_engine_list[tree_map_ptr[node]],
                        0.0f,
                        1.0f);
     }
@@ -1666,7 +1666,7 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
                 rn_gen.shuffle(
                     oob_row_count,
                     permutation_ptr,
-                    engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state());
+                    engine_arr[built_tree_count + tree_idx_in_block]);
                 const Float oob_err_perm = compute_oob_error_perm(ctx,
                                                                   model_manager,
                                                                   data_host,
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
index b016a5bf6e9..d2751b3840b 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
@@ -123,7 +123,7 @@ struct louvain_data {
     // Total link weight in the network
     value_type m;
 
-    daal_engine<engine_list_cpu::mt2203> eng;
+    daal_engine<engine_list::mt2203> eng;
     daal_rng<std::int32_t> rn_gen;
 
     const std::int64_t vertex_count;
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
index 7b277d88283..ff78f06f833 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology<IndexType>& t,
         ld.random_order[index] = index;
     }
     // random shuffle
-    ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_cpu_engine_state(), 0, t._vertex_count);
+    ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
     for (std::int64_t index = 0; index < t._vertex_count; ++index) {
         std::swap(ld.random_order[index], ld.random_order[ld.index[index]]);
     }
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index d4f5ea55fb9..b24a59386c7 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto b_host = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         primitives::rng<float_t> rn_gen;
         primitives::engine eng(4014 + n_);
-        rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
+        rn_gen.uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0));
 
@@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto buffer = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         for (std::int32_t test_num = 0; test_num < 5; ++test_num) {
-            rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0);
+            rn_gen.uniform(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
             auto x_gpu = x_host.to_device(this->get_queue());
             auto compute_event_vec = func_->update_x(x_gpu, true, {});
             wait_or_pass(compute_event_vec).wait_and_throw();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
index 6b517b73c17..2cb18c72c1f 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
@@ -16,21 +16,15 @@
 
 #pragma once
 
-#include <daal/include/algorithms/engines/mt2203/mt2203.h>
-#include <daal/include/algorithms/engines/mcg59/mcg59.h>
-#include <daal/include/algorithms/engines/mt19937/mt19937.h>
-#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
-#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include <stdexcept>
 #include <type_traits>
 #include <utility>
 #include "oneapi/dal/backend/primitives/rng/rng.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
 namespace oneapi::dal::backend::primitives {
 
-enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 };
-
-template <engine_list_cpu EngineType = engine_list_cpu::mt2203>
+template <engine_list EngineType = engine_list::mt2203>
 class daal_engine {
 public:
     explicit daal_engine(std::int64_t seed = 777)
@@ -72,15 +66,15 @@ class daal_engine {
 private:
     daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) {
         switch (EngineType) {
-            case engine_list_cpu::mt2203:
+            case engine_list::mt2203:
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
-            case engine_list_cpu::mcg59:
+            case engine_list::mcg59:
                 return daal::algorithms::engines::mcg59::Batch<>::create(seed);
-            case engine_list_cpu::mrg32k3a:
+            case engine_list::mrg32k3a:
                 return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
-            case engine_list_cpu::philox4x32x10:
+            case engine_list::philox4x32x10:
                 return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
-            case engine_list_cpu::mt19937:
+            case engine_list::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
         }
@@ -96,16 +90,20 @@ class daal_rng {
     daal_rng() = default;
     ~daal_rng() = default;
 
-    void uniform(Size count, Type* dst, void* state, Type a, Type b) {
+    template <engine_list EngineType>
+    void uniform(Size count, Type* dst, daal_engine<EngineType> daal_engine, Type a, Type b) {
+        auto state = daal_engine.get_cpu_engine_state();
         uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
     }
 
+    template <engine_list EngineType>
     void uniform_without_replacement_cpu(Size count,
                                          Type* dst,
                                          Type* buffer,
-                                         void* state,
+                                         daal_engine<EngineType> daal_engine,
                                          Type a,
                                          Type b) {
+        auto state = daal_engine.get_cpu_engine_state();
         uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
                                                                      dst,
                                                                      buffer,
@@ -114,10 +112,10 @@ class daal_rng {
                                                                      b);
     }
 
-    template <typename T = Type, typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle(Size count, Type* dst, void* state) {
+    template <engine_list EngineType, typename T = Type, typename = std::enable_if_t<std::is_integral_v<T>>>
+    void shuffle(Size count, Type* dst, daal_engine<EngineType> daal_engine) {
         Type idx[2];
-
+        auto state = daal_engine.get_cpu_engine_state();
         for (Size i = 0; i < count; ++i) {
             uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
             std::swap(dst[idx[0]], dst[idx[1]]);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 7e8a69eba98..68a8eabaa0a 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -27,7 +27,7 @@ template <engine_list EngineType>
 void oneapi_rng<Type, Size>::uniform_gpu(sycl::queue& queue,
                                          Size count,
                                          Type* dst,
-                                         oneapi_engine<EngineType>& engine_,
+                                         onedal_engine<EngineType>& engine_,
                                          Type a,
                                          Type b,
                                          const event_vector& deps) {
@@ -41,7 +41,7 @@ template <typename Type, typename Size>
 template <engine_list EngineType>
 void oneapi_rng<Type, Size>::uniform_cpu(Size count,
                                          Type* dst,
-                                         oneapi_engine<EngineType>& engine_,
+                                         onedal_engine<EngineType>& engine_,
                                          Type a,
                                          Type b) {
     void* state = engine_.get_cpu_engine_state();
@@ -55,7 +55,7 @@ void oneapi_rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
                                                              Size count,
                                                              Type* dst,
                                                              Type* buffer,
-                                                             oneapi_engine<EngineType>& engine_,
+                                                             onedal_engine<EngineType>& engine_,
                                                              Type a,
                                                              Type b,
                                                              const event_vector& deps) {
@@ -69,7 +69,7 @@ template <engine_list EngineType>
 void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
                                          Size count,
                                          Type* dst,
-                                         oneapi_engine<EngineType>& engine_,
+                                         onedal_engine<EngineType>& engine_,
                                          const event_vector& deps) {
     Type idx[2];
 
@@ -87,7 +87,7 @@ void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
         sycl::queue& queue,                                       \
         Size count_,                                              \
         F* dst,                                                   \
-        oneapi_engine<EngineType>& engine_,                       \
+        onedal_engine<EngineType>& engine_,                       \
         F a,                                                      \
         F b,                                                      \
         const event_vector& deps);
@@ -116,7 +116,7 @@ INSTANTIATE_FLOAT_(std::int32_t);
     template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_cpu( \
         Size count_,                                              \
         F* dst,                                                   \
-        oneapi_engine<EngineType>& engine_,                       \
+        onedal_engine<EngineType>& engine_,                       \
         F a,                                                      \
         F b);
 
@@ -146,7 +146,7 @@ INSTANTIATE_FLOAT_CPU(std::int32_t);
         Size count_,                                                                  \
         F* dst,                                                                       \
         F* buff,                                                                      \
-        oneapi_engine<EngineType>& engine_,                                           \
+        onedal_engine<EngineType>& engine_,                                           \
         F a,                                                                          \
         F b,                                                                          \
         const event_vector& deps);
@@ -176,7 +176,7 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
         sycl::queue& queue,                                       \
         Size count_,                                              \
         F* dst,                                                   \
-        oneapi_engine<EngineType>& engine_,                       \
+        onedal_engine<EngineType>& engine_,                       \
         const event_vector& deps);
 
 #define INSTANTIATE_SHUFFLE_FLOAT(Size)                   \
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp
deleted file mode 100644
index c8ca3b13ce9..00000000000
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#pragma once
-
-#include <daal/include/algorithms/engines/mt2203/mt2203.h>
-
-#include "oneapi/dal/backend/primitives/rng/utils.hpp"
-
-namespace oneapi::dal::backend::primitives {
-
-template <typename Type, typename Size = std::int64_t>
-class rng {
-public:
-    rng() = default;
-    ~rng() = default;
-
-    void uniform(Size count, Type* dst, void* state, Type a, Type b) {
-        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
-    }
-
-    void uniform_without_replacement(Size count,
-                                     Type* dst,
-                                     Type* buffer,
-                                     void* state,
-                                     Type a,
-                                     Type b) {
-        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
-                                                                     dst,
-                                                                     buffer,
-                                                                     state,
-                                                                     a,
-                                                                     b);
-    }
-
-    template <typename T = Type, typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle(Size count, Type* dst, void* state) {
-        Type idx[2];
-
-        for (Size i = 0; i < count; ++i) {
-            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
-            std::swap(dst[idx[0]], dst[idx[1]]);
-        }
-    }
-
-private:
-    daal::internal::RNGsInst<Type, DAAL_BASE_CPU> daal_rng_;
-};
-
-class engine {
-public:
-    explicit engine(std::int64_t seed = 777)
-            : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) {
-        impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(engine_.get());
-        if (!impl_) {
-            throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
-        }
-    }
-
-    explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) {
-        impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(eng.get());
-        if (!impl_) {
-            throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
-        }
-    }
-
-    virtual ~engine() = default;
-
-    engine& operator=(const daal::algorithms::engines::EnginePtr& eng) {
-        engine_ = eng;
-        impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(eng.get());
-        if (!impl_) {
-            throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
-        }
-
-        return *this;
-    }
-
-    void* get_state() const {
-        return impl_->getState();
-    }
-
-private:
-    daal::algorithms::engines::EnginePtr engine_;
-    daal::algorithms::engines::internal::BatchBaseImpl* impl_;
-};
-
-} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
index 1f0f5c65225..76c56f61f7c 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
@@ -20,12 +20,8 @@
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include <vector>
 
-#include <daal/include/algorithms/engines/mt2203/mt2203.h>
-#include <daal/include/algorithms/engines/mcg59/mcg59.h>
-#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
-#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
-#include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
 #include "oneapi/dal/table/common.hpp"
 
 namespace oneapi::dal::backend::primitives {
@@ -43,7 +39,7 @@ class engine_collection {
               daal_engine_list_(count) {}
 
     template <typename Op>
-    std::vector<daal_engine<engine_list_cpu::mt2203>> operator()(Op&& op) {
+    std::vector<daal_engine<engine_list::mt2203>> operator()(Op&& op) {
         daal::services::Status status;
         for (Size i = 0; i < count_; ++i) {
             op(i, params_.nSkip[i]);
@@ -59,7 +55,7 @@ class engine_collection {
             dal::backend::interop::status_to_exception(status);
         }
 
-        std::vector<daal_engine<engine_list_cpu::mt2203>> engine_list(count_);
+        std::vector<daal_engine<engine_list::mt2203>> engine_list(count_);
         for (Size i = 0; i < count_; ++i) {
             engine_list[i] = daal_engine_list_[i];
         }
@@ -108,18 +104,18 @@ class engine_collection_oneapi {
               seed_(seed) {
         engines_.reserve(count_);
         for (Size i = 0; i < count_; ++i) {
-            engines_.push_back(oneapi_engine<EngineType>(queue, seed_));
+            engines_.push_back(onedal_engine<EngineType>(queue, seed_));
         }
     }
 
-    std::vector<oneapi_engine<EngineType>> get_engines() const {
+    std::vector<onedal_engine<EngineType>> get_engines() const {
         return engines_;
     }
 
 private:
     Size count_;
     std::int64_t seed_;
-    std::vector<oneapi_engine<EngineType>> engines_;
+    std::vector<onedal_engine<EngineType>> engines_;
 };
 
 #endif
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
index a90b66c49a7..a68df41c541 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
@@ -16,57 +16,51 @@
 
 #pragma once
 
-#include <daal/include/algorithms/engines/mt2203/mt2203.h>
-#include <daal/include/algorithms/engines/mcg59/mcg59.h>
-#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
-#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
-#include <daal/include/algorithms/engines/mt19937/mt19937.h>
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
 #include <oneapi/mkl.hpp>
 namespace mkl = oneapi::mkl;
 namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10};
-
 template <engine_list EngineType>
-struct oneapi_engine_type;
+struct onedal_engine_type;
 
 template <>
-struct oneapi_engine_type<engine_list::mt2203> {
+struct onedal_engine_type<engine_list::mt2203> {
     using type = oneapi::mkl::rng::mt2203;
 };
 
 template <>
-struct oneapi_engine_type<engine_list::mcg59> {
+struct onedal_engine_type<engine_list::mcg59> {
     using type = oneapi::mkl::rng::mcg59;
 };
 
 template <>
-struct oneapi_engine_type<engine_list::mt19937> {
+struct onedal_engine_type<engine_list::mt19937> {
     using type = oneapi::mkl::rng::mt19937;
 };
 
 template <>
-struct oneapi_engine_type<engine_list::mrg32k3a> {
+struct onedal_engine_type<engine_list::mrg32k3a> {
     using type = oneapi::mkl::rng::mrg32k3a;
 };
 
 template <>
-struct oneapi_engine_type<engine_list::philox4x32x10> {
+struct onedal_engine_type<engine_list::philox4x32x10> {
     using type = oneapi::mkl::rng::philox4x32x10;
 };
 
 template <engine_list EngineType = engine_list::mt2203>
-class oneapi_engine {
+class onedal_engine {
 public:
-    using onedal_engine_t = typename oneapi_engine_type<EngineType>::type;
+    using onedal_engine_t = typename onedal_engine_type<EngineType>::type;
 
-    explicit oneapi_engine(sycl::queue& queue, std::int64_t seed = 777)
+    explicit onedal_engine(sycl::queue& queue, std::int64_t seed = 777)
             : q(queue),
               daal_engine_(initialize_daal_engine(seed)),
-              onedal_engine_(initialize_oneapi_engine(queue, seed)),
+              onedal_engine_(initialize_onedal_engine(queue, seed)),
               impl_(dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(
                   daal_engine_.get())) {
         if (!impl_) {
@@ -74,7 +68,7 @@ class oneapi_engine {
         }
     }
 
-    virtual ~oneapi_engine() = default;
+    virtual ~onedal_engine() = default;
 
     void* get_cpu_engine_state() const {
         return impl_->getState();
@@ -93,6 +87,7 @@ class oneapi_engine {
     }
 
     void skip_ahead_gpu(size_t nSkip) {
+        // Will be fixed in the next oneMKL release.
         if constexpr (EngineType == engine_list::mt2203) {
         }
         else {
@@ -114,10 +109,10 @@ class oneapi_engine {
         }
     }
 
-    onedal_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) {
+    onedal_engine_t initialize_onedal_engine(sycl::queue& queue, std::int64_t seed) {
         if constexpr (EngineType == engine_list::mt2203) {
             return onedal_engine_t(queue, seed,
-                                   0); // Aligns CPU and GPU results for mt2203
+                                   0); // Aligns CPU and GPU results for mt2203, impacts the performance.
         }
         else {
             return onedal_engine_t(queue, seed);
@@ -139,7 +134,7 @@ class oneapi_rng {
     void uniform(sycl::queue& queue,
                  Size count,
                  Type* dst,
-                 oneapi_engine<EngineType>& engine_,
+                 onedal_engine<EngineType>& engine_,
                  Type a,
                  Type b,
                  bool distr_mode = false,
@@ -149,18 +144,19 @@ class oneapi_rng {
     void uniform_gpu(sycl::queue& queue,
                      Size count,
                      Type* dst,
-                     oneapi_engine<EngineType>& engine_,
+                     onedal_engine<EngineType>& engine_,
                      Type a,
                      Type b,
                      const event_vector& deps = {});
 
     template <engine_list EngineType>
-    void uniform_cpu(Size count, Type* dst, oneapi_engine<EngineType>& engine_, Type a, Type b);
+    void uniform_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_, Type a, Type b);
+
     template <engine_list EngineType>
     void uniform_without_replacement(sycl::queue& queue,
                                      Size count,
                                      Type* dst,
-                                     oneapi_engine<EngineType>& engine_,
+                                     onedal_engine<EngineType>& engine_,
                                      Type a,
                                      Type b,
                                      const event_vector& deps = {}) {}
@@ -170,7 +166,7 @@ class oneapi_rng {
                                          Size count,
                                          Type* dst,
                                          Type* buff,
-                                         oneapi_engine<EngineType>& engine_,
+                                         onedal_engine<EngineType>& engine_,
                                          Type a,
                                          Type b,
                                          const event_vector& deps = {});
@@ -179,7 +175,7 @@ class oneapi_rng {
     void uniform_without_replacement_cpu(Size count,
                                          Type* dst,
                                          Type* buffer,
-                                         oneapi_engine<EngineType>& engine_,
+                                         onedal_engine<EngineType>& engine_,
                                          Type a,
                                          Type b) {
         void* state = engine_.get_cpu_engine_state();
@@ -195,7 +191,7 @@ class oneapi_rng {
     template <engine_list EngineType,
               typename T = Type,
               typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle(Size count, Type* dst, oneapi_engine<EngineType>& engine_) {
+    void shuffle(Size count, Type* dst, onedal_engine<EngineType>& engine_) {
         Type idx[2];
 
         void* state = engine_.get_cpu_engine_state();
@@ -211,13 +207,13 @@ class oneapi_rng {
     void shuffle_gpu(sycl::queue& queue,
                      Size count,
                      Type* dst,
-                     oneapi_engine<EngineType>& engine_,
+                     onedal_engine<EngineType>& engine_,
                      const event_vector& deps);
 
     template <engine_list EngineType,
               typename T = Type,
               typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle_cpu(Size count, Type* dst, oneapi_engine<EngineType>& engine_) {
+    void shuffle_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_) {
         Type idx[2];
 
         void* state = engine_.get_cpu_engine_state();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
new file mode 100644
index 00000000000..7c1691e5e85
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
@@ -0,0 +1,27 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include <daal/include/algorithms/engines/mt2203/mt2203.h>
+#include <daal/include/algorithms/engines/mcg59/mcg59.h>
+#include <daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h>
+#include <daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h>
+#include <daal/include/algorithms/engines/mt19937/mt19937.h>
+
+namespace oneapi::dal::backend::primitives {
+enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 };
+}
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index bdefc472a58..d3303dc61f6 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -64,83 +64,42 @@ constexpr auto engine_v = engine_map<engine_type>::value;
 template <typename TestType>
 class rng_test : public te::policy_fixture {
 public:
-    using Index = std::tuple_element_t<0, TestType>;
+    using DataType = std::tuple_element_t<0, TestType>;
     using EngineType = std::tuple_element_t<1, TestType>;
     static constexpr auto engine_qq = engine_v<EngineType>;
 
     auto get_rng() const {
-        oneapi_rng<Index> rn_gen;
+        oneapi_rng<DataType> rn_gen;
         return rn_gen;
     }
 
     auto get_engine(std::int64_t seed) {
-        auto rng_engine = oneapi_engine<engine_qq>(this->get_queue(), seed);
+        auto rng_engine = onedal_engine<engine_qq>(this->get_queue(), seed);
         return rng_engine;
     }
 
-    auto allocate_arrays(std::int64_t elem_count) {
-        auto& q = this->get_queue();
-        auto val_gpu = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
-        auto val_host = ndarray<Index, 1>::empty({ elem_count });
-
-        return std::make_tuple(val_gpu, val_host);
-    }
+    auto allocate_array_host(std::int64_t elem_count) {
+        auto arr_host = ndarray<DataType, 1>::empty({ elem_count });
 
-    auto allocate_arrays_shared(std::int64_t elem_count) {
-        auto& q = this->get_queue();
-        auto val_gpu = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::shared);
-        auto val_host = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::shared);
-
-        return std::make_tuple(val_gpu, val_host);
+        return arr_host;
     }
 
-    auto allocate_arrays_device(std::int64_t elem_count) {
+    auto allocate_array_device(std::int64_t elem_count) {
         auto& q = this->get_queue();
-        auto val_gpu_1 = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
-        auto val_gpu_2 = ndarray<Index, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
-
-        return std::make_tuple(val_gpu_1, val_gpu_2);
-    }
-
-    auto allocate_arrays_host(std::int64_t elem_count) {
-        auto val_host_1 = ndarray<Index, 1>::empty({ elem_count });
-        auto val_host_2 = ndarray<Index, 1>::empty({ elem_count });
-
-        return std::make_tuple(val_host_1, val_host_2);
-    }
-
-    void check_results_host(const ndarray<Index, 1>& val_host_1,
-                            const ndarray<Index, 1>& val_host_2) {
-        const Index* val_host_1_ptr = val_host_1.get_data();
+        auto arr_gpu = ndarray<DataType, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
 
-        const Index* val_host_2_ptr = val_host_2.get_data();
-
-        for (std::int64_t el = 0; el < val_host_1.get_count(); el++) {
-            REQUIRE(abs(val_host_1_ptr[el] - val_host_2_ptr[el]) < 1);
-        }
+        return arr_gpu;
     }
 
-    void check_results_device(const ndarray<Index, 1>& val_gpu_1,
-                              const ndarray<Index, 1>& val_gpu_2) {
-        const auto val_gpu_host_1 = val_gpu_1.to_host(this->get_queue());
-        const Index* val_gpu_host_1_ptr = val_gpu_host_1.get_data();
+    void check_results(const ndarray<DataType, 1>& arr_1, const ndarray<DataType, 1>& arr_2) {
+        const auto arr_1_host = arr_1.to_host(this->get_queue());
+        const DataType* val_arr_1_host_ptr = arr_1_host.get_data();
 
-        const auto val_gpu_host_2 = val_gpu_2.to_host(this->get_queue());
-        const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data();
+        const auto arr_2_host = arr_2.to_host(this->get_queue());
+        const DataType* val_arr_2_host_ptr = arr_2_host.get_data();
 
-        for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) {
-            REQUIRE(abs(val_gpu_host_2_ptr[el] - val_gpu_host_1_ptr[el]) < 1);
-        }
-    }
-
-    void check_results(const ndarray<Index, 1>& val_gpu, const ndarray<Index, 1>& val_host) {
-        const Index* val_host_ptr = val_host.get_data();
-
-        const auto val_gpu_host = val_gpu.to_host(this->get_queue());
-        const Index* val_gpu_host_ptr = val_gpu_host.get_data();
-
-        for (std::int64_t el = 0; el < val_host.get_count(); el++) {
-            REQUIRE(abs(val_gpu_host_ptr[el] - val_host_ptr[el]) < 1);
+        for (std::int64_t el = 0; el < arr_2_host.get_count(); el++) {
+            REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 1);
         }
     }
 };
@@ -152,7 +111,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000);
     std::int64_t seed = GENERATE_COPY(777, 999);
 
-    auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+    auto arr_gpu = this->allocate_array_device(elem_count);
+    auto arr_host = this->allocate_array_host(elem_count);
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
@@ -166,8 +126,9 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     this->check_results(arr_gpu, arr_host);
 }
 
-using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10));
+using rng_types_skip_ahead_support = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10));
 
+//Just for perf tests
 // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) {
 //     SKIP_IF(this->get_policy().is_cpu());
 //     std::int64_t elem_count =
@@ -192,13 +153,17 @@ using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4
 
 // }
 
-TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) {
+TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
     std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000);
     std::int64_t seed = GENERATE_COPY(777, 999);
 
-    auto [arr_host_init_1, arr_host_init_2] = this->allocate_arrays_host(elem_count);
-    auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+    auto arr_host_init_1 = this->allocate_array_host(elem_count);
+    auto arr_host_init_2 = this->allocate_array_host(elem_count);
+
+    auto arr_gpu = this->allocate_array_device(elem_count);
+    auto arr_host = this->allocate_array_host(elem_count);
+
     auto arr_host_init_1_ptr = arr_host_init_1.get_mutable_data();
     auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data();
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
@@ -214,17 +179,21 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) {
     rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
     rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
-    this->check_results_host(arr_host_init_1, arr_host_init_2);
+    this->check_results(arr_host_init_1, arr_host_init_2);
     this->check_results(arr_gpu, arr_host);
 }
 
-TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) {
+TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
     std::int64_t seed = GENERATE_COPY(1, 777, 999);
 
-    auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_device(elem_count);
-    auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count);
+    auto arr_device_init_1 = this->allocate_array_device(elem_count);
+    auto arr_device_init_2 = this->allocate_array_device(elem_count);
+
+    auto arr_gpu = this->allocate_array_device(elem_count);
+    auto arr_host = this->allocate_array_host(elem_count);
+
     auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data();
     auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data();
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
@@ -250,10 +219,11 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) {
     rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
     rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
-    this->check_results_device(arr_device_init_1, arr_device_init_2);
+    this->check_results(arr_device_init_1, arr_device_init_2);
     this->check_results(arr_gpu, arr_host);
 }
 
+//TODO: add engine collection test + daal_engine tests
 // TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) {
 //     SKIP_IF(this->get_policy().is_cpu());
 //     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);

From c55bcce40a96d788e3a332610e169ca2f7509874 Mon Sep 17 00:00:00 2001
From: "Solovev, Aleksandr" <aleksandr.solovev@intel.com>
Date: Mon, 18 Nov 2024 13:17:38 +0100
Subject: [PATCH 07/18] fixes

---
 .../algorithms/engines/mrg32k3a/mrg32k3a.h    |  5 ++--
 .../engines/mrg32k3a/mrg32k3a_types.h         |  6 ++--
 .../engines/philox4x32x10/philox4x32x10.h     |  4 ++-
 .../philox4x32x10/philox4x32x10_types.h       |  6 ++--
 cpp/daal/include/daal.h                       |  6 ++--
 cpp/daal/include/daal_win.h                   |  6 ++--
 .../algorithms/engines/mrg32k3a/mrg32k3a.cpp  |  8 ++++--
 .../mrg32k3a/mrg32k3a_batch_container.h       |  3 +-
 .../engines/mrg32k3a/mrg32k3a_batch_impl.h    |  5 ++--
 .../mrg32k3a_dense_default_batch_fpt_cpu.cpp  |  3 +-
 ...k3a_dense_default_batch_fpt_dispatcher.cpp |  3 +-
 .../engines/mrg32k3a/mrg32k3a_impl.i          |  5 ++--
 .../engines/mrg32k3a/mrg32k3a_kernel.h        |  5 ++--
 .../engines/philox4x32x10/philox4x32x10.cpp   |  6 ++--
 .../philox4x32x10_batch_container.h           |  3 +-
 .../philox4x32x10/philox4x32x10_batch_impl.h  |  3 +-
 ...lox4x32x10_dense_default_batch_fpt_cpu.cpp |  3 +-
 ...x10_dense_default_batch_fpt_dispatcher.cpp |  3 +-
 .../philox4x32x10/philox4x32x10_impl.i        |  5 ++--
 .../philox4x32x10/philox4x32x10_kernel.h      |  5 ++--
 cpp/daal/src/externals/service_rng.h          | 28 +++++++++++--------
 cpp/daal/src/externals/service_rng_ref.h      |  1 +
 .../dal/backend/primitives/rng/rng_dpc.cpp    |  2 +-
 .../dal/backend/primitives/rng/rng_types.hpp  |  2 ++
 24 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
index c35eb6a9f09..b794813a227 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
@@ -1,6 +1,7 @@
 /* file: mrg32k3a.h */
 /*******************************************************************************
 * Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,8 +18,8 @@
 
 /*
 //++
-//  Implementation of the 32-bit combined multiple recursive generator with two components of order 3
-//  in the batch processing mode.
+//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator 
+//  with two components of order 3, optimized for batch processing.
 //--
 */
 
diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
index 77ca9656418..a6b9d699f77 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_types.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,8 @@
 
 /*
 //++
-//  Implementation of mrg32k3a engine.
+//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator 
+//  with two components of order 3, optimized for batch processing.
 //--
 */
 
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
index 09eae5a7cd8..ec82723f1f8 100644
--- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
@@ -1,6 +1,7 @@
 /* file: philox4x32x10.h */
 /*******************************************************************************
 * Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,8 @@
 
 /*
 //++
-//  Implementation of the Mersenne Twister engine in the batch processing mode
+//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) 
+//  that applies 10 rounds of mixing to a 4x32-bit counter (keyed by 2x32 bits) to produce high-quality randomness.
 //--
 */
 
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
index d3da7ff32a9..74d2e884670 100644
--- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_types.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,8 @@
 
 /*
 //++
-//  Implementation of philox4x32x10 engine.
+//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) 
+//  that applies 10 rounds of mixing to a 4x32-bit counter (keyed by 2x32 bits) to produce high-quality randomness.
 //--
 */
 
diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h
index 375d9c0b3b8..f49625f8939 100755
--- a/cpp/daal/include/daal.h
+++ b/cpp/daal/include/daal.h
@@ -297,6 +297,9 @@
 #include "algorithms/distributions/bernoulli/bernoulli.h"
 #include "algorithms/distributions/bernoulli/bernoulli_types.h"
 #include "algorithms/engines/engine.h"
+#include "algorithms/engines/engine_family.h"
+#include "algorithms/engines/mt2203/mt2203.h"
+#include "algorithms/engines/mt2203/mt2203_types.h"
 #include "algorithms/engines/mt19937/mt19937.h"
 #include "algorithms/engines/mt19937/mt19937_types.h"
 #include "algorithms/engines/mcg59/mcg59.h"
@@ -305,9 +308,6 @@
 #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
 #include "algorithms/engines/philox4x32x10/philox4x32x10.h"
 #include "algorithms/engines/philox4x32x10/philox4x32x10_types.h"
-#include "algorithms/engines/engine_family.h"
-#include "algorithms/engines/mt2203/mt2203.h"
-#include "algorithms/engines/mt2203/mt2203_types.h"
 #include "algorithms/dbscan/dbscan_types.h"
 #include "algorithms/dbscan/dbscan_batch.h"
 #include "algorithms/dbscan/dbscan_distributed.h"
diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h
index 87b1155a0a4..a15ed7db26e 100755
--- a/cpp/daal/include/daal_win.h
+++ b/cpp/daal/include/daal_win.h
@@ -309,6 +309,9 @@
 #include "algorithms/distributions/bernoulli/bernoulli.h"
 #include "algorithms/distributions/bernoulli/bernoulli_types.h"
 #include "algorithms/engines/engine.h"
+#include "algorithms/engines/engine_family.h"
+#include "algorithms/engines/mt2203/mt2203.h"
+#include "algorithms/engines/mt2203/mt2203_types.h"
 #include "algorithms/engines/mt19937/mt19937.h"
 #include "algorithms/engines/mt19937/mt19937_types.h"
 #include "algorithms/engines/mcg59/mcg59.h"
@@ -317,9 +320,6 @@
 #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
 #include "algorithms/engines/philox4x32x10/philox4x32x10.h"
 #include "algorithms/engines/philox4x32x10/philox4x32x10_types.h"
-#include "algorithms/engines/engine_family.h"
-#include "algorithms/engines/mt2203/mt2203.h"
-#include "algorithms/engines/mt2203/mt2203_types.h"
 #include "algorithms/dbscan/dbscan_types.h"
 #include "algorithms/dbscan/dbscan_batch.h"
 #include "algorithms/dbscan/dbscan_distributed.h"
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
index 288cb0506ee..8f10b1e1e87 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
@@ -1,6 +1,7 @@
 /* file: mrg32k3a.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,12 +17,13 @@
 *******************************************************************************/
 
 //++
-//  Implementation of mrg32k3a engine
+//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator 
+//  with two components of order 3, optimized for batch processing.
 //--
 
 #include "algorithms/engines/mrg32k3a/mrg32k3a.h"
-#include "src/externals/service_dispatch.h"
 #include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h"
+#include "src/externals/service_dispatch.h"
 
 namespace daal
 {
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
index 1fb8f9ca991..31126c4300f 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_batch_container.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
index bbe3cf2dcf9..251caf0d3de 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_batch_impl.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,7 @@
 
 /*
 //++
-//  Implementation of the class defining the mrg32k3a engine
+//  Implementation of the class defining the mrg32k3a engine.
 //--
 */
 
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
index 2af52dd0443..1d3820053bd 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
index 482486e243f..1b3f3c618e9 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
index 5e359ecaaa3..06d670f1f7a 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_impl.i */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,7 @@
 
 /*
 //++
-//  Implementation of mrg32k3a algorithm
+//  Implementation of mrg32k3a algorithm.
 //--
 */
 
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
index 3959576ccbe..86b8d929aae 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
@@ -1,6 +1,7 @@
 /* file: mrg32k3a_kernel.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +17,7 @@
 *******************************************************************************/
 
 //++
-//  Declaration of template function that calculate mrg32k3as.
+//  Declaration of a template function for calculating values using the MRG32k3a generator.
 //--
 
 #ifndef __mrg32k3a_KERNEL_H__
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
index 78b1014663a..969c135a875 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
@@ -1,6 +1,7 @@
 /* file: philox4x32x10.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +17,8 @@
 *******************************************************************************/
 
 //++
-//  Implementation of philox4x32x10 engine
+//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) 
+//  that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness.
 //--
 
 #include "algorithms/engines/philox4x32x10/philox4x32x10.h"
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
index fcffa11e0d7..7a721c4f1a8 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_batch_container.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
index 45e7f759729..fdbc4bd97f0 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_batch_impl.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
index 712bd3f7300..e1ed7b4d896 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
index 225d9f02da1..1f79b94c762 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
index 6c113d179c8..9e2dc9f6b99 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_impl.i */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +18,7 @@
 
 /*
 //++
-//  Implementation of philox4x32x10 algorithm
+//  Implementation of philox4x32x10 algorithm.
 //--
 */
 
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
index 28b689a9ab8..47333a6c78f 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
@@ -1,6 +1,7 @@
 /* file: philox4x32x10_kernel.h */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +17,7 @@
 *******************************************************************************/
 
 //++
-//  Declaration of template function that calculate philox4x32x10s.
+//  Declaration of a template function for generating values using the Philox4x32-10 engine.
 //--
 
 #ifndef __philox4x32x10_KERNEL_H__
diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h
index d03c0f39abf..5974343a865 100644
--- a/cpp/daal/src/externals/service_rng.h
+++ b/cpp/daal/src/externals/service_rng.h
@@ -115,7 +115,12 @@ class RNGs
     int uniformWithoutReplacement(const SizeType n, DstType * r, void * state, const Type a, const Type b,
                                   const int method = __DAAL_RNG_METHOD_UNIFORM_STD)
     {
-        Type * buffer = (Type *)daal_malloc(sizeof(Type) * n);
+        SizeType sequence_size = abs(b-a);
+        Type * buffer = (Type *)daal_malloc(sizeof(Type) * sequence_size);
+        for (SizeType i = 0; i < sequence_size; i++)
+        {
+            buffer[i]=i;
+        }
         int errorcode = uniformWithoutReplacement(n, r, buffer, state, a, b, method);
         daal_free(buffer);
         return errorcode;
@@ -126,19 +131,18 @@ class RNGs
                                   const int method = __DAAL_RNG_METHOD_UNIFORM_STD)
     {
         int errorcode = 0;
+        SizeType sequence_size = abs(b-a);
+        DstType swapIdx;
+        for (SizeType i = 0; i < n; i++)
+        {
+            errorcode = uniform(1, &swapIdx, state, i, n - 1, method);
+            auto tmp = buffer[i];
+            buffer[i] = buffer[swapIdx];
+            buffer[swapIdx] = tmp;
+        }
         for (SizeType i = 0; i < n; i++)
         {
-            errorcode = uniform(1, buffer + i, state, a + i, b, method);
-            int value = buffer[i];
-
-            for (SizeType j = i; j > 0; j--)
-            {
-                if (value == buffer[j - 1])
-                {
-                    value = (DstType)(j - 1 + a);
-                }
-            }
-            r[i] = value;
+            r[i] = buffer[i];
         }
         return errorcode;
     }
diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h
index 0445614b3e3..590f0bb2aae 100644
--- a/cpp/daal/src/externals/service_rng_ref.h
+++ b/cpp/daal/src/externals/service_rng_ref.h
@@ -41,6 +41,7 @@
     #define __DAAL_BRNG_MCG59   (1 << 20) * 4 //VSL_BRNG_MCG59
     #define __DAAL_BRNG_MRG32K3A  (1 << 20) * 3 //VSL_BRNG_MRG32K3A
     #define __DAAL_BRNG_PHILOX4X32X10  (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10
+
     #define __DAAL_RNG_METHOD_UNIFORM_STD         0 //VSL_RNG_METHOD_UNIFORM_STD
     #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   4
     #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      0 //VSL_RNG_METHOD_BERNOULLI_ICDF
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 68a8eabaa0a..c509db057cc 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -189,4 +189,4 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
 INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
 INSTANTIATE_SHUFFLE_FLOAT(std::int32_t);
 
-} // namespace oneapi::dal::backend::primitives
\ No newline at end of file
+} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
index 7c1691e5e85..d502e9282ee 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
@@ -23,5 +23,7 @@
 #include <daal/include/algorithms/engines/mt19937/mt19937.h>
 
 namespace oneapi::dal::backend::primitives {
+
 enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 };
+
 }

From 806a74ce28e09b86545015614acd909c6b7c8524 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Mon, 18 Nov 2024 06:06:51 -0800
Subject: [PATCH 08/18] clang + fisher yates

---
 .../algorithms/engines/mrg32k3a/mrg32k3a.cpp  |  2 +-
 .../engines/mrg32k3a/mrg32k3a_batch_impl.h    |  3 +-
 .../engines/philox4x32x10/philox4x32x10.cpp   |  2 +-
 .../philox4x32x10/philox4x32x10_batch_impl.h  |  3 +-
 cpp/daal/src/externals/service_rng.h          | 32 ++++---
 cpp/daal/src/externals/service_rng_ref.h      | 10 +-
 .../gpu/train_kernel_hist_impl_dpc.cpp        |  7 +-
 .../dal/backend/primitives/rng/rng_cpu.hpp    |  7 +-
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 96 +++++++++----------
 .../dal/backend/primitives/rng/rng_gpu.hpp    | 12 ++-
 .../backend/primitives/rng/test/rng_dpc.cpp   | 56 ++++++-----
 11 files changed, 125 insertions(+), 105 deletions(-)

diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
index 8f10b1e1e87..fe015c85428 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
@@ -17,7 +17,7 @@
 *******************************************************************************/
 
 //++
-//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator 
+//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator
 //  with two components of order 3, optimized for batch processing.
 //--
 
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
index 251caf0d3de..0ff55f39b62 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
@@ -41,7 +41,8 @@ namespace mrg32k3a
 namespace internal
 {
 template <CpuType cpu, typename algorithmFPType = DAAL_ALGORITHM_FP_TYPE, Method method = defaultDense>
-class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch<algorithmFPType, method>, public algorithms::engines::internal::BatchBaseImpl
+class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch<algorithmFPType, method>,
+                  public algorithms::engines::internal::BatchBaseImpl
 {
 public:
     typedef algorithms::engines::mrg32k3a::interface1::Batch<algorithmFPType, method> super1;
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
index 969c135a875..c103a4ae068 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
@@ -17,7 +17,7 @@
 *******************************************************************************/
 
 //++
-//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) 
+//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG)
 //  that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness.
 //--
 
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
index fdbc4bd97f0..f6a9f35e268 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
@@ -41,7 +41,8 @@ namespace philox4x32x10
 namespace internal
 {
 template <CpuType cpu, typename algorithmFPType = DAAL_ALGORITHM_FP_TYPE, Method method = defaultDense>
-class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch<algorithmFPType, method>, public algorithms::engines::internal::BatchBaseImpl
+class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch<algorithmFPType, method>,
+                  public algorithms::engines::internal::BatchBaseImpl
 {
 public:
     typedef algorithms::engines::philox4x32x10::interface1::Batch<algorithmFPType, method> super1;
diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h
index 5974343a865..ba95f8f7324 100644
--- a/cpp/daal/src/externals/service_rng.h
+++ b/cpp/daal/src/externals/service_rng.h
@@ -115,12 +115,7 @@ class RNGs
     int uniformWithoutReplacement(const SizeType n, DstType * r, void * state, const Type a, const Type b,
                                   const int method = __DAAL_RNG_METHOD_UNIFORM_STD)
     {
-        SizeType sequence_size = abs(b-a);
-        Type * buffer = (Type *)daal_malloc(sizeof(Type) * sequence_size);
-        for (SizeType i = 0; i < sequence_size; i++)
-        {
-            buffer[i]=i;
-        }
+        Type * buffer = (Type *)daal_malloc(sizeof(Type) * 1);
         int errorcode = uniformWithoutReplacement(n, r, buffer, state, a, b, method);
         daal_free(buffer);
         return errorcode;
@@ -130,19 +125,45 @@ class RNGs
     int uniformWithoutReplacement(const SizeType n, DstType * r, Type * buffer, void * state, const Type a, const Type b,
                                   const int method = __DAAL_RNG_METHOD_UNIFORM_STD)
     {
-        int errorcode = 0;
-        SizeType sequence_size = abs(b-a);
-        DstType swapIdx;
+        // Partial Fisher-Yates shuffle over the candidate values a, a + 1, ..., b - 1.
+        // `buffer` is no longer used; it stays in the signature so existing callers compile.
+        int errorcode                = 0;
+        const SizeType sequence_size = abs(b - a);
+        if (sequence_size < n)
+        {
+            return -1; // fewer than n candidates: n distinct values cannot be drawn
+        }
+        if (n == 0)
+        {
+            return errorcode; // nothing requested; also avoids a zero-sized allocation
+        }
+        Type * buffer_ = (Type *)daal_malloc(sizeof(Type) * sequence_size);
+        if (!buffer_)
+        {
+            return -1; // allocation failed
+        }
+        for (SizeType i = 0; i < sequence_size; i++)
+        {
+            buffer_[i] = (Type)(a + i); // keep the lower bound a, as the replaced code did
+        }
+        Type swapIdx;
         for (SizeType i = 0; i < n; i++)
         {
-            errorcode = uniform(1, &swapIdx, state, i, n - 1, method);
-            auto tmp = buffer[i];
-            buffer[i] = buffer[swapIdx];
-            buffer[swapIdx] = tmp;
+            // Pick a slot from the tail starting at i (uniform is assumed to exclude the upper bound).
+            errorcode = uniform(1, &swapIdx, state, i, sequence_size, method);
+            if (errorcode != 0)
+            {
+                break; // swapIdx is not trustworthy after a failure; report the first error
+            }
+            const SizeType index = (SizeType)swapIdx;
+
+            std::swap(buffer_[i], buffer_[index]);
         }
         for (SizeType i = 0; i < n; i++)
         {
-            r[i] = buffer[i];
+            r[i] = buffer_[i];
         }
+        // Release the temporary pool; it was previously leaked on every call.
+        daal_free(buffer_);
         return errorcode;
     }
diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h
index 590f0bb2aae..6b7aa53359e 100644
--- a/cpp/daal/src/externals/service_rng_ref.h
+++ b/cpp/daal/src/externals/service_rng_ref.h
@@ -36,11 +36,11 @@
     #include <random>
 
     // RNGs
-    #define __DAAL_BRNG_MT2203  (1 << 20) * 9 //VSL_BRNG_MT2203
-    #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937
-    #define __DAAL_BRNG_MCG59   (1 << 20) * 4 //VSL_BRNG_MCG59
-    #define __DAAL_BRNG_MRG32K3A  (1 << 20) * 3 //VSL_BRNG_MRG32K3A
-    #define __DAAL_BRNG_PHILOX4X32X10  (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10
+    #define __DAAL_BRNG_MT2203        (1 << 20) * 9  //VSL_BRNG_MT2203
+    #define __DAAL_BRNG_MT19937       (1 << 20) * 8  //VSL_BRNG_MT19937
+    #define __DAAL_BRNG_MCG59         (1 << 20) * 4  //VSL_BRNG_MCG59
+    #define __DAAL_BRNG_MRG32K3A      (1 << 20) * 3  //VSL_BRNG_MRG32K3A
+    #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10
 
     #define __DAAL_RNG_METHOD_UNIFORM_STD         0 //VSL_RNG_METHOD_UNIFORM_STD
     #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   4
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index 42355b6caf5..c0ee89c4d64 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -1663,10 +1663,9 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
             pr::daal_rng<Index> rn_gen;
 
             for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) {
-                rn_gen.shuffle(
-                    oob_row_count,
-                    permutation_ptr,
-                    engine_arr[built_tree_count + tree_idx_in_block]);
+                rn_gen.shuffle(oob_row_count,
+                               permutation_ptr,
+                               engine_arr[built_tree_count + tree_idx_in_block]);
                 const Float oob_err_perm = compute_oob_error_perm(ctx,
                                                                   model_manager,
                                                                   data_host,
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
index 2cb18c72c1f..6c602aa6612 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
@@ -68,8 +68,7 @@ class daal_engine {
         switch (EngineType) {
             case engine_list::mt2203:
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
-            case engine_list::mcg59:
-                return daal::algorithms::engines::mcg59::Batch<>::create(seed);
+            case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
             case engine_list::mrg32k3a:
                 return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
             case engine_list::philox4x32x10:
@@ -112,7 +111,9 @@ class daal_rng {
                                                                      b);
     }
 
-    template <engine_list EngineType, typename T = Type, typename = std::enable_if_t<std::is_integral_v<T>>>
+    template <engine_list EngineType,
+              typename T = Type,
+              typename = std::enable_if_t<std::is_integral_v<T>>>
     void shuffle(Size count, Type* dst, daal_engine<EngineType> daal_engine) {
         Type idx[2];
         auto state = daal_engine.get_cpu_engine_state();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index c509db057cc..1fa9e36a679 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -92,21 +92,21 @@ void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
         F b,                                                      \
         const event_vector& deps);
 
-#define INSTANTIATE_FLOAT_(Size)                     \
-    INSTANTIATE_(float, Size, engine_list::mt2203)   \
-    INSTANTIATE_(float, Size, engine_list::mcg59)    \
-    INSTANTIATE_(float, Size, engine_list::mrg32k3a) \
-    INSTANTIATE_(float, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_(float, Size, engine_list::mt19937)  \
-    INSTANTIATE_(double, Size, engine_list::mt2203)  \
-    INSTANTIATE_(double, Size, engine_list::mcg59)   \
-    INSTANTIATE_(double, Size, engine_list::mrg32k3a)   \
-    INSTANTIATE_(double, Size, engine_list::philox4x32x10)   \
-    INSTANTIATE_(double, Size, engine_list::mt19937) \
-    INSTANTIATE_(int, Size, engine_list::mt2203)     \
-    INSTANTIATE_(int, Size, engine_list::mcg59)      \
-    INSTANTIATE_(int, Size, engine_list::mrg32k3a)      \
-    INSTANTIATE_(int, Size, engine_list::philox4x32x10)      \
+#define INSTANTIATE_FLOAT_(Size)                           \
+    INSTANTIATE_(float, Size, engine_list::mt2203)         \
+    INSTANTIATE_(float, Size, engine_list::mcg59)          \
+    INSTANTIATE_(float, Size, engine_list::mrg32k3a)       \
+    INSTANTIATE_(float, Size, engine_list::philox4x32x10)  \
+    INSTANTIATE_(float, Size, engine_list::mt19937)        \
+    INSTANTIATE_(double, Size, engine_list::mt2203)        \
+    INSTANTIATE_(double, Size, engine_list::mcg59)         \
+    INSTANTIATE_(double, Size, engine_list::mrg32k3a)      \
+    INSTANTIATE_(double, Size, engine_list::philox4x32x10) \
+    INSTANTIATE_(double, Size, engine_list::mt19937)       \
+    INSTANTIATE_(int, Size, engine_list::mt2203)           \
+    INSTANTIATE_(int, Size, engine_list::mcg59)            \
+    INSTANTIATE_(int, Size, engine_list::mrg32k3a)         \
+    INSTANTIATE_(int, Size, engine_list::philox4x32x10)    \
     INSTANTIATE_(int, Size, engine_list::mt19937)
 
 INSTANTIATE_FLOAT_(std::int64_t);
@@ -120,21 +120,21 @@ INSTANTIATE_FLOAT_(std::int32_t);
         F a,                                                      \
         F b);
 
-#define INSTANTIATE_FLOAT_CPU(Size)                     \
-    INSTANTIATE_CPU(float, Size, engine_list::mt2203)   \
-    INSTANTIATE_CPU(float, Size, engine_list::mcg59)    \
-    INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a)    \
-    INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10)    \
-    INSTANTIATE_CPU(float, Size, engine_list::mt19937)  \
-    INSTANTIATE_CPU(double, Size, engine_list::mt2203)  \
-    INSTANTIATE_CPU(double, Size, engine_list::mcg59)   \
-    INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a)   \
-    INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10)   \
-    INSTANTIATE_CPU(double, Size, engine_list::mt19937) \
-    INSTANTIATE_CPU(int, Size, engine_list::mt2203)     \
-    INSTANTIATE_CPU(int, Size, engine_list::mcg59)      \
-    INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a)      \
-    INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10)      \
+#define INSTANTIATE_FLOAT_CPU(Size)                           \
+    INSTANTIATE_CPU(float, Size, engine_list::mt2203)         \
+    INSTANTIATE_CPU(float, Size, engine_list::mcg59)          \
+    INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a)       \
+    INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10)  \
+    INSTANTIATE_CPU(float, Size, engine_list::mt19937)        \
+    INSTANTIATE_CPU(double, Size, engine_list::mt2203)        \
+    INSTANTIATE_CPU(double, Size, engine_list::mcg59)         \
+    INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a)      \
+    INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10) \
+    INSTANTIATE_CPU(double, Size, engine_list::mt19937)       \
+    INSTANTIATE_CPU(int, Size, engine_list::mt2203)           \
+    INSTANTIATE_CPU(int, Size, engine_list::mcg59)            \
+    INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a)         \
+    INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10)    \
     INSTANTIATE_CPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_FLOAT_CPU(std::int64_t);
@@ -151,21 +151,21 @@ INSTANTIATE_FLOAT_CPU(std::int32_t);
         F b,                                                                          \
         const event_vector& deps);
 
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                     \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)   \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)    \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)  \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)  \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)   \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                           \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)          \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)       \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10)  \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)        \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)        \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a)      \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)     \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)      \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937)       \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)           \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)            \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10)    \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937)
 
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
@@ -179,10 +179,10 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
         onedal_engine<EngineType>& engine_,                       \
         const event_vector& deps);
 
-#define INSTANTIATE_SHUFFLE_FLOAT(Size)                   \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203)   \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)    \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \
+#define INSTANTIATE_SHUFFLE_FLOAT(Size)                        \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203)        \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)         \
+    INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a)      \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::philox4x32x10) \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937)
 
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
index a68df41c541..8e62ca88c1b 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
@@ -101,8 +101,10 @@ class onedal_engine {
             case engine_list::mt2203:
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
             case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
-            case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
-            case engine_list::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
+            case engine_list::mrg32k3a:
+                return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
+            case engine_list::philox4x32x10:
+                return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
             case engine_list::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
@@ -111,8 +113,10 @@ class onedal_engine {
 
     onedal_engine_t initialize_onedal_engine(sycl::queue& queue, std::int64_t seed) {
         if constexpr (EngineType == engine_list::mt2203) {
-            return onedal_engine_t(queue, seed,
-                                   0); // Aligns CPU and GPU results for mt2203, impacts the performance.
+            return onedal_engine_t(
+                queue,
+                seed,
+                0); // Aligns CPU and GPU results for mt2203, impacts the performance.
         }
         else {
             return onedal_engine_t(queue, seed);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index d3303dc61f6..719fe429411 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -73,6 +73,16 @@ class rng_test : public te::policy_fixture {
         return rn_gen;
     }
 
+    auto get_daal_rng() const {
+        daal_rng<DataType> rn_gen;
+        return rn_gen;
+    }
+
+    auto get_daal_engine(std::int64_t seed) {
+        auto rng_engine = daal_engine<engine_qq>(seed);
+        return rng_engine;
+    }
+
     auto get_engine(std::int64_t seed) {
         auto rng_engine = onedal_engine<engine_qq>(this->get_queue(), seed);
         return rng_engine;
@@ -126,32 +136,32 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     this->check_results(arr_gpu, arr_host);
 }
 
-using rng_types_skip_ahead_support = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10));
+using rng_types_skip_ahead_support = COMBINE_TYPES((float),
+                                                   (mt19937, mcg59, mrg32k3a, philox4x32x10));
 
 //Just for perf tests
-// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) {
-//     SKIP_IF(this->get_policy().is_cpu());
-//     std::int64_t elem_count =
-//         GENERATE_COPY(6100000000, 1LL * 64 * 1000000);
-//     std::int64_t seed = GENERATE_COPY(777);
-
-
-//     auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count);
-//     auto arr_gpu_ptr_ = arr_gpu_.get_mutable_data();
-
-//     auto rn_gen_ = this->get_rng();
-//     auto rng_engine_1 = this->get_engine(seed);
-
-//     BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
-//         rn_gen_.uniform_gpu(this->get_queue(),
-//                                      elem_count,
-//                                      arr_gpu_ptr_,
-//                                      rng_engine_1,
-//                                      0,
-//                                      elem_count);
-//     };
+TEMPLATE_LIST_TEST_M(rng_test, "uniform without replacement cpu benchmark", "[rng]", rng_types_skip_ahead_support) {
+    SKIP_IF(this->get_policy().is_cpu());
+    std::int64_t elem_count = GENERATE_COPY(10000);
+    std::int64_t seed = GENERATE_COPY(777);
 
-// }
+    auto arr_host = this->allocate_array_host(elem_count); // receives the generated values
+    auto arr_host_ptr_ = arr_host.get_mutable_data();
+
+    auto arr_host_fake = this->allocate_array_host(1); // one-element array passed as the buffer argument
+    auto arr_host_ptr_fake = arr_host_fake.get_mutable_data();
+    auto rn_gen_ = this->get_daal_rng();
+    auto rng_engine_1 = this->get_daal_engine(seed);
+
+    BENCHMARK("Uniform without replacement CPU arr" + std::to_string(elem_count)) {
+        rn_gen_.uniform_without_replacement_cpu(elem_count,
+                                                arr_host_ptr_,
+                                                arr_host_ptr_fake,
+                                                rng_engine_1,
+                                                0,
+                                                elem_count);
+    };
+}
 
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());

From cc85e37f1adb45066551995232a374a2c9aed1f7 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 19 Nov 2024 03:22:40 -0800
Subject: [PATCH 09/18] refactoring

---
 .../rng/{rng_cpu.hpp => engine_cpu.hpp}       |  41 -----
 .../rng/{rng_gpu.hpp => engine_gpu.hpp}       | 106 +------------
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 132 +++++++++++++++-
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 142 +++++++-----------
 .../backend/primitives/rng/test/rng_dpc.cpp   |  11 +-
 5 files changed, 193 insertions(+), 239 deletions(-)
 rename cpp/oneapi/dal/backend/primitives/rng/{rng_cpu.hpp => engine_cpu.hpp} (65%)
 rename cpp/oneapi/dal/backend/primitives/rng/{rng_gpu.hpp => engine_gpu.hpp} (51%)

diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp
similarity index 65%
rename from cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
rename to cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp
index 6c602aa6612..e8286f83051 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp
@@ -83,45 +83,4 @@ class daal_engine {
     daal::algorithms::engines::internal::BatchBaseImpl* impl_;
 };
 
-template <typename Type, typename Size = std::int64_t>
-class daal_rng {
-public:
-    daal_rng() = default;
-    ~daal_rng() = default;
-
-    template <engine_list EngineType>
-    void uniform(Size count, Type* dst, daal_engine<EngineType> daal_engine, Type a, Type b) {
-        auto state = daal_engine.get_cpu_engine_state();
-        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
-    }
-
-    template <engine_list EngineType>
-    void uniform_without_replacement_cpu(Size count,
-                                         Type* dst,
-                                         Type* buffer,
-                                         daal_engine<EngineType> daal_engine,
-                                         Type a,
-                                         Type b) {
-        auto state = daal_engine.get_cpu_engine_state();
-        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
-                                                                     dst,
-                                                                     buffer,
-                                                                     state,
-                                                                     a,
-                                                                     b);
-    }
-
-    template <engine_list EngineType,
-              typename T = Type,
-              typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle(Size count, Type* dst, daal_engine<EngineType> daal_engine) {
-        Type idx[2];
-        auto state = daal_engine.get_cpu_engine_state();
-        for (Size i = 0; i < count; ++i) {
-            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
-            std::swap(dst[idx[0]], dst[idx[1]]);
-        }
-    }
-};
-
 } // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp
similarity index 51%
rename from cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
rename to cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp
index 8e62ca88c1b..242f71cea65 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp
@@ -95,6 +95,10 @@ class onedal_engine {
         }
     }
 
+    sycl::queue& get_queue() {
+        return q;
+    }
+
 private:
     daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) {
         switch (EngineType) {
@@ -128,107 +132,5 @@ class onedal_engine {
     daal::algorithms::engines::internal::BatchBaseImpl* impl_;
 };
 
-template <typename Type, typename Size = std::int64_t>
-class oneapi_rng {
-public:
-    oneapi_rng() = default;
-    ~oneapi_rng() = default;
-
-    template <engine_list EngineType>
-    void uniform(sycl::queue& queue,
-                 Size count,
-                 Type* dst,
-                 onedal_engine<EngineType>& engine_,
-                 Type a,
-                 Type b,
-                 bool distr_mode = false,
-                 const event_vector& deps = {});
-
-    template <engine_list EngineType>
-    void uniform_gpu(sycl::queue& queue,
-                     Size count,
-                     Type* dst,
-                     onedal_engine<EngineType>& engine_,
-                     Type a,
-                     Type b,
-                     const event_vector& deps = {});
-
-    template <engine_list EngineType>
-    void uniform_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_, Type a, Type b);
-
-    template <engine_list EngineType>
-    void uniform_without_replacement(sycl::queue& queue,
-                                     Size count,
-                                     Type* dst,
-                                     onedal_engine<EngineType>& engine_,
-                                     Type a,
-                                     Type b,
-                                     const event_vector& deps = {}) {}
-
-    template <engine_list EngineType>
-    void uniform_without_replacement_gpu(sycl::queue& queue,
-                                         Size count,
-                                         Type* dst,
-                                         Type* buff,
-                                         onedal_engine<EngineType>& engine_,
-                                         Type a,
-                                         Type b,
-                                         const event_vector& deps = {});
-
-    template <engine_list EngineType>
-    void uniform_without_replacement_cpu(Size count,
-                                         Type* dst,
-                                         Type* buffer,
-                                         onedal_engine<EngineType>& engine_,
-                                         Type a,
-                                         Type b) {
-        void* state = engine_.get_cpu_engine_state();
-        engine_.skip_ahead_gpu(count);
-        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
-                                                                     dst,
-                                                                     buffer,
-                                                                     state,
-                                                                     a,
-                                                                     b);
-    }
-
-    template <engine_list EngineType,
-              typename T = Type,
-              typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle(Size count, Type* dst, onedal_engine<EngineType>& engine_) {
-        Type idx[2];
-
-        void* state = engine_.get_cpu_engine_state();
-        engine_.skip_ahead_gpu(count);
-
-        for (Size i = 0; i < count; ++i) {
-            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
-            std::swap(dst[idx[0]], dst[idx[1]]);
-        }
-    }
-
-    template <engine_list EngineType>
-    void shuffle_gpu(sycl::queue& queue,
-                     Size count,
-                     Type* dst,
-                     onedal_engine<EngineType>& engine_,
-                     const event_vector& deps);
-
-    template <engine_list EngineType,
-              typename T = Type,
-              typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_) {
-        Type idx[2];
-
-        void* state = engine_.get_cpu_engine_state();
-        engine_.skip_ahead_gpu(count);
-
-        for (Size i = 0; i < count; ++i) {
-            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
-            std::swap(dst[idx[0]], dst[idx[1]]);
-        }
-    }
-};
-
 #endif
 } // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
index a89ca3d4505..b93729dbdf7 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -16,10 +16,138 @@
 
 #pragma once
 
-#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp"
+#include "oneapi/dal/backend/primitives/rng/engine_cpu.hpp"
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-#include "oneapi/dal/backend/primitives/rng/rng_gpu.hpp"
+#include "oneapi/dal/backend/primitives/rng/engine_gpu.hpp"
 
 #endif
+
+namespace oneapi::dal::backend::primitives {
+template <typename Type, typename Size = std::int64_t>
+class rng {
+public:
+    rng() = default;
+    ~rng() = default;
+
+    template <engine_list EngineType>
+    void uniform_cpu(Size count, Type* dst, daal_engine<EngineType> daal_engine, Type a, Type b) {
+        auto state = daal_engine.get_cpu_engine_state();
+        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+    }
+
+#ifdef ONEDAL_DATA_PARALLEL
+    template <engine_list EngineType>
+    void uniform_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_, Type a, Type b) {
+        if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+            sycl::usm::alloc::device) {
+            throw domain_error(dal::detail::error_messages::unsupported_data_type());
+        }
+        auto state = engine_.get_cpu_engine_state();
+        engine_.skip_ahead_gpu(count);
+        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+    }
+#endif
+
+    template <engine_list EngineType>
+    void uniform_without_replacement_cpu(Size count,
+                                         Type* dst,
+                                         Type* buffer,
+                                         daal_engine<EngineType> daal_engine,
+                                         Type a,
+                                         Type b) {
+        auto state = daal_engine.get_cpu_engine_state();
+        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
+                                                                     dst,
+                                                                     buffer,
+                                                                     state,
+                                                                     a,
+                                                                     b);
+    }
+#ifdef ONEDAL_DATA_PARALLEL
+    template <engine_list EngineType>
+    void uniform_without_replacement_cpu(Size count,
+                                         Type* dst,
+                                         Type* buffer,
+                                         onedal_engine<EngineType>& engine_,
+                                         Type a,
+                                         Type b) {
+        if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+            sycl::usm::alloc::device) {
+            throw domain_error(dal::detail::error_messages::unsupported_data_type());
+        }
+        void* state = engine_.get_cpu_engine_state();
+        engine_.skip_ahead_gpu(count);
+        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
+                                                                     dst,
+                                                                     buffer,
+                                                                     state,
+                                                                     a,
+                                                                     b);
+    }
+#endif
+
+    template <engine_list EngineType,
+              typename T = Type,
+              typename = std::enable_if_t<std::is_integral_v<T>>>
+    void shuffle_cpu(Size count, Type* dst, daal_engine<EngineType> daal_engine) {
+        Type idx[2];
+        auto state = daal_engine.get_cpu_engine_state();
+        for (Size i = 0; i < count; ++i) {
+            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
+            std::swap(dst[idx[0]], dst[idx[1]]);
+        }
+    }
+
+#ifdef ONEDAL_DATA_PARALLEL
+    template <engine_list EngineType,
+              typename T = Type,
+              typename = std::enable_if_t<std::is_integral_v<T>>>
+    void shuffle_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_) {
+        Type idx[2];
+        if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+            sycl::usm::alloc::device) {
+            throw domain_error(dal::detail::error_messages::unsupported_data_type());
+        }
+        void* state = engine_.get_cpu_engine_state();
+        engine_.skip_ahead_gpu(count);
+
+        for (Size i = 0; i < count; ++i) {
+            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
+            std::swap(dst[idx[0]], dst[idx[1]]);
+        }
+    }
+#endif
+
+#ifdef ONEDAL_DATA_PARALLEL
+    template <engine_list EngineType>
+    void uniform_gpu(sycl::queue& queue,
+                     Size count,
+                     Type* dst,
+                     onedal_engine<EngineType>& engine_,
+                     Type a,
+                     Type b,
+                     const event_vector& deps = {});
+
+    template <engine_list EngineType>
+    void uniform_without_replacement_gpu(sycl::queue& queue,
+                                         Size count,
+                                         Type* dst,
+                                         Type* buffer,
+                                         onedal_engine<EngineType>& engine_,
+                                         Type a,
+                                         Type b,
+                                         const event_vector& deps = {});
+
+    template <engine_list EngineType>
+    void shuffle_gpu(sycl::queue& queue,
+                     Size count,
+                     Type* dst,
+                     onedal_engine<EngineType>& engine_,
+                     const event_vector& deps = {});
+};
+
+#endif
+
+}; // namespace oneapi::dal::backend::primitives
\ No newline at end of file
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 1fa9e36a679..2e3a0c962c8 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -24,55 +24,55 @@ namespace bk = oneapi::dal::backend;
 
 template <typename Type, typename Size>
 template <engine_list EngineType>
-void oneapi_rng<Type, Size>::uniform_gpu(sycl::queue& queue,
-                                         Size count,
-                                         Type* dst,
-                                         onedal_engine<EngineType>& engine_,
-                                         Type a,
-                                         Type b,
-                                         const event_vector& deps) {
+void rng<Type, Size>::uniform_gpu(sycl::queue& queue,
+                                  Size count,
+                                  Type* dst,
+                                  onedal_engine<EngineType>& engine_,
+                                  Type a,
+                                  Type b,
+                                  const event_vector& deps) {
+    if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) {
+        throw domain_error(dal::detail::error_messages::unsupported_data_type());
+    }
     oneapi::mkl::rng::uniform<Type> distr(a, b);
     auto event = oneapi::mkl::rng::generate(distr, engine_.get_gpu_engine(), count, dst, { deps });
     event.wait_and_throw();
     engine_.skip_ahead_cpu(count);
 }
 
+//Currently only CPU impl
 template <typename Type, typename Size>
 template <engine_list EngineType>
-void oneapi_rng<Type, Size>::uniform_cpu(Size count,
-                                         Type* dst,
-                                         onedal_engine<EngineType>& engine_,
-                                         Type a,
-                                         Type b) {
-    void* state = engine_.get_cpu_engine_state();
-    engine_.skip_ahead_gpu(count);
-    uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
-}
-
-template <typename Type, typename Size>
-template <engine_list EngineType>
-void oneapi_rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
-                                                             Size count,
-                                                             Type* dst,
-                                                             Type* buffer,
-                                                             onedal_engine<EngineType>& engine_,
-                                                             Type a,
-                                                             Type b,
-                                                             const event_vector& deps) {
+void rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
+                                                      Size count,
+                                                      Type* dst,
+                                                      Type* buffer,
+                                                      onedal_engine<EngineType>& engine_,
+                                                      Type a,
+                                                      Type b,
+                                                      const event_vector& deps) {
+    if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+        sycl::usm::alloc::device) {
+        throw domain_error(dal::detail::error_messages::unsupported_data_type());
+    }
     void* state = engine_.get_cpu_engine_state();
     engine_.skip_ahead_gpu(count);
     uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
 }
 
+//Currently only CPU impl
 template <typename Type, typename Size>
 template <engine_list EngineType>
-void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
-                                         Size count,
-                                         Type* dst,
-                                         onedal_engine<EngineType>& engine_,
-                                         const event_vector& deps) {
+void rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
+                                  Size count,
+                                  Type* dst,
+                                  onedal_engine<EngineType>& engine_,
+                                  const event_vector& deps) {
     Type idx[2];
-
+    if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+        sycl::usm::alloc::device) {
+        throw domain_error(dal::detail::error_messages::unsupported_data_type());
+    }
     void* state = engine_.get_cpu_engine_state();
     engine_.skip_ahead_gpu(count);
 
@@ -82,15 +82,14 @@ void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
     }
 }
 
-#define INSTANTIATE_(F, Size, EngineType)                         \
-    template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_gpu( \
-        sycl::queue& queue,                                       \
-        Size count_,                                              \
-        F* dst,                                                   \
-        onedal_engine<EngineType>& engine_,                       \
-        F a,                                                      \
-        F b,                                                      \
-        const event_vector& deps);
+#define INSTANTIATE_(F, Size, EngineType)                                                     \
+    template ONEDAL_EXPORT void rng<F, Size>::uniform_gpu(sycl::queue& queue,                 \
+                                                          Size count_,                        \
+                                                          F* dst,                             \
+                                                          onedal_engine<EngineType>& engine_, \
+                                                          F a,                                \
+                                                          F b,                                \
+                                                          const event_vector& deps);
 
 #define INSTANTIATE_FLOAT_(Size)                           \
     INSTANTIATE_(float, Size, engine_list::mt2203)         \
@@ -112,43 +111,15 @@ void oneapi_rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
 INSTANTIATE_FLOAT_(std::int64_t);
 INSTANTIATE_FLOAT_(std::int32_t);
 
-#define INSTANTIATE_CPU(F, Size, EngineType)                      \
-    template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_cpu( \
-        Size count_,                                              \
-        F* dst,                                                   \
-        onedal_engine<EngineType>& engine_,                       \
-        F a,                                                      \
-        F b);
-
-#define INSTANTIATE_FLOAT_CPU(Size)                           \
-    INSTANTIATE_CPU(float, Size, engine_list::mt2203)         \
-    INSTANTIATE_CPU(float, Size, engine_list::mcg59)          \
-    INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a)       \
-    INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10)  \
-    INSTANTIATE_CPU(float, Size, engine_list::mt19937)        \
-    INSTANTIATE_CPU(double, Size, engine_list::mt2203)        \
-    INSTANTIATE_CPU(double, Size, engine_list::mcg59)         \
-    INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a)      \
-    INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_CPU(double, Size, engine_list::mt19937)       \
-    INSTANTIATE_CPU(int, Size, engine_list::mt2203)           \
-    INSTANTIATE_CPU(int, Size, engine_list::mcg59)            \
-    INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a)         \
-    INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10)    \
-    INSTANTIATE_CPU(int, Size, engine_list::mt19937)
-
-INSTANTIATE_FLOAT_CPU(std::int64_t);
-INSTANTIATE_FLOAT_CPU(std::int32_t);
-
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType)              \
-    template ONEDAL_EXPORT void oneapi_rng<F, Size>::uniform_without_replacement_gpu( \
-        sycl::queue& queue,                                                           \
-        Size count_,                                                                  \
-        F* dst,                                                                       \
-        F* buff,                                                                      \
-        onedal_engine<EngineType>& engine_,                                           \
-        F a,                                                                          \
-        F b,                                                                          \
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType)       \
+    template ONEDAL_EXPORT void rng<F, Size>::uniform_without_replacement_gpu( \
+        sycl::queue& queue,                                                    \
+        Size count_,                                                           \
+        F* dst,                                                                \
+        F* buff,                                                               \
+        onedal_engine<EngineType>& engine_,                                    \
+        F a,                                                                   \
+        F b,                                                                   \
         const event_vector& deps);
 
 #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                           \
@@ -171,13 +142,12 @@ INSTANTIATE_FLOAT_CPU(std::int32_t);
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
 
-#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                  \
-    template ONEDAL_EXPORT void oneapi_rng<F, Size>::shuffle_gpu( \
-        sycl::queue& queue,                                       \
-        Size count_,                                              \
-        F* dst,                                                   \
-        onedal_engine<EngineType>& engine_,                       \
-        const event_vector& deps);
+#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                                              \
+    template ONEDAL_EXPORT void rng<F, Size>::shuffle_gpu(sycl::queue& queue,                 \
+                                                          Size count_,                        \
+                                                          F* dst,                             \
+                                                          onedal_engine<EngineType>& engine_, \
+                                                          const event_vector& deps);
 
 #define INSTANTIATE_SHUFFLE_FLOAT(Size)                        \
     INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203)        \
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 719fe429411..411894bdad4 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -69,12 +69,7 @@ class rng_test : public te::policy_fixture {
     static constexpr auto engine_qq = engine_v<EngineType>;
 
     auto get_rng() const {
-        oneapi_rng<DataType> rn_gen;
-        return rn_gen;
-    }
-
-    auto get_daal_rng() const {
-        daal_rng<DataType> rn_gen;
+        rng<DataType> rn_gen;
         return rn_gen;
     }
 
@@ -150,8 +145,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_s
 
     auto arr_host_fake = this->allocate_array_host(1);
     auto arr_host_ptr_fake = arr_host_fake.get_mutable_data();
-    auto rn_gen_ = this->get_daal_rng();
-    auto rng_engine_1 = this->get_daal_engine(seed);
+    auto rn_gen_ = this->get_rng();
+    auto rng_engine_1 = this->get_engine(seed);
 
     BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
         rn_gen_.uniform_without_replacement_cpu(elem_count,

From 852669fa4cc58828c4d703ffa1c8eda1a4769551 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 19 Nov 2024 05:18:57 -0800
Subject: [PATCH 10/18] fixes

---
 .../vertex_partitioning_default_kernel.hpp    |  4 +--
 .../gpu/train_kernel_hist_impl_dpc.cpp        | 34 +++++++++----------
 .../algo/louvain/backend/cpu/louvain_data.hpp |  2 +-
 .../vertex_partitioning_default_kernel.hpp    |  2 +-
 .../objective_function/test/fixture.hpp       |  4 +--
 .../objective_function/test/spmd_fixture.hpp  |  2 +-
 .../optimizers/test/cg_solver_dpc.cpp         |  4 +--
 .../primitives/optimizers/test/fixture.hpp    |  6 ++--
 .../optimizers/test/newton_cg_dpc.cpp         | 10 +++---
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp |  3 +-
 10 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
index 55087df26af..c33575c472d 100644
--- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -91,8 +91,8 @@ std::int32_t most_frequent_element(const std::atomic<std::int32_t> *components,
     std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count);
 
     dal::backend::primitives::daal_engine eng;
-    dal::backend::primitives::daal_rng<std::int32_t> rn_gen;
-    rn_gen.uniform(samples_count, rnd_vertex_ids, eng, 0, vertex_count);
+    dal::backend::primitives::rng<std::int32_t> rn_gen;
+    rn_gen.uniform_cpu(samples_count, rnd_vertex_ids, eng, 0, vertex_count);
 
     std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count);
 
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index c0ee89c4d64..193f731ffd4 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -396,14 +396,14 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
         Index* const node_list_ptr = node_list_host.get_mutable_data();
 
         for (Index node_idx = 0; node_idx < node_count; ++node_idx) {
-            pr::daal_rng<Index> rn_gen;
+            pr::rng<Index> rn_gen;
             Index* gen_row_idx_global_ptr =
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
-            rn_gen.uniform(ctx.selected_row_total_count_,
-                           gen_row_idx_global_ptr,
-                           rng_engine_list[engine_offset + node_idx],
-                           0,
-                           ctx.row_total_count_);
+            rn_gen.uniform_cpu(ctx.selected_row_total_count_,
+                               gen_row_idx_global_ptr,
+                               rng_engine_list[engine_offset + node_idx],
+                               0,
+                               ctx.row_total_count_);
 
             if (ctx.distr_mode_) {
                 Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_;
@@ -483,7 +483,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
 
     auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_);
 
-    pr::daal_rng<Index> rn_gen;
+    pr::rng<Index> rn_gen;
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
     if (ctx.selected_ftr_count_ != ctx.column_count_) {
         for (Index node = 0; node < node_count; ++node) {
@@ -524,7 +524,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
 
     auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_);
 
-    pr::daal_rng<Float> rn_gen;
+    pr::rng<Float> rn_gen;
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
 
     // Create arrays for random generated bins
@@ -537,11 +537,11 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
 
     // Generate random bins for selected features
     for (Index node = 0; node < node_count; ++node) {
-        rn_gen.uniform(ctx.selected_ftr_count_,
-                       random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                       rng_engine_list[tree_map_ptr[node]],
-                       0.0f,
-                       1.0f);
+        rn_gen.uniform_cpu(ctx.selected_ftr_count_,
+                           random_bins_host_ptr + node * ctx.selected_ftr_count_,
+                           rng_engine_list[tree_map_ptr[node]],
+                           0.0f,
+                           1.0f);
     }
     auto event_rnd_generate =
         random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count());
@@ -1660,12 +1660,12 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
 
             const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1);
 
-            pr::daal_rng<Index> rn_gen;
+            pr::rng<Index> rn_gen;
 
             for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) {
-                rn_gen.shuffle(oob_row_count,
-                               permutation_ptr,
-                               engine_arr[built_tree_count + tree_idx_in_block]);
+                rn_gen.shuffle_cpu(oob_row_count,
+                                   permutation_ptr,
+                                   engine_arr[built_tree_count + tree_idx_in_block]);
                 const Float oob_err_perm = compute_oob_error_perm(ctx,
                                                                   model_manager,
                                                                   data_host,
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
index d2751b3840b..98d4bf60047 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
@@ -124,7 +124,7 @@ struct louvain_data {
     value_type m;
 
     daal_engine<engine_list::mt2203> eng;
-    daal_rng<std::int32_t> rn_gen;
+    rng<std::int32_t> rn_gen;
 
     const std::int64_t vertex_count;
     const std::int64_t edge_count;
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
index ff78f06f833..e758d769a01 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology<IndexType>& t,
         ld.random_order[index] = index;
     }
     // random shuffle
-    ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
+    ld.rn_gen.uniform_cpu(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
     for (std::int64_t index = 0; index < t._vertex_count; ++index) {
         std::swap(ld.random_order[index], ld.random_order[ld.index[index]]);
     }
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index d673470b042..c0bd5049153 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -572,13 +572,13 @@ class logloss_test : public te::float_algo_fixture<std::tuple_element_t<0, Param
         const std::int64_t p = hessian_host.get_dimension(0) - 1;
         const std::int64_t dim = fit_intercept ? p + 1 : p;
 
-        primitives::daal_rng<float_t> rn_gen;
+        primitives::rng<float_t> rn_gen;
         auto vec_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host);
 
         for (std::int32_t ij = 0; ij < num_checks; ++ij) {
             primitives::daal_engine eng(2007 + dim * num_checks + ij);
-            rn_gen.uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
+            rn_gen.uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
             auto vec_gpu = vec_host.to_device(this->get_queue());
             auto out_vector =
                 ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
index e2a611c2c98..f90aa3d8a87 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
@@ -100,7 +100,7 @@ class logloss_spmd_test : public logloss_test<Param> {
         std::int64_t num_checks = 5;
 
         std::vector<ndarray<float_t, 1>> vecs_host(num_checks), vecs_gpu(num_checks);
-        daal_rng<float_t> rn_gen;
+        rng<float_t> rn_gen;
         for (std::int64_t ij = 0; ij < num_checks; ++ij) {
             daal_engine eng(2007 + dim * num_checks + ij);
             vecs_host[ij] =
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
index 36e20f03c11..56d7f8c5c23 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
@@ -43,9 +43,9 @@ class cg_solver_test : public te::float_algo_fixture<Param> {
         x_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         b_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
-        primitives::daal_rng<float_t> rn_gen;
+        primitives::rng<float_t> rn_gen;
         primitives::daal_engine eng(4014 + n_);
-        rn_gen.uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
+        rn_gen.uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host_);
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
index 777c0ee68e2..1c82e2c8ac9 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
@@ -133,11 +133,11 @@ void create_stable_matrix(sycl::queue& queue,
     ONEDAL_ASSERT(A.get_dimension(1) == n);
     auto J = ndarray<Float, 2>::empty(queue, { n, n }, sycl::usm::alloc::host);
     auto eigen_values = ndarray<Float, 1>::empty(queue, { n }, sycl::usm::alloc::host);
-    primitives::daal_rng<Float> rn_gen;
+    primitives::rng<Float> rn_gen;
     primitives::daal_engine eng(2007 + n);
 
-    rn_gen.uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
-    rn_gen.uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
+    rn_gen.uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
+    rn_gen.uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
 
     // orthogonalize matrix J
     gram_schmidt(J);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index b24a59386c7..1358c1b8826 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<std::int32_t, 1>::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host);
         auto params_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host);
-        primitives::daal_rng<float_t> rn_gen;
+        primitives::rng<float_t> rn_gen;
         primitives::daal_engine eng(2007 + n);
-        rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
-        rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
+        rn_gen.uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
+        rn_gen.uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
         for (std::int64_t i = 0; i < n_; ++i) {
             float_t val = 0;
             for (std::int64_t j = 0; j < p_; ++j) {
@@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto b_host = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         primitives::rng<float_t> rn_gen;
         primitives::engine eng(4014 + n_);
-        rn_gen.uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
+        rn_gen.uniform_cpu(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0));
 
@@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto buffer = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         for (std::int32_t test_num = 0; test_num < 5; ++test_num) {
-            rn_gen.uniform(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
+            rn_gen.uniform_cpu(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
             auto x_gpu = x_host.to_device(this->get_queue());
             auto compute_event_vec = func_->update_x(x_gpu, true, {});
             wait_or_pass(compute_event_vec).wait_and_throw();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
index b93729dbdf7..462ee2a3ada 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -146,8 +146,7 @@ class rng {
                      Type* dst,
                      onedal_engine<EngineType>& engine_,
                      const event_vector& deps = {});
-};
-
 #endif
+};
 
 }; // namespace oneapi::dal::backend::primitives
\ No newline at end of file

From 06f188580d3c89c789b66054f3464ecf2ac86194 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Mon, 16 Dec 2024 03:58:30 -0800
Subject: [PATCH 11/18] comments fixes

---
 .../algorithms/engines/mrg32k3a/mrg32k3a.h    |   1 -
 .../engines/mrg32k3a/mrg32k3a_types.h         |   1 -
 .../engines/philox4x32x10/philox4x32x10.h     |   1 -
 .../philox4x32x10/philox4x32x10_types.h       |   1 -
 .../algorithms/engines/mrg32k3a/mrg32k3a.cpp  |   1 -
 .../mrg32k3a/mrg32k3a_batch_container.h       |   1 -
 .../engines/mrg32k3a/mrg32k3a_batch_impl.h    |   1 -
 .../mrg32k3a_dense_default_batch_fpt_cpu.cpp  |   1 -
 ...k3a_dense_default_batch_fpt_dispatcher.cpp |   1 -
 .../engines/mrg32k3a/mrg32k3a_impl.i          |   5 +-
 .../engines/mrg32k3a/mrg32k3a_kernel.h        |   5 +-
 .../engines/philox4x32x10/philox4x32x10.cpp   |   1 -
 .../philox4x32x10_batch_container.h           |   5 +-
 .../philox4x32x10/philox4x32x10_batch_impl.h  |   1 -
 ...lox4x32x10_dense_default_batch_fpt_cpu.cpp |   1 -
 ...x10_dense_default_batch_fpt_dispatcher.cpp |   1 -
 .../philox4x32x10/philox4x32x10_impl.i        |   5 +-
 .../philox4x32x10/philox4x32x10_kernel.h      |   5 +-
 cpp/daal/src/externals/service_rng.h          |  34 ++---
 .../vertex_partitioning_default_kernel.hpp    |   2 +-
 .../backend/gpu/train_kernel_hist_impl.hpp    |  16 +--
 .../gpu/train_kernel_hist_impl_dpc.cpp        |  16 +--
 .../algo/louvain/backend/cpu/louvain_data.hpp |   2 +-
 .../objective_function/test/fixture.hpp       |   2 +-
 .../objective_function/test/spmd_fixture.hpp  |   2 +-
 .../optimizers/test/cg_solver_dpc.cpp         |   2 +-
 .../primitives/optimizers/test/fixture.hpp    |   2 +-
 .../optimizers/test/newton_cg_dpc.cpp         |   2 +-
 .../rng/{engine_gpu.hpp => dpc_engine.hpp}    |  68 +++++-----
 .../rng/{engine_cpu.hpp => host_engine.hpp}   |  41 +++---
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp |  52 ++++----
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 118 +++++++++---------
 .../primitives/rng/rng_engine_collection.hpp  |  26 ++--
 .../dal/backend/primitives/rng/rng_types.hpp  |   2 +-
 .../backend/primitives/rng/test/rng_dpc.cpp   |  66 +++++-----
 .../daal/algorithms/engines/mrg32k3a.rst      |   3 +-
 36 files changed, 236 insertions(+), 258 deletions(-)
 rename cpp/oneapi/dal/backend/primitives/rng/{engine_gpu.hpp => dpc_engine.hpp} (62%)
 rename cpp/oneapi/dal/backend/primitives/rng/{engine_cpu.hpp => host_engine.hpp} (70%)

diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
index b794813a227..518d26e01f1 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
@@ -1,6 +1,5 @@
 /* file: mrg32k3a.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
index a6b9d699f77..8d697dfd72a 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_types.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
index ec82723f1f8..e57798be50a 100644
--- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
@@ -1,6 +1,5 @@
 /* file: philox4x32x10.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
index 74d2e884670..778b81f4ec9 100644
--- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_types.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
index fe015c85428..c550d81dec6 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp
@@ -1,6 +1,5 @@
 /* file: mrg32k3a.cpp */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
index 31126c4300f..ce83f554026 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_batch_container.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
index 0ff55f39b62..469ec92a0ab 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_batch_impl.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
index 1d3820053bd..529c4af2635 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
index 1b3f3c618e9..fd78108df73 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
index 06d670f1f7a..f8f12b2deea 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_impl.i */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,8 +21,8 @@
 //--
 */
 
-#ifndef __mrg32k3a_IMPL_I__
-#define __mrg32k3a_IMPL_I__
+#ifndef __MRG32K3A_IMPL_I__
+#define __MRG32K3A_IMPL_I__
 
 namespace daal
 {
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
index 86b8d929aae..80c9fbe44d9 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h
@@ -1,6 +1,5 @@
 /* file: mrg32k3a_kernel.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,8 +19,8 @@
 //  Declaration of a template function for calculating values using the MRG32k3a generator.
 //--
 
-#ifndef __mrg32k3a_KERNEL_H__
-#define __mrg32k3a_KERNEL_H__
+#ifndef __MRG32K3A_KERNEL_H__
+#define __MRG32K3A_KERNEL_H__
 
 #include "algorithms/engines/mrg32k3a/mrg32k3a.h"
 #include "src/algorithms/kernel.h"
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
index c103a4ae068..47fb7dae70f 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp
@@ -1,6 +1,5 @@
 /* file: philox4x32x10.cpp */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
index 7a721c4f1a8..9cb747e95a8 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_batch_container.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,8 +21,8 @@
 //--
 */
 
-#ifndef __philox4x32x10_BATCH_CONTAINER_H__
-#define __philox4x32x10_BATCH_CONTAINER_H__
+#ifndef __PHILOX4X32X10_BATCH_CONTAINER_H__
+#define __PHILOX4X32X10_BATCH_CONTAINER_H__
 
 #include "algorithms/engines/philox4x32x10/philox4x32x10.h"
 #include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h"
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
index f6a9f35e268..58e28eb47bf 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_batch_impl.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
index e1ed7b4d896..946517c1d9c 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
index 1f79b94c762..1640fc4ec12 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
index 9e2dc9f6b99..5aa5addc22b 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_impl.i */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,8 +21,8 @@
 //--
 */
 
-#ifndef __philox4x32x10_IMPL_I__
-#define __philox4x32x10_IMPL_I__
+#ifndef __PHILOX4X32X10_IMPL_I__
+#define __PHILOX4X32X10_IMPL_I__
 
 namespace daal
 {
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
index 47333a6c78f..5870d781abd 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h
@@ -1,6 +1,5 @@
 /* file: philox4x32x10_kernel.h */
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
 * Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,8 +19,8 @@
 //  Declaration of a template function for generating values using the Philox4x32-10 engine.
 //--
 
-#ifndef __philox4x32x10_KERNEL_H__
-#define __philox4x32x10_KERNEL_H__
+#ifndef __PHILOX4X32X10_KERNEL_H__
+#define __PHILOX4X32X10_KERNEL_H__
 
 #include "algorithms/engines/philox4x32x10/philox4x32x10.h"
 #include "src/algorithms/kernel.h"
diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h
index ba95f8f7324..d03c0f39abf 100644
--- a/cpp/daal/src/externals/service_rng.h
+++ b/cpp/daal/src/externals/service_rng.h
@@ -115,7 +115,7 @@ class RNGs
     int uniformWithoutReplacement(const SizeType n, DstType * r, void * state, const Type a, const Type b,
                                   const int method = __DAAL_RNG_METHOD_UNIFORM_STD)
     {
-        Type * buffer = (Type *)daal_malloc(sizeof(Type) * 1);
+        Type * buffer = (Type *)daal_malloc(sizeof(Type) * n);
         int errorcode = uniformWithoutReplacement(n, r, buffer, state, a, b, method);
         daal_free(buffer);
         return errorcode;
@@ -125,28 +125,20 @@ class RNGs
     int uniformWithoutReplacement(const SizeType n, DstType * r, Type * buffer, void * state, const Type a, const Type b,
                                   const int method = __DAAL_RNG_METHOD_UNIFORM_STD)
     {
-        int errorcode          = 0;
-        SizeType sequence_size = abs(b - a);
-        if (sequence_size < n)
-        {
-            return -1;
-        }
-        Type * buffer_ = (Type *)daal_malloc(sizeof(Type) * sequence_size);
-        for (SizeType i = 0; i < sequence_size; i++)
-        {
-            buffer_[i] = i;
-        }
-        Type swapIdx;
-        for (SizeType i = 0; i < n; i++)
-        {
-            errorcode = uniform(1, &swapIdx, state, i, sequence_size, method);
-            int index = int(swapIdx);
-
-            std::swap(buffer_[i], buffer_[index]);
-        }
+        int errorcode = 0;
         for (SizeType i = 0; i < n; i++)
         {
-            r[i] = buffer_[i];
+            errorcode = uniform(1, buffer + i, state, a + i, b, method);
+            int value = buffer[i];
+
+            for (SizeType j = i; j > 0; j--)
+            {
+                if (value == buffer[j - 1])
+                {
+                    value = (DstType)(j - 1 + a);
+                }
+            }
+            r[i] = value;
         }
         return errorcode;
     }
diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
index c33575c472d..bdda9048082 100644
--- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -90,7 +90,7 @@ std::int32_t most_frequent_element(const std::atomic<std::int32_t> *components,
                                    const std::int64_t &samples_count = 1024) {
     std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count);
 
-    dal::backend::primitives::daal_engine eng;
+    dal::backend::primitives::host_engine eng;
     dal::backend::primitives::rng<std::int32_t> rn_gen;
     rn_gen.uniform_cpu(samples_count, rnd_vertex_ids, eng, 0, vertex_count);
 
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
index 3106eb537d7..ac04f73d89f 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
@@ -50,8 +50,8 @@ class train_kernel_hist_impl {
     using model_manager_t = train_model_manager<Float, Index, Task>;
     using train_context_t = train_context<Float, Index, Task>;
     using imp_data_t = impurity_data<Float, Index, Task>;
-    using rng_engine_t = pr::daal_engine<pr::engine_list::mt2203>;
-    using rng_engine_list_t = std::vector<rng_engine_t>;
+    using rng_engine_t = pr::host_engine<pr::engine_method::mt2203>;
+    using rng_engine_method_t = std::vector<rng_engine_t>;
     using msg = dal::detail::error_messages;
     using comm_t = bk::communicator<spmd::device_memory_access::usm>;
     using node_t = node<Index>;
@@ -79,7 +79,7 @@ class train_kernel_hist_impl {
                                           Index class_count) const;
 
     sycl::event gen_initial_tree_order(train_context_t& ctx,
-                                       rng_engine_list_t& rng_engine_list,
+                                       rng_engine_method_t& rng_engine_method,
                                        pr::ndarray<Index, 1>& node_list,
                                        pr::ndarray<Index, 1>& tree_order_level,
                                        Index engine_offset,
@@ -115,12 +115,12 @@ class train_kernel_hist_impl {
     /// @param[in] ctx              a training context structure for a GPU backend
     /// @param[in] node_count       number of nodes on the current level
     /// @param[in] node_vs_tree_map an initial tree order
-    /// @param[in] rng_engine_list  a list of random generator engines
+    /// @param[in] rng_engine_method  a list of random generator engines
     std::tuple<pr::ndarray<Index, 1>, sycl::event> gen_feature_list(
         const train_context_t& ctx,
         Index node_count,
         const pr::ndarray<Index, 1>& node_vs_tree_map,
-        rng_engine_list_t& rng_engine_list);
+        rng_engine_method_t& rng_engine_method);
 
     /// Generates random thresholds for each node and for each selected feature for node.
     /// Thresholds are used for a random splitter kernel to split each node.
@@ -129,12 +129,12 @@ class train_kernel_hist_impl {
     /// @param[in] ctx              a training context structure for a GPU backend
     /// @param[in] node_count       number of nodes on the current level
     /// @param[in] node_vs_tree_map an initial tree order
-    /// @param[in] rng_engine_list  a list of random generator engines
+    /// @param[in] rng_engine_method  a list of random generator engines
     std::tuple<pr::ndarray<Float, 1>, sycl::event> gen_random_thresholds(
         const train_context_t& ctx,
         Index node_count,
         const pr::ndarray<Index, 1>& node_vs_tree_map,
-        rng_engine_list_t& rng_engine_list);
+        rng_engine_method_t& rng_engine_method);
 
     /// Computes initial impurity for each node.
     ///
@@ -575,7 +575,7 @@ class train_kernel_hist_impl {
                                 pr::ndarray<hist_type_t, 1>& oob_per_obs_list,
                                 pr::ndarray<Float, 1>& var_imp,
                                 pr::ndarray<Float, 1>& var_imp_variance,
-                                const rng_engine_list_t& rng_engine_arr,
+                                const rng_engine_method_t& rng_engine_arr,
                                 Index tree_idx,
                                 Index tree_in_block,
                                 Index built_tree_count,
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index bdd9b82802d..c846c77a38c 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -368,7 +368,7 @@ void train_kernel_hist_impl<Float, Bin, Index, Task>::allocate_buffers(const tra
 template <typename Float, typename Bin, typename Index, typename Task>
 sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_order(
     train_context_t& ctx,
-    rng_engine_list_t& rng_engine_list,
+    rng_engine_method_t& rng_engine_method,
     pr::ndarray<Index, 1>& node_list_host,
     pr::ndarray<Index, 1>& tree_order_level,
     Index engine_offset,
@@ -401,7 +401,7 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
             rn_gen.uniform_cpu(ctx.selected_row_total_count_,
                                gen_row_idx_global_ptr,
-                               rng_engine_list[engine_offset + node_idx],
+                               rng_engine_method[engine_offset + node_idx],
                                0,
                                ctx.row_total_count_);
 
@@ -465,7 +465,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
     const train_context_t& ctx,
     Index node_count,
     const pr::ndarray<Index, 1>& node_vs_tree_map_list,
-    rng_engine_list_t& rng_engine_list) {
+    rng_engine_method_t& rng_engine_method) {
     ONEDAL_PROFILER_TASK(gen_feature_list, queue_);
 
     ONEDAL_ASSERT(node_vs_tree_map_list.get_count() == node_count);
@@ -491,7 +491,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
                 ctx.selected_ftr_count_,
                 selected_features_host_ptr + node * ctx.selected_ftr_count_,
                 selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_,
-                rng_engine_list[tree_map_ptr[node]],
+                rng_engine_method[tree_map_ptr[node]],
                 0,
                 ctx.column_count_);
         }
@@ -517,7 +517,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
     const train_context_t& ctx,
     Index node_count,
     const pr::ndarray<Index, 1>& node_vs_tree_map,
-    rng_engine_list_t& rng_engine_list) {
+    rng_engine_method_t& rng_engine_method) {
     ONEDAL_PROFILER_TASK(gen_random_thresholds, queue_);
 
     ONEDAL_ASSERT(node_vs_tree_map.get_count() == node_count);
@@ -539,7 +539,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
     for (Index node = 0; node < node_count; ++node) {
         rn_gen.uniform_cpu(ctx.selected_ftr_count_,
                            random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                           rng_engine_list[tree_map_ptr[node]],
+                           rng_engine_method[tree_map_ptr[node]],
                            0.0f,
                            1.0f);
     }
@@ -1613,7 +1613,7 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
     pr::ndarray<hist_type_t, 1>& oob_per_obs_list,
     pr::ndarray<Float, 1>& var_imp,
     pr::ndarray<Float, 1>& var_imp_variance,
-    const rng_engine_list_t& engine_arr,
+    const rng_engine_method_t& engine_arr,
     Index tree_idx_in_block,
     Index tree_in_block_count,
     Index built_tree_count,
@@ -1859,7 +1859,7 @@ train_result<Task> train_kernel_hist_impl<Float, Bin, Index, Task>::operator()(
     de::check_mul_overflow<std::size_t>((ctx.tree_count_ - 1), skip_num);
 
     pr::engine_collection collection(ctx.tree_count_, desc.get_seed());
-    rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) {
+    rng_engine_method_t engine_arr = collection([&](std::size_t i, std::size_t& skip) {
         skip = i * skip_num;
     });
 
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
index 98d4bf60047..bd5773ff093 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
@@ -123,7 +123,7 @@ struct louvain_data {
     // Total link weight in the network
     value_type m;
 
-    daal_engine<engine_list::mt2203> eng;
+    host_engine<engine_method::mt2203> eng;
     rng<std::int32_t> rn_gen;
 
     const std::int64_t vertex_count;
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index f16d5777182..03f751e570b 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -577,7 +577,7 @@ class logloss_test : public te::float_algo_fixture<std::tuple_element_t<0, Param
             ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host);
 
         for (std::int32_t ij = 0; ij < num_checks; ++ij) {
-            primitives::daal_engine eng(2007 + dim * num_checks + ij);
+            primitives::host_engine eng(2007 + dim * num_checks + ij);
             rn_gen.uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
             auto vec_gpu = vec_host.to_device(this->get_queue());
             auto out_vector =
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
index f90aa3d8a87..985d863b1f5 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
@@ -102,7 +102,7 @@ class logloss_spmd_test : public logloss_test<Param> {
         std::vector<ndarray<float_t, 1>> vecs_host(num_checks), vecs_gpu(num_checks);
         rng<float_t> rn_gen;
         for (std::int64_t ij = 0; ij < num_checks; ++ij) {
-            daal_engine eng(2007 + dim * num_checks + ij);
+            host_engine eng(2007 + dim * num_checks + ij);
             vecs_host[ij] =
                 (ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host));
             rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
index 56d7f8c5c23..c912a3a99d2 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
@@ -44,7 +44,7 @@ class cg_solver_test : public te::float_algo_fixture<Param> {
         b_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         primitives::rng<float_t> rn_gen;
-        primitives::daal_engine eng(4014 + n_);
+        primitives::host_engine eng(4014 + n_);
         rn_gen.uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host_);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
index 1c82e2c8ac9..120f65f61f0 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
@@ -134,7 +134,7 @@ void create_stable_matrix(sycl::queue& queue,
     auto J = ndarray<Float, 2>::empty(queue, { n, n }, sycl::usm::alloc::host);
     auto eigen_values = ndarray<Float, 1>::empty(queue, { n }, sycl::usm::alloc::host);
     primitives::rng<Float> rn_gen;
-    primitives::daal_engine eng(2007 + n);
+    primitives::host_engine eng(2007 + n);
 
     rn_gen.uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
     rn_gen.uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index 1358c1b8826..d7414924bf6 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -57,7 +57,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto params_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host);
         primitives::rng<float_t> rn_gen;
-        primitives::daal_engine eng(2007 + n);
+        primitives::host_engine eng(2007 + n);
         rn_gen.uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
         rn_gen.uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
         for (std::int64_t i = 0; i < n_; ++i) {
diff --git a/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
similarity index 62%
rename from cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp
rename to cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
index 242f71cea65..1f13975e8d6 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
@@ -19,79 +19,80 @@
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
 #include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
 #include <oneapi/mkl.hpp>
+
 namespace mkl = oneapi::mkl;
 namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-template <engine_list EngineType>
-struct onedal_engine_type;
+template <engine_method EngineType>
+struct dpc_engine_type;
 
 template <>
-struct onedal_engine_type<engine_list::mt2203> {
+struct dpc_engine_type<engine_method::mt2203> {
     using type = oneapi::mkl::rng::mt2203;
 };
 
 template <>
-struct onedal_engine_type<engine_list::mcg59> {
+struct dpc_engine_type<engine_method::mcg59> {
     using type = oneapi::mkl::rng::mcg59;
 };
 
 template <>
-struct onedal_engine_type<engine_list::mt19937> {
+struct dpc_engine_type<engine_method::mt19937> {
     using type = oneapi::mkl::rng::mt19937;
 };
 
 template <>
-struct onedal_engine_type<engine_list::mrg32k3a> {
+struct dpc_engine_type<engine_method::mrg32k3a> {
     using type = oneapi::mkl::rng::mrg32k3a;
 };
 
 template <>
-struct onedal_engine_type<engine_list::philox4x32x10> {
+struct dpc_engine_type<engine_method::philox4x32x10> {
     using type = oneapi::mkl::rng::philox4x32x10;
 };
 
-template <engine_list EngineType = engine_list::mt2203>
-class onedal_engine {
+template <engine_method EngineType = engine_method::mt2203>
+class dpc_engine {
 public:
-    using onedal_engine_t = typename onedal_engine_type<EngineType>::type;
+    using dpc_engine_t = typename dpc_engine_type<EngineType>::type;
 
-    explicit onedal_engine(sycl::queue& queue, std::int64_t seed = 777)
+    explicit dpc_engine(sycl::queue& queue, std::int64_t seed = 777)
             : q(queue),
-              daal_engine_(initialize_daal_engine(seed)),
-              onedal_engine_(initialize_onedal_engine(queue, seed)),
+              host_engine_(initialize_host_engine(seed)),
+              dpc_engine_(initialize_dpc_engine(queue, seed)),
               impl_(dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(
-                  daal_engine_.get())) {
+                  host_engine_.get())) {
         if (!impl_) {
             throw std::domain_error("RNG engine is not supported");
         }
     }
 
-    virtual ~onedal_engine() = default;
+    virtual ~dpc_engine() = default;
 
-    void* get_cpu_engine_state() const {
+    void* get_host_engine_state() const {
         return impl_->getState();
     }
 
     auto& get_cpu_engine() {
-        return daal_engine_;
+        return host_engine_;
     }
 
     auto& get_gpu_engine() {
-        return onedal_engine_;
+        return dpc_engine_;
     }
 
     void skip_ahead_cpu(size_t nSkip) {
-        daal_engine_->skipAhead(nSkip);
+        host_engine_->skipAhead(nSkip);
     }
 
     void skip_ahead_gpu(size_t nSkip) {
         // Will be fixed in the next oneMKL release.
-        if constexpr (EngineType == engine_list::mt2203) {
+        if constexpr (EngineType == engine_method::mt2203) {
         }
         else {
-            skip_ahead(onedal_engine_, nSkip);
+            skip_ahead(dpc_engine_, nSkip);
         }
     }
 
@@ -100,35 +101,36 @@ class onedal_engine {
     }
 
 private:
-    daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) {
+    daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) {
         switch (EngineType) {
-            case engine_list::mt2203:
+            case engine_method::mt2203:
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
-            case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
-            case engine_list::mrg32k3a:
+            case engine_method::mcg59:
+                return daal::algorithms::engines::mcg59::Batch<>::create(seed);
+            case engine_method::mrg32k3a:
                 return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
-            case engine_list::philox4x32x10:
+            case engine_method::philox4x32x10:
                 return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
-            case engine_list::mt19937:
+            case engine_method::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
         }
     }
 
-    onedal_engine_t initialize_onedal_engine(sycl::queue& queue, std::int64_t seed) {
-        if constexpr (EngineType == engine_list::mt2203) {
-            return onedal_engine_t(
+    dpc_engine_t initialize_dpc_engine(sycl::queue& queue, std::int64_t seed) {
+        if constexpr (EngineType == engine_method::mt2203) {
+            return dpc_engine_t(
                 queue,
                 seed,
                 0); // Aligns CPU and GPU results for mt2203, impacts the performance.
         }
         else {
-            return onedal_engine_t(queue, seed);
+            return dpc_engine_t(queue, seed);
         }
     }
     sycl::queue q;
-    daal::algorithms::engines::EnginePtr daal_engine_;
-    onedal_engine_t onedal_engine_;
+    daal::algorithms::engines::EnginePtr host_engine_;
+    dpc_engine_t dpc_engine_;
     daal::algorithms::engines::internal::BatchBaseImpl* impl_;
 };
 
diff --git a/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
similarity index 70%
rename from cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp
rename to cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
index e8286f83051..436e032e608 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,27 +24,27 @@
 #include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
 namespace oneapi::dal::backend::primitives {
 
-template <engine_list EngineType = engine_list::mt2203>
-class daal_engine {
+template <engine_method EngineType = engine_method::mt2203>
+class host_engine {
 public:
-    explicit daal_engine(std::int64_t seed = 777)
-            : daal_engine_(initialize_daal_engine(seed)),
+    explicit host_engine(std::int64_t seed = 777)
+            : host_engine_(initialize_host_engine(seed)),
               impl_(dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(
-                  daal_engine_.get())) {
+                  host_engine_.get())) {
         if (!impl_) {
             throw std::domain_error("RNG engine is not supported");
         }
     }
 
-    explicit daal_engine(const daal::algorithms::engines::EnginePtr& eng) : daal_engine_(eng) {
+    explicit host_engine(const daal::algorithms::engines::EnginePtr& eng) : host_engine_(eng) {
         impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(eng.get());
         if (!impl_) {
             throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
         }
     }
 
-    daal_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) {
-        daal_engine_ = eng;
+    host_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) {
+        host_engine_ = eng;
         impl_ = dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(eng.get());
         if (!impl_) {
             throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported());
@@ -53,33 +53,34 @@ class daal_engine {
         return *this;
     }
 
-    virtual ~daal_engine() = default;
+    virtual ~host_engine() = default;
 
-    void* get_cpu_engine_state() const {
+    void* get_host_engine_state() const {
         return impl_->getState();
     }
 
-    auto& get_cpu_engine() {
-        return daal_engine_;
+    auto& get_host_engine() {
+        return host_engine_;
     }
 
 private:
-    daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) {
+    daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) {
         switch (EngineType) {
-            case engine_list::mt2203:
+            case engine_method::mt2203:
                 return daal::algorithms::engines::mt2203::Batch<>::create(seed);
-            case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed);
-            case engine_list::mrg32k3a:
+            case engine_method::mcg59:
+                return daal::algorithms::engines::mcg59::Batch<>::create(seed);
+            case engine_method::mrg32k3a:
                 return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed);
-            case engine_list::philox4x32x10:
+            case engine_method::philox4x32x10:
                 return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed);
-            case engine_list::mt19937:
+            case engine_method::mt19937:
                 return daal::algorithms::engines::mt19937::Batch<>::create(seed);
             default: throw std::invalid_argument("Unsupported engine type");
         }
     }
 
-    daal::algorithms::engines::EnginePtr daal_engine_;
+    daal::algorithms::engines::EnginePtr host_engine_;
     daal::algorithms::engines::internal::BatchBaseImpl* impl_;
 };
 
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
index 462ee2a3ada..cb235b9e4c3 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "oneapi/dal/backend/primitives/rng/engine_cpu.hpp"
+#include "oneapi/dal/backend/primitives/rng/host_engine.hpp"
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-#include "oneapi/dal/backend/primitives/rng/engine_gpu.hpp"
+#include "oneapi/dal/backend/primitives/rng/dpc_engine.hpp"
 
 #endif
 
@@ -31,33 +31,33 @@ class rng {
     rng() = default;
     ~rng() = default;
 
-    template <engine_list EngineType>
-    void uniform_cpu(Size count, Type* dst, daal_engine<EngineType> daal_engine, Type a, Type b) {
-        auto state = daal_engine.get_cpu_engine_state();
+    template <engine_method EngineType>
+    void uniform_cpu(Size count, Type* dst, host_engine<EngineType> host_engine, Type a, Type b) {
+        auto state = host_engine.get_host_engine_state();
         uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
     }
 
 #ifdef ONEDAL_DATA_PARALLEL
-    template <engine_list EngineType>
-    void uniform_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_, Type a, Type b) {
+    template <engine_method EngineType>
+    void uniform_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a, Type b) {
         if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
             sycl::usm::alloc::device) {
             throw domain_error(dal::detail::error_messages::unsupported_data_type());
         }
-        auto state = engine_.get_cpu_engine_state();
+        auto state = engine_.get_host_engine_state();
         engine_.skip_ahead_gpu(count);
         uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
     }
 #endif
 
-    template <engine_list EngineType>
+    template <engine_method EngineType>
     void uniform_without_replacement_cpu(Size count,
                                          Type* dst,
                                          Type* buffer,
-                                         daal_engine<EngineType> daal_engine,
+                                         host_engine<EngineType> host_engine,
                                          Type a,
                                          Type b) {
-        auto state = daal_engine.get_cpu_engine_state();
+        auto state = host_engine.get_host_engine_state();
         uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
                                                                      dst,
                                                                      buffer,
@@ -66,18 +66,18 @@ class rng {
                                                                      b);
     }
 #ifdef ONEDAL_DATA_PARALLEL
-    template <engine_list EngineType>
+    template <engine_method EngineType>
     void uniform_without_replacement_cpu(Size count,
                                          Type* dst,
                                          Type* buffer,
-                                         onedal_engine<EngineType>& engine_,
+                                         dpc_engine<EngineType>& engine_,
                                          Type a,
                                          Type b) {
         if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
             sycl::usm::alloc::device) {
             throw domain_error(dal::detail::error_messages::unsupported_data_type());
         }
-        void* state = engine_.get_cpu_engine_state();
+        void* state = engine_.get_host_engine_state();
         engine_.skip_ahead_gpu(count);
         uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
                                                                      dst,
@@ -88,12 +88,12 @@ class rng {
     }
 #endif
 
-    template <engine_list EngineType,
+    template <engine_method EngineType,
               typename T = Type,
               typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle_cpu(Size count, Type* dst, daal_engine<EngineType> daal_engine) {
+    void shuffle_cpu(Size count, Type* dst, host_engine<EngineType> host_engine) {
         Type idx[2];
-        auto state = daal_engine.get_cpu_engine_state();
+        auto state = host_engine.get_host_engine_state();
         for (Size i = 0; i < count; ++i) {
             uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
             std::swap(dst[idx[0]], dst[idx[1]]);
@@ -101,16 +101,16 @@ class rng {
     }
 
 #ifdef ONEDAL_DATA_PARALLEL
-    template <engine_list EngineType,
+    template <engine_method EngineType,
               typename T = Type,
               typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle_cpu(Size count, Type* dst, onedal_engine<EngineType>& engine_) {
+    void shuffle_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
         Type idx[2];
         if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
             sycl::usm::alloc::device) {
             throw domain_error(dal::detail::error_messages::unsupported_data_type());
         }
-        void* state = engine_.get_cpu_engine_state();
+        void* state = engine_.get_host_engine_state();
         engine_.skip_ahead_gpu(count);
 
         for (Size i = 0; i < count; ++i) {
@@ -121,30 +121,30 @@ class rng {
 #endif
 
 #ifdef ONEDAL_DATA_PARALLEL
-    template <engine_list EngineType>
+    template <engine_method EngineType>
     void uniform_gpu(sycl::queue& queue,
                      Size count,
                      Type* dst,
-                     onedal_engine<EngineType>& engine_,
+                     dpc_engine<EngineType>& engine_,
                      Type a,
                      Type b,
                      const event_vector& deps = {});
 
-    template <engine_list EngineType>
+    template <engine_method EngineType>
     void uniform_without_replacement_gpu(sycl::queue& queue,
                                          Size count,
                                          Type* dst,
                                          Type* buffer,
-                                         onedal_engine<EngineType>& engine_,
+                                         dpc_engine<EngineType>& engine_,
                                          Type a,
                                          Type b,
                                          const event_vector& deps = {});
 
-    template <engine_list EngineType>
+    template <engine_method EngineType>
     void shuffle_gpu(sycl::queue& queue,
                      Size count,
                      Type* dst,
-                     onedal_engine<EngineType>& engine_,
+                     dpc_engine<EngineType>& engine_,
                      const event_vector& deps = {});
 #endif
 };
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 2e3a0c962c8..cb550a50775 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -23,11 +23,11 @@ namespace oneapi::dal::backend::primitives {
 namespace bk = oneapi::dal::backend;
 
 template <typename Type, typename Size>
-template <engine_list EngineType>
+template <engine_method EngineType>
 void rng<Type, Size>::uniform_gpu(sycl::queue& queue,
                                   Size count,
                                   Type* dst,
-                                  onedal_engine<EngineType>& engine_,
+                                  dpc_engine<EngineType>& engine_,
                                   Type a,
                                   Type b,
                                   const event_vector& deps) {
@@ -42,12 +42,12 @@ void rng<Type, Size>::uniform_gpu(sycl::queue& queue,
 
 //Currently only CPU impl
 template <typename Type, typename Size>
-template <engine_list EngineType>
+template <engine_method EngineType>
 void rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
                                                       Size count,
                                                       Type* dst,
                                                       Type* buffer,
-                                                      onedal_engine<EngineType>& engine_,
+                                                      dpc_engine<EngineType>& engine_,
                                                       Type a,
                                                       Type b,
                                                       const event_vector& deps) {
@@ -55,25 +55,25 @@ void rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
-    void* state = engine_.get_cpu_engine_state();
+    void* state = engine_.get_host_engine_state();
     engine_.skip_ahead_gpu(count);
     uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
 }
 
 //Currently only CPU impl
 template <typename Type, typename Size>
-template <engine_list EngineType>
+template <engine_method EngineType>
 void rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
                                   Size count,
                                   Type* dst,
-                                  onedal_engine<EngineType>& engine_,
+                                  dpc_engine<EngineType>& engine_,
                                   const event_vector& deps) {
     Type idx[2];
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
-    void* state = engine_.get_cpu_engine_state();
+    void* state = engine_.get_host_engine_state();
     engine_.skip_ahead_gpu(count);
 
     for (Size i = 0; i < count; ++i) {
@@ -82,31 +82,31 @@ void rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
     }
 }
 
-#define INSTANTIATE_(F, Size, EngineType)                                                     \
-    template ONEDAL_EXPORT void rng<F, Size>::uniform_gpu(sycl::queue& queue,                 \
-                                                          Size count_,                        \
-                                                          F* dst,                             \
-                                                          onedal_engine<EngineType>& engine_, \
-                                                          F a,                                \
-                                                          F b,                                \
+#define INSTANTIATE_(F, Size, EngineType)                                                  \
+    template ONEDAL_EXPORT void rng<F, Size>::uniform_gpu(sycl::queue& queue,              \
+                                                          Size count_,                     \
+                                                          F* dst,                          \
+                                                          dpc_engine<EngineType>& engine_, \
+                                                          F a,                             \
+                                                          F b,                             \
                                                           const event_vector& deps);
 
-#define INSTANTIATE_FLOAT_(Size)                           \
-    INSTANTIATE_(float, Size, engine_list::mt2203)         \
-    INSTANTIATE_(float, Size, engine_list::mcg59)          \
-    INSTANTIATE_(float, Size, engine_list::mrg32k3a)       \
-    INSTANTIATE_(float, Size, engine_list::philox4x32x10)  \
-    INSTANTIATE_(float, Size, engine_list::mt19937)        \
-    INSTANTIATE_(double, Size, engine_list::mt2203)        \
-    INSTANTIATE_(double, Size, engine_list::mcg59)         \
-    INSTANTIATE_(double, Size, engine_list::mrg32k3a)      \
-    INSTANTIATE_(double, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_(double, Size, engine_list::mt19937)       \
-    INSTANTIATE_(int, Size, engine_list::mt2203)           \
-    INSTANTIATE_(int, Size, engine_list::mcg59)            \
-    INSTANTIATE_(int, Size, engine_list::mrg32k3a)         \
-    INSTANTIATE_(int, Size, engine_list::philox4x32x10)    \
-    INSTANTIATE_(int, Size, engine_list::mt19937)
+#define INSTANTIATE_FLOAT_(Size)                             \
+    INSTANTIATE_(float, Size, engine_method::mt2203)         \
+    INSTANTIATE_(float, Size, engine_method::mcg59)          \
+    INSTANTIATE_(float, Size, engine_method::mrg32k3a)       \
+    INSTANTIATE_(float, Size, engine_method::philox4x32x10)  \
+    INSTANTIATE_(float, Size, engine_method::mt19937)        \
+    INSTANTIATE_(double, Size, engine_method::mt2203)        \
+    INSTANTIATE_(double, Size, engine_method::mcg59)         \
+    INSTANTIATE_(double, Size, engine_method::mrg32k3a)      \
+    INSTANTIATE_(double, Size, engine_method::philox4x32x10) \
+    INSTANTIATE_(double, Size, engine_method::mt19937)       \
+    INSTANTIATE_(int, Size, engine_method::mt2203)           \
+    INSTANTIATE_(int, Size, engine_method::mcg59)            \
+    INSTANTIATE_(int, Size, engine_method::mrg32k3a)         \
+    INSTANTIATE_(int, Size, engine_method::philox4x32x10)    \
+    INSTANTIATE_(int, Size, engine_method::mt19937)
 
 INSTANTIATE_FLOAT_(std::int64_t);
 INSTANTIATE_FLOAT_(std::int32_t);
@@ -117,44 +117,44 @@ INSTANTIATE_FLOAT_(std::int32_t);
         Size count_,                                                           \
         F* dst,                                                                \
         F* buff,                                                               \
-        onedal_engine<EngineType>& engine_,                                    \
+        dpc_engine<EngineType>& engine_,                                       \
         F a,                                                                   \
         F b,                                                                   \
         const event_vector& deps);
 
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                           \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59)          \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a)       \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10)  \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937)        \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203)        \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a)      \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937)       \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203)           \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59)            \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10)    \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937)
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                             \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59)          \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a)       \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10)  \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937)        \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203)        \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a)      \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937)       \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt2203)           \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mcg59)            \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mrg32k3a)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::philox4x32x10)    \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt19937)
 
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
 
-#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                                              \
-    template ONEDAL_EXPORT void rng<F, Size>::shuffle_gpu(sycl::queue& queue,                 \
-                                                          Size count_,                        \
-                                                          F* dst,                             \
-                                                          onedal_engine<EngineType>& engine_, \
+#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                                           \
+    template ONEDAL_EXPORT void rng<F, Size>::shuffle_gpu(sycl::queue& queue,              \
+                                                          Size count_,                     \
+                                                          F* dst,                          \
+                                                          dpc_engine<EngineType>& engine_, \
                                                           const event_vector& deps);
 
-#define INSTANTIATE_SHUFFLE_FLOAT(Size)                        \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203)        \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59)         \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a)      \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::philox4x32x10) \
-    INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937)
+#define INSTANTIATE_SHUFFLE_FLOAT(Size)                          \
+    INSTANTIATE_SHUFFLE(int, Size, engine_method::mt2203)        \
+    INSTANTIATE_SHUFFLE(int, Size, engine_method::mcg59)         \
+    INSTANTIATE_SHUFFLE(int, Size, engine_method::mrg32k3a)      \
+    INSTANTIATE_SHUFFLE(int, Size, engine_method::philox4x32x10) \
+    INSTANTIATE_SHUFFLE(int, Size, engine_method::mt19937)
 
 INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
 INSTANTIATE_SHUFFLE_FLOAT(std::int32_t);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
index 76c56f61f7c..e7e19f64c4d 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp
@@ -36,10 +36,10 @@ class engine_collection {
               engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)),
               params_(count),
               technique_(daal::algorithms::engines::internal::family),
-              daal_engine_list_(count) {}
+              host_engine_method_(count) {}
 
     template <typename Op>
-    std::vector<daal_engine<engine_list::mt2203>> operator()(Op&& op) {
+    std::vector<host_engine<engine_method::mt2203>> operator()(Op&& op) {
         daal::services::Status status;
         for (Size i = 0; i < count_; ++i) {
             op(i, params_.nSkip[i]);
@@ -49,25 +49,25 @@ class engine_collection {
             engine_,
             technique_,
             params_,
-            daal_engine_list_,
+            host_engine_method_,
             &status);
         if (!status) {
             dal::backend::interop::status_to_exception(status);
         }
 
-        std::vector<daal_engine<engine_list::mt2203>> engine_list(count_);
+        std::vector<host_engine<engine_method::mt2203>> engine_method(count_);
         for (Size i = 0; i < count_; ++i) {
-            engine_list[i] = daal_engine_list_[i];
+            engine_method[i] = host_engine_method_[i];
         }
 
         //copy elision
-        return engine_list;
+        return engine_method;
     }
 
 private:
     void select_parallelization_technique(
         daal::algorithms::engines::internal::ParallelizationTechnique& technique) {
-        auto daal_engine_impl =
+        auto host_engine_impl =
             dynamic_cast<daal::algorithms::engines::internal::BatchBaseImpl*>(engine_.get());
 
         daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = {
@@ -77,7 +77,7 @@ class engine_collection {
         };
 
         for (auto& techn : techniques) {
-            if (daal_engine_impl->hasSupport(techn)) {
+            if (host_engine_impl->hasSupport(techn)) {
                 technique = techn;
                 return;
             }
@@ -93,10 +93,10 @@ class engine_collection {
     daal::algorithms::engines::internal::Params<daal::sse2> params_;
     daal::algorithms::engines::internal::ParallelizationTechnique technique_;
     daal::services::internal::TArray<daal::algorithms::engines::EnginePtr, daal::sse2>
-        daal_engine_list_;
+        host_engine_method_;
 };
 
-template <typename Size = std::int64_t, engine_list EngineType = engine_list::mt2203>
+template <typename Size = std::int64_t, engine_method EngineType = engine_method::mt2203>
 class engine_collection_oneapi {
 public:
     engine_collection_oneapi(sycl::queue& queue, Size count, std::int64_t seed = 777)
@@ -104,18 +104,18 @@ class engine_collection_oneapi {
               seed_(seed) {
         engines_.reserve(count_);
         for (Size i = 0; i < count_; ++i) {
-            engines_.push_back(onedal_engine<EngineType>(queue, seed_));
+            engines_.push_back(dpc_engine<EngineType>(queue, seed_));
         }
     }
 
-    std::vector<onedal_engine<EngineType>> get_engines() const {
+    std::vector<dpc_engine<EngineType>> get_engines() const {
         return engines_;
     }
 
 private:
     Size count_;
     std::int64_t seed_;
-    std::vector<onedal_engine<EngineType>> engines_;
+    std::vector<dpc_engine<EngineType>> engines_;
 };
 
 #endif
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
index d502e9282ee..4132fbe557a 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp
@@ -24,6 +24,6 @@
 
 namespace oneapi::dal::backend::primitives {
 
-enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 };
+enum class engine_method { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 };
 
 }
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 411894bdad4..244ac91c72e 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -35,27 +35,27 @@ struct engine_map {};
 
 template <>
 struct engine_map<mt2203> {
-    constexpr static auto value = engine_list::mt2203;
+    constexpr static auto value = engine_method::mt2203;
 };
 
 template <>
 struct engine_map<mcg59> {
-    constexpr static auto value = engine_list::mcg59;
+    constexpr static auto value = engine_method::mcg59;
 };
 
 template <>
 struct engine_map<mrg32k3a> {
-    constexpr static auto value = engine_list::mrg32k3a;
+    constexpr static auto value = engine_method::mrg32k3a;
 };
 
 template <>
 struct engine_map<philox4x32x10> {
-    constexpr static auto value = engine_list::philox4x32x10;
+    constexpr static auto value = engine_method::philox4x32x10;
 };
 
 template <>
 struct engine_map<mt19937> {
-    constexpr static auto value = engine_list::mt19937;
+    constexpr static auto value = engine_method::mt19937;
 };
 
 template <typename engine_type>
@@ -73,13 +73,13 @@ class rng_test : public te::policy_fixture {
         return rn_gen;
     }
 
-    auto get_daal_engine(std::int64_t seed) {
-        auto rng_engine = daal_engine<engine_qq>(seed);
+    auto get_host_engine(std::int64_t seed) {
+        auto rng_engine = host_engine<engine_qq>(seed);
         return rng_engine;
     }
 
     auto get_engine(std::int64_t seed) {
-        auto rng_engine = onedal_engine<engine_qq>(this->get_queue(), seed);
+        auto rng_engine = dpc_engine<engine_qq>(this->get_queue(), seed);
         return rng_engine;
     }
 
@@ -134,29 +134,29 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
 using rng_types_skip_ahead_support = COMBINE_TYPES((float),
                                                    (mt19937, mcg59, mrg32k3a, philox4x32x10));
 
-//Just for perf tests
-TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) {
-    SKIP_IF(this->get_policy().is_cpu());
-    std::int64_t elem_count = GENERATE_COPY(10000);
-    std::int64_t seed = GENERATE_COPY(777);
-
-    auto arr_host = this->allocate_array_host(elem_count);
-    auto arr_host_ptr_ = arr_host.get_mutable_data();
-
-    auto arr_host_fake = this->allocate_array_host(1);
-    auto arr_host_ptr_fake = arr_host_fake.get_mutable_data();
-    auto rn_gen_ = this->get_rng();
-    auto rng_engine_1 = this->get_engine(seed);
-
-    BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
-        rn_gen_.uniform_without_replacement_cpu(elem_count,
-                                                arr_host_ptr_,
-                                                arr_host_ptr_fake,
-                                                rng_engine_1,
-                                                0,
-                                                elem_count);
-    };
-}
+// //Just for perf tests
+// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) {
+//     SKIP_IF(this->get_policy().is_cpu());
+//     std::int64_t elem_count = GENERATE_COPY(10000);
+//     std::int64_t seed = GENERATE_COPY(777);
+
+//     auto arr_host = this->allocate_array_host(elem_count);
+//     auto arr_host_ptr_ = arr_host.get_mutable_data();
+
+//     auto arr_host_fake = this->allocate_array_host(1);
+//     auto arr_host_ptr_fake = arr_host_fake.get_mutable_data();
+//     auto rn_gen_ = this->get_rng();
+//     auto rng_engine_1 = this->get_engine(seed);
+
+//     BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
+//         rn_gen_.uniform_without_replacement_cpu(elem_count,
+//                                                 arr_host_ptr_,
+//                                                 arr_host_ptr_fake,
+//                                                 rng_engine_1,
+//                                                 0,
+//                                                 elem_count);
+//     };
+// }
 
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
@@ -228,13 +228,13 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe
     this->check_results(arr_gpu, arr_host);
 }
 
-//TODO: add engine collection test + daal_engine tests
+//TODO: add engine collection test + host_engine tests
 // TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) {
 //     SKIP_IF(this->get_policy().is_cpu());
 //     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
 //     std::int64_t seed = GENERATE_COPY(1, 777, 999);
 
-//     engine_collection<std::int64_t,engine_list::mcg59> collection(this->get_queue(), 2, seed);
+//     engine_collection<std::int64_t,engine_method::mcg59> collection(this->get_queue(), 2, seed);
 
 //     auto engine_arr = collection.get_engines();
 
diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst
index ce8ca0ec0cc..1da1fd7bc15 100644
--- a/docs/source/daal/algorithms/engines/mrg32k3a.rst
+++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst
@@ -17,7 +17,8 @@
 mrg32k3a
 ========
 
-The engine is based on the 59-bit multiplicative congruential generator.
+The engine is based on a 32-bit combined multiple recursive generator
+with two components of order 3, optimized for batch processing.
 
 .. rubric:: Subsequence selection methods support
 

From 72755dbd3767d62b320f5ba89b0e86b64068358b Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Mon, 16 Dec 2024 08:38:17 -0800
Subject: [PATCH 12/18] minor fixes

---
 .../algorithms/engines/mrg32k3a/mrg32k3a.h    |   2 +-
 .../engines/mrg32k3a/mrg32k3a_types.h         |   2 +-
 .../engines/philox4x32x10/philox4x32x10.h     |   2 +-
 .../philox4x32x10/philox4x32x10_types.h       |   2 +-
 .../vertex_partitioning_default_kernel.hpp    |   7 +-
 .../gpu/train_kernel_hist_impl_dpc.cpp        |  33 ++-
 .../algo/louvain/backend/cpu/louvain_data.hpp |   1 -
 .../vertex_partitioning_default_kernel.hpp    |   2 +-
 .../objective_function/test/fixture.hpp       |   3 +-
 .../objective_function/test/spmd_fixture.hpp  |   4 +-
 .../optimizers/test/cg_solver_dpc.cpp         |   3 +-
 .../primitives/optimizers/test/fixture.hpp    |   5 +-
 .../optimizers/test/newton_cg_dpc.cpp         |  12 +-
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 212 ++++++++----------
 .../dal/backend/primitives/rng/rng_dpc.cpp    |  96 ++++----
 .../backend/primitives/rng/test/rng_dpc.cpp   |  30 +--
 .../daal/algorithms/engines/mrg32k3a.rst      |   2 +-
 17 files changed, 191 insertions(+), 227 deletions(-)

diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
index 518d26e01f1..a70c1853e1a 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h
@@ -17,7 +17,7 @@
 
 /*
 //++
-//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator 
+//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator
 //  with two components of order 3, optimized for batch processing.
 //--
 */
diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
index 8d697dfd72a..8fdc58b98c8 100644
--- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
+++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h
@@ -17,7 +17,7 @@
 
 /*
 //++
-//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator 
+//  Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator
 //  with two components of order 3, optimized for batch processing.
 //--
 */
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
index e57798be50a..3a5d0e33180 100644
--- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h
@@ -17,7 +17,7 @@
 
 /*
 //++
-//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) 
+//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG)
 //  that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness.
 //--
 */
diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
index 778b81f4ec9..0c0a92c9b3a 100644
--- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
+++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h
@@ -17,7 +17,7 @@
 
 /*
 //++
-//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) 
+//  Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG)
 //  that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness.
 //--
 */
diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
index bdda9048082..439fa4665d5 100644
--- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -91,8 +91,11 @@ std::int32_t most_frequent_element(const std::atomic<std::int32_t> *components,
     std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count);
 
     dal::backend::primitives::host_engine eng;
-    dal::backend::primitives::rng<std::int32_t> rn_gen;
-    rn_gen.uniform_cpu(samples_count, rnd_vertex_ids, eng, 0, vertex_count);
+    dal::backend::primitives::uniform_cpu<std::int32_t>(samples_count,
+                                                        rnd_vertex_ids,
+                                                        eng,
+                                                        0,
+                                                        vertex_count);
 
     std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count);
 
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index c846c77a38c..cd6659d9814 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -396,14 +396,13 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
         Index* const node_list_ptr = node_list_host.get_mutable_data();
 
         for (Index node_idx = 0; node_idx < node_count; ++node_idx) {
-            pr::rng<Index> rn_gen;
             Index* gen_row_idx_global_ptr =
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
-            rn_gen.uniform_cpu(ctx.selected_row_total_count_,
-                               gen_row_idx_global_ptr,
-                               rng_engine_method[engine_offset + node_idx],
-                               0,
-                               ctx.row_total_count_);
+            pr::uniform_cpu<Index>(ctx.selected_row_total_count_,
+                                   gen_row_idx_global_ptr,
+                                   rng_engine_method[engine_offset + node_idx],
+                                   0,
+                                   ctx.row_total_count_);
 
             if (ctx.distr_mode_) {
                 Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_;
@@ -483,11 +482,10 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
 
     auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_);
 
-    pr::rng<Index> rn_gen;
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
     if (ctx.selected_ftr_count_ != ctx.column_count_) {
         for (Index node = 0; node < node_count; ++node) {
-            rn_gen.uniform_without_replacement_cpu(
+            pr::uniform_without_replacement_cpu<Index>(
                 ctx.selected_ftr_count_,
                 selected_features_host_ptr + node * ctx.selected_ftr_count_,
                 selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_,
@@ -524,7 +522,6 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
 
     auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_);
 
-    pr::rng<Float> rn_gen;
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
 
     // Create arrays for random generated bins
@@ -537,11 +534,11 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
 
     // Generate random bins for selected features
     for (Index node = 0; node < node_count; ++node) {
-        rn_gen.uniform_cpu(ctx.selected_ftr_count_,
-                           random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                           rng_engine_method[tree_map_ptr[node]],
-                           0.0f,
-                           1.0f);
+        pr::uniform_cpu<Float>(ctx.selected_ftr_count_,
+                               random_bins_host_ptr + node * ctx.selected_ftr_count_,
+                               rng_engine_method[tree_map_ptr[node]],
+                               0.0f,
+                               1.0f);
     }
     auto event_rnd_generate =
         random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count());
@@ -1660,12 +1657,10 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
 
             const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1);
 
-            pr::rng<Index> rn_gen;
-
             for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) {
-                rn_gen.shuffle_cpu(oob_row_count,
-                                   permutation_ptr,
-                                   engine_arr[built_tree_count + tree_idx_in_block]);
+                pr::shuffle_cpu<Index>(oob_row_count,
+                                       permutation_ptr,
+                                       engine_arr[built_tree_count + tree_idx_in_block]);
                 const Float oob_err_perm = compute_oob_error_perm(ctx,
                                                                   model_manager,
                                                                   data_host,
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
index bd5773ff093..ecd49784378 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp
@@ -124,7 +124,6 @@ struct louvain_data {
     value_type m;
 
     host_engine<engine_method::mt2203> eng;
-    rng<std::int32_t> rn_gen;
 
     const std::int64_t vertex_count;
     const std::int64_t edge_count;
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
index e758d769a01..70ceb84ac6e 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology<IndexType>& t,
         ld.random_order[index] = index;
     }
     // random shuffle
-    ld.rn_gen.uniform_cpu(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
+    uniform_cpu<std::int32_t>(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
     for (std::int64_t index = 0; index < t._vertex_count; ++index) {
         std::swap(ld.random_order[index], ld.random_order[ld.index[index]]);
     }
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index 03f751e570b..6a1247a67c4 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -572,13 +572,12 @@ class logloss_test : public te::float_algo_fixture<std::tuple_element_t<0, Param
         const std::int64_t p = hessian_host.get_dimension(0) - 1;
         const std::int64_t dim = fit_intercept ? p + 1 : p;
 
-        primitives::rng<float_t> rn_gen;
         auto vec_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host);
 
         for (std::int32_t ij = 0; ij < num_checks; ++ij) {
             primitives::host_engine eng(2007 + dim * num_checks + ij);
-            rn_gen.uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
+            pr::uniform_cpu<float_t>(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
             auto vec_gpu = vec_host.to_device(this->get_queue());
             auto out_vector =
                 ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
index 985d863b1f5..cf3a2426dd6 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
@@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test<Param> {
         std::int64_t num_checks = 5;
 
         std::vector<ndarray<float_t, 1>> vecs_host(num_checks), vecs_gpu(num_checks);
-        rng<float_t> rn_gen;
+
         for (std::int64_t ij = 0; ij < num_checks; ++ij) {
             host_engine eng(2007 + dim * num_checks + ij);
             vecs_host[ij] =
                 (ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host));
-            rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0);
+            uniform_cpu<float_t>(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0);
             vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue());
         }
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
index c912a3a99d2..27af73de1e9 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
@@ -43,9 +43,8 @@ class cg_solver_test : public te::float_algo_fixture<Param> {
         x_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         b_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
-        primitives::rng<float_t> rn_gen;
         primitives::host_engine eng(4014 + n_);
-        rn_gen.uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
+        primitives::uniform_cpu<float_t>(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host_);
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
index 120f65f61f0..e941c971302 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
@@ -133,11 +133,10 @@ void create_stable_matrix(sycl::queue& queue,
     ONEDAL_ASSERT(A.get_dimension(1) == n);
     auto J = ndarray<Float, 2>::empty(queue, { n, n }, sycl::usm::alloc::host);
     auto eigen_values = ndarray<Float, 1>::empty(queue, { n }, sycl::usm::alloc::host);
-    primitives::rng<Float> rn_gen;
     primitives::host_engine eng(2007 + n);
 
-    rn_gen.uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
-    rn_gen.uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
+    primitives::uniform_cpu<Float>(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
+    primitives::uniform_cpu<Float>(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
 
     // orthogonalize matrix J
     gram_schmidt(J);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index d7414924bf6..a4c0c1ebed3 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<std::int32_t, 1>::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host);
         auto params_host =
             ndarray<float_t, 1>::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host);
-        primitives::rng<float_t> rn_gen;
+
         primitives::host_engine eng(2007 + n);
-        rn_gen.uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
-        rn_gen.uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
+        primitives::uniform_cpu<float_t>(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
+        primitives::uniform_cpu<float_t>(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
         for (std::int64_t i = 0; i < n_; ++i) {
             float_t val = 0;
             for (std::int64_t j = 0; j < p_; ++j) {
@@ -142,9 +142,9 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<float_t, 2>::empty(this->get_queue(), { n_, n_ }, sycl::usm::alloc::host);
         solution_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         auto b_host = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
-        primitives::rng<float_t> rn_gen;
+
         primitives::engine eng(4014 + n_);
-        rn_gen.uniform_cpu(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
+        uniform_cpu<float_t>(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0));
 
@@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto buffer = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         for (std::int32_t test_num = 0; test_num < 5; ++test_num) {
-            rn_gen.uniform_cpu(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
+            uniform_cpu<float_t>(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
             auto x_gpu = x_host.to_device(this->get_queue());
             auto compute_event_vec = func_->update_x(x_gpu, true, {});
             wait_or_pass(compute_event_vec).wait_and_throw();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
index cb235b9e4c3..b935d338ff9 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -25,128 +25,110 @@
 #endif
 
 namespace oneapi::dal::backend::primitives {
-template <typename Type, typename Size = std::int64_t>
-class rng {
-public:
-    rng() = default;
-    ~rng() = default;
-
-    template <engine_method EngineType>
-    void uniform_cpu(Size count, Type* dst, host_engine<EngineType> host_engine, Type a, Type b) {
-        auto state = host_engine.get_host_engine_state();
-        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
-    }
 
-#ifdef ONEDAL_DATA_PARALLEL
-    template <engine_method EngineType>
-    void uniform_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a, Type b) {
-        if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
-            sycl::usm::alloc::device) {
-            throw domain_error(dal::detail::error_messages::unsupported_data_type());
-        }
-        auto state = engine_.get_host_engine_state();
-        engine_.skip_ahead_gpu(count);
-        uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_cpu(Size count, Type* dst, host_engine<EngineType>& host_engine, Type a, Type b) {
+    auto state = host_engine.get_host_engine_state();
+    uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+}
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_without_replacement_cpu(Size count,
+                                     Type* dst,
+                                     Type* buffer,
+                                     host_engine<EngineType> host_engine,
+                                     Type a,
+                                     Type b) {
+    auto state = host_engine.get_host_engine_state();
+    uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
+}
+
+template <typename Type,
+          typename Size,
+          engine_method EngineType,
+          typename T = Type,
+          typename = std::enable_if_t<std::is_integral_v<T>>>
+void shuffle_cpu(Size count, Type* dst, host_engine<EngineType> host_engine) {
+    Type idx[2];
+    auto state = host_engine.get_host_engine_state();
+    for (Size i = 0; i < count; ++i) {
+        uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
+        std::swap(dst[idx[0]], dst[idx[1]]);
     }
-#endif
+}
 
-    template <engine_method EngineType>
-    void uniform_without_replacement_cpu(Size count,
-                                         Type* dst,
-                                         Type* buffer,
-                                         host_engine<EngineType> host_engine,
-                                         Type a,
-                                         Type b) {
-        auto state = host_engine.get_host_engine_state();
-        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
-                                                                     dst,
-                                                                     buffer,
-                                                                     state,
-                                                                     a,
-                                                                     b);
-    }
 #ifdef ONEDAL_DATA_PARALLEL
-    template <engine_method EngineType>
-    void uniform_without_replacement_cpu(Size count,
-                                         Type* dst,
-                                         Type* buffer,
-                                         dpc_engine<EngineType>& engine_,
-                                         Type a,
-                                         Type b) {
-        if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
-            sycl::usm::alloc::device) {
-            throw domain_error(dal::detail::error_messages::unsupported_data_type());
-        }
-        void* state = engine_.get_host_engine_state();
-        engine_.skip_ahead_gpu(count);
-        uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count,
-                                                                     dst,
-                                                                     buffer,
-                                                                     state,
-                                                                     a,
-                                                                     b);
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a, Type b) {
+    if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+        sycl::usm::alloc::device) {
+        throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
-#endif
-
-    template <engine_method EngineType,
-              typename T = Type,
-              typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle_cpu(Size count, Type* dst, host_engine<EngineType> host_engine) {
-        Type idx[2];
-        auto state = host_engine.get_host_engine_state();
-        for (Size i = 0; i < count; ++i) {
-            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
-            std::swap(dst[idx[0]], dst[idx[1]]);
-        }
+    auto state = engine_.get_host_engine_state();
+    engine_.skip_ahead_gpu(count);
+    uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+}
+
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_without_replacement_cpu(Size count,
+                                     Type* dst,
+                                     Type* buffer,
+                                     dpc_engine<EngineType>& engine_,
+                                     Type a,
+                                     Type b) {
+    if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+        sycl::usm::alloc::device) {
+        throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
-
-#ifdef ONEDAL_DATA_PARALLEL
-    template <engine_method EngineType,
-              typename T = Type,
-              typename = std::enable_if_t<std::is_integral_v<T>>>
-    void shuffle_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
-        Type idx[2];
-        if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
-            sycl::usm::alloc::device) {
-            throw domain_error(dal::detail::error_messages::unsupported_data_type());
-        }
-        void* state = engine_.get_host_engine_state();
-        engine_.skip_ahead_gpu(count);
-
-        for (Size i = 0; i < count; ++i) {
-            uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
-            std::swap(dst[idx[0]], dst[idx[1]]);
-        }
+    void* state = engine_.get_host_engine_state();
+    engine_.skip_ahead_gpu(count);
+    uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
+}
+
+template <typename Type,
+          typename Size,
+          engine_method EngineType,
+          typename T = Type,
+          typename = std::enable_if_t<std::is_integral_v<T>>>
+void shuffle_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
+    Type idx[2];
+    if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
+        sycl::usm::alloc::device) {
+        throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
-#endif
+    void* state = engine_.get_host_engine_state();
+    engine_.skip_ahead_gpu(count);
 
-#ifdef ONEDAL_DATA_PARALLEL
-    template <engine_method EngineType>
-    void uniform_gpu(sycl::queue& queue,
-                     Size count,
-                     Type* dst,
-                     dpc_engine<EngineType>& engine_,
-                     Type a,
-                     Type b,
-                     const event_vector& deps = {});
-
-    template <engine_method EngineType>
-    void uniform_without_replacement_gpu(sycl::queue& queue,
-                                         Size count,
-                                         Type* dst,
-                                         Type* buffer,
-                                         dpc_engine<EngineType>& engine_,
-                                         Type a,
-                                         Type b,
-                                         const event_vector& deps = {});
-
-    template <engine_method EngineType>
-    void shuffle_gpu(sycl::queue& queue,
-                     Size count,
-                     Type* dst,
-                     dpc_engine<EngineType>& engine_,
-                     const event_vector& deps = {});
+    for (Size i = 0; i < count; ++i) {
+        uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
+        std::swap(dst[idx[0]], dst[idx[1]]);
+    }
+}
+
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_gpu(sycl::queue& queue,
+                 Size count,
+                 Type* dst,
+                 dpc_engine<EngineType>& engine_,
+                 Type a,
+                 Type b,
+                 const event_vector& deps = {});
+
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_without_replacement_gpu(sycl::queue& queue,
+                                     Size count,
+                                     Type* dst,
+                                     Type* buffer,
+                                     dpc_engine<EngineType>& engine_,
+                                     Type a,
+                                     Type b,
+                                     const event_vector& deps = {});
+
+template <typename Type, typename Size, engine_method EngineType>
+void shuffle_gpu(sycl::queue& queue,
+                 Size count,
+                 Type* dst,
+                 dpc_engine<EngineType>& engine_,
+                 const event_vector& deps = {});
 #endif
-};
 
-}; // namespace oneapi::dal::backend::primitives
\ No newline at end of file
+}; // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index cb550a50775..ec586ec1697 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -22,15 +22,14 @@ namespace oneapi::dal::backend::primitives {
 
 namespace bk = oneapi::dal::backend;
 
-template <typename Type, typename Size>
-template <engine_method EngineType>
-void rng<Type, Size>::uniform_gpu(sycl::queue& queue,
-                                  Size count,
-                                  Type* dst,
-                                  dpc_engine<EngineType>& engine_,
-                                  Type a,
-                                  Type b,
-                                  const event_vector& deps) {
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_gpu(sycl::queue& queue,
+                 Size count,
+                 Type* dst,
+                 dpc_engine<EngineType>& engine_,
+                 Type a,
+                 Type b,
+                 const event_vector& deps) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
@@ -41,16 +40,15 @@ void rng<Type, Size>::uniform_gpu(sycl::queue& queue,
 }
 
 //Currently only CPU impl
-template <typename Type, typename Size>
-template <engine_method EngineType>
-void rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
-                                                      Size count,
-                                                      Type* dst,
-                                                      Type* buffer,
-                                                      dpc_engine<EngineType>& engine_,
-                                                      Type a,
-                                                      Type b,
-                                                      const event_vector& deps) {
+template <typename Type, typename Size, engine_method EngineType>
+void uniform_without_replacement_gpu(sycl::queue& queue,
+                                     Size count,
+                                     Type* dst,
+                                     Type* buffer,
+                                     dpc_engine<EngineType>& engine_,
+                                     Type a,
+                                     Type b,
+                                     const event_vector& deps) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
@@ -61,13 +59,12 @@ void rng<Type, Size>::uniform_without_replacement_gpu(sycl::queue& queue,
 }
 
 //Currently only CPU impl
-template <typename Type, typename Size>
-template <engine_method EngineType>
-void rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
-                                  Size count,
-                                  Type* dst,
-                                  dpc_engine<EngineType>& engine_,
-                                  const event_vector& deps) {
+template <typename Type, typename Size, engine_method EngineType>
+void shuffle_gpu(sycl::queue& queue,
+                 Size count,
+                 Type* dst,
+                 dpc_engine<EngineType>& engine_,
+                 const event_vector& deps) {
     Type idx[2];
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
@@ -82,14 +79,14 @@ void rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
     }
 }
 
-#define INSTANTIATE_(F, Size, EngineType)                                                  \
-    template ONEDAL_EXPORT void rng<F, Size>::uniform_gpu(sycl::queue& queue,              \
-                                                          Size count_,                     \
-                                                          F* dst,                          \
-                                                          dpc_engine<EngineType>& engine_, \
-                                                          F a,                             \
-                                                          F b,                             \
-                                                          const event_vector& deps);
+#define INSTANTIATE_(F, Size, EngineType)                                    \
+    template ONEDAL_EXPORT void uniform_gpu(sycl::queue& queue,              \
+                                            Size count_,                     \
+                                            F* dst,                          \
+                                            dpc_engine<EngineType>& engine_, \
+                                            F a,                             \
+                                            F b,                             \
+                                            const event_vector& deps);
 
 #define INSTANTIATE_FLOAT_(Size)                             \
     INSTANTIATE_(float, Size, engine_method::mt2203)         \
@@ -111,16 +108,15 @@ void rng<Type, Size>::shuffle_gpu(sycl::queue& queue,
 INSTANTIATE_FLOAT_(std::int64_t);
 INSTANTIATE_FLOAT_(std::int32_t);
 
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType)       \
-    template ONEDAL_EXPORT void rng<F, Size>::uniform_without_replacement_gpu( \
-        sycl::queue& queue,                                                    \
-        Size count_,                                                           \
-        F* dst,                                                                \
-        F* buff,                                                               \
-        dpc_engine<EngineType>& engine_,                                       \
-        F a,                                                                   \
-        F b,                                                                   \
-        const event_vector& deps);
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType)                         \
+    template ONEDAL_EXPORT void uniform_without_replacement_gpu(sycl::queue& queue,              \
+                                                                Size count_,                     \
+                                                                F* dst,                          \
+                                                                F* buff,                         \
+                                                                dpc_engine<EngineType>& engine_, \
+                                                                F a,                             \
+                                                                F b,                             \
+                                                                const event_vector& deps);
 
 #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                             \
     INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203)         \
@@ -142,12 +138,12 @@ INSTANTIATE_FLOAT_(std::int32_t);
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
 
-#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                                           \
-    template ONEDAL_EXPORT void rng<F, Size>::shuffle_gpu(sycl::queue& queue,              \
-                                                          Size count_,                     \
-                                                          F* dst,                          \
-                                                          dpc_engine<EngineType>& engine_, \
-                                                          const event_vector& deps);
+#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                             \
+    template ONEDAL_EXPORT void shuffle_gpu(sycl::queue& queue,              \
+                                            Size count_,                     \
+                                            F* dst,                          \
+                                            dpc_engine<EngineType>& engine_, \
+                                            const event_vector& deps);
 
 #define INSTANTIATE_SHUFFLE_FLOAT(Size)                          \
     INSTANTIATE_SHUFFLE(int, Size, engine_method::mt2203)        \
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 244ac91c72e..3a96d6780c1 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -68,11 +68,6 @@ class rng_test : public te::policy_fixture {
     using EngineType = std::tuple_element_t<1, TestType>;
     static constexpr auto engine_qq = engine_v<EngineType>;
 
-    auto get_rng() const {
-        rng<DataType> rn_gen;
-        return rn_gen;
-    }
-
     auto get_host_engine(std::int64_t seed) {
         auto rng_engine = host_engine<engine_qq>(seed);
         return rng_engine;
@@ -109,7 +104,7 @@ class rng_test : public te::policy_fixture {
     }
 };
 
-using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10));
+using rng_types = COMBINE_TYPES((float), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10));
 
 TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     SKIP_IF(this->get_policy().is_cpu());
@@ -121,12 +116,11 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
-    auto rn_gen = this->get_rng();
     auto rng_engine = this->get_engine(seed);
     auto rng_engine_ = this->get_engine(seed);
 
-    rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
-    rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
+    uniform_cpu<float>(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
+    uniform_gpu<float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
 
     this->check_results(arr_gpu, arr_host);
 }
@@ -174,15 +168,14 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
-    auto rn_gen = this->get_rng();
     auto rng_engine = this->get_engine(seed);
     auto rng_engine_2 = this->get_engine(seed);
 
-    rn_gen.uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
-    rn_gen.uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
+    uniform_cpu<float>(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
+    uniform_cpu<float>(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
 
-    rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
-    rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+    uniform_gpu<float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    uniform_cpu<float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
     this->check_results(arr_host_init_1, arr_host_init_2);
     this->check_results(arr_gpu, arr_host);
@@ -204,25 +197,24 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
-    auto rn_gen = this->get_rng();
     auto rng_engine = this->get_engine(seed);
     auto rng_engine_2 = this->get_engine(seed);
 
-    rn_gen.uniform_gpu(this->get_queue(),
+    uniform_gpu<float>(this->get_queue(),
                        elem_count,
                        arr_device_init_1_ptr,
                        rng_engine,
                        0,
                        elem_count);
-    rn_gen.uniform_gpu(this->get_queue(),
+    uniform_gpu<float>(this->get_queue(),
                        elem_count,
                        arr_device_init_2_ptr,
                        rng_engine_2,
                        0,
                        elem_count);
 
-    rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
-    rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+    uniform_gpu<float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    uniform_cpu<float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
     this->check_results(arr_device_init_1, arr_device_init_2);
     this->check_results(arr_gpu, arr_host);
diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst
index 1da1fd7bc15..3d32c4532fb 100644
--- a/docs/source/daal/algorithms/engines/mrg32k3a.rst
+++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst
@@ -1,5 +1,5 @@
 .. ******************************************************************************
-.. * Copyright 2020 Intel Corporation
+.. * Copyright contributors to the oneDAL project
 .. *
 .. * Licensed under the Apache License, Version 2.0 (the "License");
 .. * you may not use this file except in compliance with the License.

From 76967f3cacfe591d3ebba53dc1780711899626eb Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 17 Dec 2024 02:29:58 -0800
Subject: [PATCH 13/18] minor fixes

---
 .../engines/mcg59/mcg59_batch_impl.h          |  7 +-
 .../engines/mrg32k3a/mrg32k3a_batch_impl.h    |  7 +-
 .../engines/mt19937/mt19937_batch_impl.h      |  7 +-
 .../engines/mt2203/mt2203_batch_impl.h        |  7 +-
 .../philox4x32x10/philox4x32x10_batch_impl.h  |  7 +-
 cpp/daal/src/externals/service_rng_mkl.h      |  4 ++
 cpp/daal/src/externals/service_rng_openrng.h  |  4 ++
 cpp/daal/src/externals/service_rng_ref.h      | 16 +++--
 .../dal/backend/primitives/rng/rng_dpc.cpp    |  2 +-
 .../backend/primitives/rng/test/rng_dpc.cpp   |  2 +-
 .../daal/algorithms/engines/philox4x32x10.rst | 64 +++++++++++++++++++
 11 files changed, 94 insertions(+), 33 deletions(-)
 create mode 100644 docs/source/daal/algorithms/engines/philox4x32x10.rst

diff --git a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h
index 6c3040da615..62f337ba9a0 100644
--- a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h
@@ -26,9 +26,6 @@
 #include "src/externals/service_rng.h"
 #include "src/data_management/service_numeric_table.h"
 
-static const int leapfrogMethodErrcode  = -1002;
-static const int skipAheadMethodErrcode = -1003;
-
 namespace daal
 {
 namespace algorithms
@@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mcg59::interface1::Batch<algorithm
     {
         int errcode = baseRng.leapfrog(threadNum, nThreads);
         services::Status s;
-        if (errcode == leapfrogMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED)
             s.add(ErrorLeapfrogUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
@@ -78,7 +75,7 @@ class BatchImpl : public algorithms::engines::mcg59::interface1::Batch<algorithm
     {
         int errcode = baseRng.skipAhead(nSkip);
         services::Status s;
-        if (errcode == skipAheadMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED)
             s.add(ErrorSkipAheadUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
index 469ec92a0ab..9c226e54af3 100644
--- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h
@@ -26,9 +26,6 @@
 #include "src/externals/service_rng.h"
 #include "src/data_management/service_numeric_table.h"
 
-static const int leapfrogMethodErrcode  = -1002;
-static const int skipAheadMethodErrcode = -1003;
-
 namespace daal
 {
 namespace algorithms
@@ -68,7 +65,7 @@ class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch<algori
     {
         int errcode = baseRng.leapfrog(threadNum, nThreads);
         services::Status s;
-        if (errcode == leapfrogMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED)
             s.add(ErrorLeapfrogUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
@@ -79,7 +76,7 @@ class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch<algori
     {
         int errcode = baseRng.skipAhead(nSkip);
         services::Status s;
-        if (errcode == skipAheadMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED)
             s.add(ErrorSkipAheadUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
diff --git a/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h b/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h
index e92d0e46612..805ded3153c 100644
--- a/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h
@@ -26,9 +26,6 @@
 #include "src/externals/service_rng.h"
 #include "src/data_management/service_numeric_table.h"
 
-static const int leapfrogMethodErrcode  = -1002;
-static const int skipAheadMethodErrcode = -1003;
-
 namespace daal
 {
 namespace algorithms
@@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mt19937::interface1::Batch<algorit
     {
         int errcode = baseRng.leapfrog(threadNum, nThreads);
         services::Status s;
-        if (errcode == leapfrogMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED)
             s.add(ErrorLeapfrogUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
@@ -78,7 +75,7 @@ class BatchImpl : public algorithms::engines::mt19937::interface1::Batch<algorit
     {
         int errcode = baseRng.skipAhead(nSkip);
         services::Status s;
-        if (errcode == skipAheadMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED)
             s.add(ErrorSkipAheadUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
diff --git a/cpp/daal/src/algorithms/engines/mt2203/mt2203_batch_impl.h b/cpp/daal/src/algorithms/engines/mt2203/mt2203_batch_impl.h
index ca8c01efd5f..4899104aff9 100644
--- a/cpp/daal/src/algorithms/engines/mt2203/mt2203_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/mt2203/mt2203_batch_impl.h
@@ -27,9 +27,6 @@
 #include "src/data_management/service_numeric_table.h"
 #include "services/collection.h"
 
-static const int leapfrogMethodErrcode  = -1002;
-static const int skipAheadMethodErrcode = -1003;
-
 namespace daal
 {
 namespace algorithms
@@ -188,7 +185,7 @@ class BatchImpl : public algorithms::engines::mt2203::interface1::Batch<algorith
     {
         int errcode = _streams[i]->leapfrog(threadNum, nThreads);
         services::Status s;
-        if (errcode == leapfrogMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED)
             s.add(ErrorLeapfrogUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
@@ -199,7 +196,7 @@ class BatchImpl : public algorithms::engines::mt2203::interface1::Batch<algorith
     {
         int errcode = _streams[i]->skipAhead(nSkip);
         services::Status s;
-        if (errcode == skipAheadMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED)
             s.add(ErrorSkipAheadUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
index 58e28eb47bf..1f7b40526ac 100644
--- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
+++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h
@@ -26,9 +26,6 @@
 #include "src/externals/service_rng.h"
 #include "src/data_management/service_numeric_table.h"
 
-static const int leapfrogMethodErrcode  = -1002;
-static const int skipAheadMethodErrcode = -1003;
-
 namespace daal
 {
 namespace algorithms
@@ -68,7 +65,7 @@ class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch<a
     {
         int errcode = baseRng.leapfrog(threadNum, nThreads);
         services::Status s;
-        if (errcode == leapfrogMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED)
             s.add(ErrorLeapfrogUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
@@ -79,7 +76,7 @@ class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch<a
     {
         int errcode = baseRng.skipAhead(nSkip);
         services::Status s;
-        if (errcode == skipAheadMethodErrcode)
+        if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED)
             s.add(ErrorSkipAheadUnsupported);
         else if (errcode)
             s.add(ErrorIncorrectErrorcodeFromGenerator);
diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h
index 83edae913e2..425695c7f66 100644
--- a/cpp/daal/src/externals/service_rng_mkl.h
+++ b/cpp/daal/src/externals/service_rng_mkl.h
@@ -41,6 +41,10 @@
 #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2
 #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF       VSL_RNG_METHOD_GAUSSIAN_ICDF
 
+// Errors
+#define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED  -1002
+#define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003
+
 namespace daal
 {
 namespace internal
diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h
index 3d1b9833a52..0e49c62c83b 100644
--- a/cpp/daal/src/externals/service_rng_openrng.h
+++ b/cpp/daal/src/externals/service_rng_openrng.h
@@ -34,6 +34,10 @@
 #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2
 #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF       VSL_RNG_METHOD_GAUSSIAN_ICDF
 
+// Errors
+#define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED  -1002
+#define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003
+
 namespace daal
 {
 namespace internal
diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h
index 6b7aa53359e..f2df997c87d 100644
--- a/cpp/daal/src/externals/service_rng_ref.h
+++ b/cpp/daal/src/externals/service_rng_ref.h
@@ -42,12 +42,16 @@
     #define __DAAL_BRNG_MRG32K3A      (1 << 20) * 3  //VSL_BRNG_MRG32K3A
     #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10
 
-    #define __DAAL_RNG_METHOD_UNIFORM_STD         0 //VSL_RNG_METHOD_UNIFORM_STD
-    #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD   4
-    #define __DAAL_RNG_METHOD_BERNOULLI_ICDF      0 //VSL_RNG_METHOD_BERNOULLI_ICDF
-    #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER  0 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER
-    #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 1 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2
-    #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF       2 //VSL_RNG_METHOD_GAUSSIAN_ICDF
+    #define __DAAL_RNG_METHOD_UNIFORM_STD          0 //VSL_RNG_METHOD_UNIFORM_STD
+    #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD    4
+    #define __DAAL_RNG_METHOD_BERNOULLI_ICDF       0 //VSL_RNG_METHOD_BERNOULLI_ICDF
+    #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER   0 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER
+    #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2  1 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2
+    #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF        2 //VSL_RNG_METHOD_GAUSSIAN_ICDF
+
+    // Errors
+    #define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED  -1002
+    #define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003
 
 namespace daal
 {
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index ec586ec1697..6fc39290f96 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 3a96d6780c1..466ce350e9b 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/docs/source/daal/algorithms/engines/philox4x32x10.rst b/docs/source/daal/algorithms/engines/philox4x32x10.rst
new file mode 100644
index 00000000000..83e1a4a2c3f
--- /dev/null
+++ b/docs/source/daal/algorithms/engines/philox4x32x10.rst
@@ -0,0 +1,64 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+philox4x32x10
+=============
+
+Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG)
+that uses a 4x32-bit counter and a 2x32-bit key and performs 10 rounds of mixing to produce high-quality randomness.
+
+.. rubric:: Subsequence selection methods support
+
+skipAhead (nskip)
+    Supported
+leapfrog (threadIdx, nThreads)
+    Supported
+
+Batch Processing
+****************
+
+philox4x32x10 engine needs the initial condition (``seed``) for state initialization.
+The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors.
+
+.. rubric:: Algorithm Parameters
+
+philox4x32x10 engine has the following parameters:
+
+.. tabularcolumns::  |\Y{0.2}|\Y{0.2}|\Y{0.6}|
+
+.. list-table:: Algorithm Parameters for philox4x32x10 engine (Batch Processing)
+   :header-rows: 1
+   :widths: 10 20 30
+   :align: left
+   :class: longtable
+
+   * - Parameter
+     - Default Value
+     - Description
+   * - ``algorithmFPType``
+     - ``float``
+     - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``.
+   * - ``method``
+     - ``defaultDense``
+     - Performance-oriented computation method; the only method supported by the algorithm.
+   * - ``seed``
+     -
+       - :math:`777` for a scalar seed
+       - NA for a vector seed
+     - Initial condition for state initialization, scalar or vector:
+
+       - Scalar, value of ``size_t`` type
+       - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p`

From 917fa3d5165f667b923eca2d8632eb9bc0b04e2f Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 17 Dec 2024 03:46:58 -0800
Subject: [PATCH 14/18] fixes

---
 .../backend/gpu/train_kernel_hist_impl.hpp    | 14 ++--
 .../gpu/train_kernel_hist_impl_dpc.cpp        | 16 ++--
 .../optimizers/test/newton_cg_dpc.cpp         |  4 +-
 .../dal/backend/primitives/rng/dpc_engine.hpp |  2 +-
 .../backend/primitives/rng/host_engine.hpp    |  6 +-
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 12 +--
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 77 +++++++++---------
 .../backend/primitives/rng/test/rng_dpc.cpp   | 81 +++++++------------
 .../daal/algorithms/engines/mrg32k3a.rst      | 28 +++----
 .../daal/algorithms/engines/philox4x32x10.rst | 28 +++----
 10 files changed, 123 insertions(+), 145 deletions(-)

diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
index ac04f73d89f..f677e69b615 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp
@@ -51,7 +51,7 @@ class train_kernel_hist_impl {
     using train_context_t = train_context<Float, Index, Task>;
     using imp_data_t = impurity_data<Float, Index, Task>;
     using rng_engine_t = pr::host_engine<pr::engine_method::mt2203>;
-    using rng_engine_method_t = std::vector<rng_engine_t>;
+    using rng_engine_list_t = std::vector<rng_engine_t>;
     using msg = dal::detail::error_messages;
     using comm_t = bk::communicator<spmd::device_memory_access::usm>;
     using node_t = node<Index>;
@@ -79,7 +79,7 @@ class train_kernel_hist_impl {
                                           Index class_count) const;
 
     sycl::event gen_initial_tree_order(train_context_t& ctx,
-                                       rng_engine_method_t& rng_engine_method,
+                                       rng_engine_list_t& rng_engine_list,
                                        pr::ndarray<Index, 1>& node_list,
                                        pr::ndarray<Index, 1>& tree_order_level,
                                        Index engine_offset,
@@ -115,12 +115,12 @@ class train_kernel_hist_impl {
     /// @param[in] ctx              a training context structure for a GPU backend
     /// @param[in] node_count       number of nodes on the current level
     /// @param[in] node_vs_tree_map an initial tree order
-    /// @param[in] rng_engine_method  a list of random generator engines
+    /// @param[in] rng_engine_list  a list of random generator engines
     std::tuple<pr::ndarray<Index, 1>, sycl::event> gen_feature_list(
         const train_context_t& ctx,
         Index node_count,
         const pr::ndarray<Index, 1>& node_vs_tree_map,
-        rng_engine_method_t& rng_engine_method);
+        rng_engine_list_t& rng_engine_list);
 
     /// Generates random thresholds for each node and for each selected feature for node.
     /// Thresholds are used for a random splitter kernel to split each node.
@@ -129,12 +129,12 @@ class train_kernel_hist_impl {
     /// @param[in] ctx              a training context structure for a GPU backend
     /// @param[in] node_count       number of nodes on the current level
     /// @param[in] node_vs_tree_map an initial tree order
-    /// @param[in] rng_engine_method  a list of random generator engines
+    /// @param[in] rng_engine_list  a list of random generator engines
     std::tuple<pr::ndarray<Float, 1>, sycl::event> gen_random_thresholds(
         const train_context_t& ctx,
         Index node_count,
         const pr::ndarray<Index, 1>& node_vs_tree_map,
-        rng_engine_method_t& rng_engine_method);
+        rng_engine_list_t& rng_engine_list);
 
     /// Computes initial impurity for each node.
     ///
@@ -575,7 +575,7 @@ class train_kernel_hist_impl {
                                 pr::ndarray<hist_type_t, 1>& oob_per_obs_list,
                                 pr::ndarray<Float, 1>& var_imp,
                                 pr::ndarray<Float, 1>& var_imp_variance,
-                                const rng_engine_method_t& rng_engine_arr,
+                                const rng_engine_list_t& rng_engine_arr,
                                 Index tree_idx,
                                 Index tree_in_block,
                                 Index built_tree_count,
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index cd6659d9814..0e6d4bc3a36 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -368,7 +368,7 @@ void train_kernel_hist_impl<Float, Bin, Index, Task>::allocate_buffers(const tra
 template <typename Float, typename Bin, typename Index, typename Task>
 sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_order(
     train_context_t& ctx,
-    rng_engine_method_t& rng_engine_method,
+    rng_engine_list_t& rng_engine_list,
     pr::ndarray<Index, 1>& node_list_host,
     pr::ndarray<Index, 1>& tree_order_level,
     Index engine_offset,
@@ -400,7 +400,7 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
             pr::uniform_cpu<Index>(ctx.selected_row_total_count_,
                                    gen_row_idx_global_ptr,
-                                   rng_engine_method[engine_offset + node_idx],
+                                   rng_engine_list[engine_offset + node_idx],
                                    0,
                                    ctx.row_total_count_);
 
@@ -464,7 +464,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
     const train_context_t& ctx,
     Index node_count,
     const pr::ndarray<Index, 1>& node_vs_tree_map_list,
-    rng_engine_method_t& rng_engine_method) {
+    rng_engine_list_t& rng_engine_list) {
     ONEDAL_PROFILER_TASK(gen_feature_list, queue_);
 
     ONEDAL_ASSERT(node_vs_tree_map_list.get_count() == node_count);
@@ -489,7 +489,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
                 ctx.selected_ftr_count_,
                 selected_features_host_ptr + node * ctx.selected_ftr_count_,
                 selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_,
-                rng_engine_method[tree_map_ptr[node]],
+                rng_engine_list[tree_map_ptr[node]],
                 0,
                 ctx.column_count_);
         }
@@ -515,7 +515,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
     const train_context_t& ctx,
     Index node_count,
     const pr::ndarray<Index, 1>& node_vs_tree_map,
-    rng_engine_method_t& rng_engine_method) {
+    rng_engine_list_t& rng_engine_list) {
     ONEDAL_PROFILER_TASK(gen_random_thresholds, queue_);
 
     ONEDAL_ASSERT(node_vs_tree_map.get_count() == node_count);
@@ -536,7 +536,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
     for (Index node = 0; node < node_count; ++node) {
         pr::uniform_cpu<Float>(ctx.selected_ftr_count_,
                                random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                               rng_engine_method[tree_map_ptr[node]],
+                               rng_engine_list[tree_map_ptr[node]],
                                0.0f,
                                1.0f);
     }
@@ -1610,7 +1610,7 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
     pr::ndarray<hist_type_t, 1>& oob_per_obs_list,
     pr::ndarray<Float, 1>& var_imp,
     pr::ndarray<Float, 1>& var_imp_variance,
-    const rng_engine_method_t& engine_arr,
+    const rng_engine_list_t& engine_arr,
     Index tree_idx_in_block,
     Index tree_in_block_count,
     Index built_tree_count,
@@ -1854,7 +1854,7 @@ train_result<Task> train_kernel_hist_impl<Float, Bin, Index, Task>::operator()(
     de::check_mul_overflow<std::size_t>((ctx.tree_count_ - 1), skip_num);
 
     pr::engine_collection collection(ctx.tree_count_, desc.get_seed());
-    rng_engine_method_t engine_arr = collection([&](std::size_t i, std::size_t& skip) {
+    rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) {
         skip = i * skip_num;
     });
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index a4c0c1ebed3..bce7df11d0e 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -58,8 +58,8 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<float_t, 1>::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host);
 
         primitives::host_engine eng(2007 + n);
-        primitives::rnguniform_cpu<float_t>(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
-        primitives::rnguniform_cpu<float_t>(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
+        primitives::uniform_cpu<float_t>(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
+        primitives::uniform_cpu<float_t>(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
         for (std::int64_t i = 0; i < n_; ++i) {
             float_t val = 0;
             for (std::int64_t j = 0; j < p_; ++j) {
diff --git a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
index 1f13975e8d6..164a1578490 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
@@ -88,7 +88,7 @@ class dpc_engine {
     }
 
     void skip_ahead_gpu(size_t nSkip) {
-        // Will be fixed in the next oneMKL release.
+        // Will be supported in the next oneMKL release.
         if constexpr (EngineType == engine_method::mt2203) {
         }
         else {
diff --git a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
index 436e032e608..36779186413 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
@@ -16,12 +16,14 @@
 
 #pragma once
 
+#include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
+#include "oneapi/dal/backend/primitives/rng/rng.hpp"
 #include "oneapi/dal/backend/primitives/rng/utils.hpp"
+
 #include <stdexcept>
 #include <type_traits>
 #include <utility>
-#include "oneapi/dal/backend/primitives/rng/rng.hpp"
-#include "oneapi/dal/backend/primitives/rng/rng_types.hpp"
+
 namespace oneapi::dal::backend::primitives {
 
 template <engine_method EngineType = engine_method::mt2203>
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
index b935d338ff9..b55bfa517a7 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -31,6 +31,7 @@ void uniform_cpu(Size count, Type* dst, host_engine<EngineType>& host_engine, Ty
     auto state = host_engine.get_host_engine_state();
     uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
 }
+
 template <typename Type, typename Size, engine_method EngineType>
 void uniform_without_replacement_cpu(Size count,
                                      Type* dst,
@@ -48,8 +49,8 @@ template <typename Type,
           typename T = Type,
           typename = std::enable_if_t<std::is_integral_v<T>>>
 void shuffle_cpu(Size count, Type* dst, host_engine<EngineType> host_engine) {
-    Type idx[2];
     auto state = host_engine.get_host_engine_state();
+    Type idx[2];
     for (Size i = 0; i < count; ++i) {
         uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
         std::swap(dst[idx[0]], dst[idx[1]]);
@@ -64,8 +65,8 @@ void uniform_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a,
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
     auto state = engine_.get_host_engine_state();
-    engine_.skip_ahead_gpu(count);
     uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
+    engine_.skip_ahead_gpu(count);
 }
 
 template <typename Type, typename Size, engine_method EngineType>
@@ -80,8 +81,8 @@ void uniform_without_replacement_cpu(Size count,
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
     void* state = engine_.get_host_engine_state();
-    engine_.skip_ahead_gpu(count);
     uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
+    engine_.skip_ahead_gpu(count);
 }
 
 template <typename Type,
@@ -90,18 +91,17 @@ template <typename Type,
           typename T = Type,
           typename = std::enable_if_t<std::is_integral_v<T>>>
 void shuffle_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
-    Type idx[2];
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
+    Type idx[2];
     void* state = engine_.get_host_engine_state();
-    engine_.skip_ahead_gpu(count);
-
     for (Size i = 0; i < count; ++i) {
         uniform_dispatcher::uniform_by_cpu<Type>(2, idx, state, 0, count);
         std::swap(dst[idx[0]], dst[idx[1]]);
     }
+    engine_.skip_ahead_gpu(count);
 }
 
 template <typename Type, typename Size, engine_method EngineType>
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 6fc39290f96..1c162ca0e5b 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -88,23 +88,22 @@ void shuffle_gpu(sycl::queue& queue,
                                             F b,                             \
                                             const event_vector& deps);
 
-#define INSTANTIATE_FLOAT_(Size)                             \
-    INSTANTIATE_(float, Size, engine_method::mt2203)         \
-    INSTANTIATE_(float, Size, engine_method::mcg59)          \
-    INSTANTIATE_(float, Size, engine_method::mrg32k3a)       \
-    INSTANTIATE_(float, Size, engine_method::philox4x32x10)  \
-    INSTANTIATE_(float, Size, engine_method::mt19937)        \
-    INSTANTIATE_(double, Size, engine_method::mt2203)        \
-    INSTANTIATE_(double, Size, engine_method::mcg59)         \
-    INSTANTIATE_(double, Size, engine_method::mrg32k3a)      \
-    INSTANTIATE_(double, Size, engine_method::philox4x32x10) \
-    INSTANTIATE_(double, Size, engine_method::mt19937)       \
-    INSTANTIATE_(int, Size, engine_method::mt2203)           \
-    INSTANTIATE_(int, Size, engine_method::mcg59)            \
-    INSTANTIATE_(int, Size, engine_method::mrg32k3a)         \
-    INSTANTIATE_(int, Size, engine_method::philox4x32x10)    \
-    INSTANTIATE_(int, Size, engine_method::mt19937)
-
+#define INSTANTIATE_FLOAT_(Size)                                   \
+    INSTANTIATE_(float, Size, engine_method::mt2203)               \
+    INSTANTIATE_(float, Size, engine_method::mcg59)                \
+    INSTANTIATE_(float, Size, engine_method::mrg32k3a)             \
+    INSTANTIATE_(float, Size, engine_method::philox4x32x10)        \
+    INSTANTIATE_(float, Size, engine_method::mt19937)              \
+    INSTANTIATE_(double, Size, engine_method::mt2203)              \
+    INSTANTIATE_(double, Size, engine_method::mcg59)               \
+    INSTANTIATE_(double, Size, engine_method::mrg32k3a)            \
+    INSTANTIATE_(double, Size, engine_method::philox4x32x10)       \
+    INSTANTIATE_(double, Size, engine_method::mt19937)             \
+    INSTANTIATE_(std::int32_t, Size, engine_method::mt2203)        \
+    INSTANTIATE_(std::int32_t, Size, engine_method::mcg59)         \
+    INSTANTIATE_(std::int32_t, Size, engine_method::mrg32k3a)      \
+    INSTANTIATE_(std::int32_t, Size, engine_method::philox4x32x10) \
+    INSTANTIATE_(std::int32_t, Size, engine_method::mt19937)
 INSTANTIATE_FLOAT_(std::int64_t);
 INSTANTIATE_FLOAT_(std::int32_t);
 
@@ -118,22 +117,22 @@ INSTANTIATE_FLOAT_(std::int32_t);
                                                                 F b,                             \
                                                                 const event_vector& deps);
 
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                             \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59)          \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a)       \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10)  \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937)        \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203)        \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a)      \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937)       \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt2203)           \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mcg59)            \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mrg32k3a)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::philox4x32x10)    \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt19937)
+#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                                   \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203)               \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59)                \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a)             \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10)        \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937)              \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203)              \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59)               \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a)            \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10)       \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937)             \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt2203)        \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mcg59)         \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mrg32k3a)      \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::philox4x32x10) \
+    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt19937)
 
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
 INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
@@ -145,12 +144,12 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
                                             dpc_engine<EngineType>& engine_, \
                                             const event_vector& deps);
 
-#define INSTANTIATE_SHUFFLE_FLOAT(Size)                          \
-    INSTANTIATE_SHUFFLE(int, Size, engine_method::mt2203)        \
-    INSTANTIATE_SHUFFLE(int, Size, engine_method::mcg59)         \
-    INSTANTIATE_SHUFFLE(int, Size, engine_method::mrg32k3a)      \
-    INSTANTIATE_SHUFFLE(int, Size, engine_method::philox4x32x10) \
-    INSTANTIATE_SHUFFLE(int, Size, engine_method::mt19937)
+#define INSTANTIATE_SHUFFLE_FLOAT(Size)                                   \
+    INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt2203)        \
+    INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mcg59)         \
+    INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mrg32k3a)      \
+    INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::philox4x32x10) \
+    INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt19937)
 
 INSTANTIATE_SHUFFLE_FLOAT(std::int64_t);
 INSTANTIATE_SHUFFLE_FLOAT(std::int32_t);
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 466ce350e9b..7e07b65f411 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -66,28 +66,26 @@ class rng_test : public te::policy_fixture {
 public:
     using DataType = std::tuple_element_t<0, TestType>;
     using EngineType = std::tuple_element_t<1, TestType>;
-    static constexpr auto engine_qq = engine_v<EngineType>;
+    static constexpr auto engine_test_type = engine_v<EngineType>;
 
     auto get_host_engine(std::int64_t seed) {
-        auto rng_engine = host_engine<engine_qq>(seed);
+        auto rng_engine = host_engine<engine_test_type>(seed);
         return rng_engine;
     }
 
-    auto get_engine(std::int64_t seed) {
-        auto rng_engine = dpc_engine<engine_qq>(this->get_queue(), seed);
+    auto get_dpc_engine(std::int64_t seed) {
+        auto rng_engine = dpc_engine<engine_test_type>(this->get_queue(), seed);
         return rng_engine;
     }
 
     auto allocate_array_host(std::int64_t elem_count) {
         auto arr_host = ndarray<DataType, 1>::empty({ elem_count });
-
         return arr_host;
     }
 
     auto allocate_array_device(std::int64_t elem_count) {
         auto& q = this->get_queue();
         auto arr_gpu = ndarray<DataType, 1>::empty(q, { elem_count }, sycl::usm::alloc::device);
-
         return arr_gpu;
     }
 
@@ -99,15 +97,18 @@ class rng_test : public te::policy_fixture {
         const DataType* val_arr_2_host_ptr = arr_2_host.get_data();
 
         for (std::int64_t el = 0; el < arr_2_host.get_count(); el++) {
-            REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 1);
+            // MKL internally generates floats on GPU and doubles on CPU, so compare with a small tolerance (eps).
+            REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 0.1);
         }
     }
 };
 
-using rng_types = COMBINE_TYPES((float), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10));
+using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10));
 
 TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     SKIP_IF(this->get_policy().is_cpu());
+    using Float = std::tuple_element_t<0, TestType>;
+
     std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000);
     std::int64_t seed = GENERATE_COPY(777, 999);
 
@@ -116,44 +117,22 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
-    auto rng_engine = this->get_engine(seed);
-    auto rng_engine_ = this->get_engine(seed);
+    auto rng_engine = this->get_dpc_engine(seed);
+    auto rng_engine_ = this->get_dpc_engine(seed);
 
-    uniform_cpu<float>(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
-    uniform_gpu<float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
+    uniform_cpu<Float>(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
+    uniform_gpu<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
 
     this->check_results(arr_gpu, arr_host);
 }
 
-using rng_types_skip_ahead_support = COMBINE_TYPES((float),
+using rng_types_skip_ahead_support = COMBINE_TYPES((float, double),
                                                    (mt19937, mcg59, mrg32k3a, philox4x32x10));
 
-// //Just for perf tests
-// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) {
-//     SKIP_IF(this->get_policy().is_cpu());
-//     std::int64_t elem_count = GENERATE_COPY(10000);
-//     std::int64_t seed = GENERATE_COPY(777);
-
-//     auto arr_host = this->allocate_array_host(elem_count);
-//     auto arr_host_ptr_ = arr_host.get_mutable_data();
-
-//     auto arr_host_fake = this->allocate_array_host(1);
-//     auto arr_host_ptr_fake = arr_host_fake.get_mutable_data();
-//     auto rn_gen_ = this->get_rng();
-//     auto rng_engine_1 = this->get_engine(seed);
-
-//     BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) {
-//         rn_gen_.uniform_without_replacement_cpu(elem_count,
-//                                                 arr_host_ptr_,
-//                                                 arr_host_ptr_fake,
-//                                                 rng_engine_1,
-//                                                 0,
-//                                                 elem_count);
-//     };
-// }
-
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
+    using Float = std::tuple_element_t<0, TestType>;
+
     std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000);
     std::int64_t seed = GENERATE_COPY(777, 999);
 
@@ -168,14 +147,14 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
-    auto rng_engine = this->get_engine(seed);
-    auto rng_engine_2 = this->get_engine(seed);
+    auto rng_engine = this->get_dpc_engine(seed);
+    auto rng_engine_2 = this->get_dpc_engine(seed);
 
-    uniform_cpu<float>(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
-    uniform_cpu<float>(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
+    uniform_cpu<Float>(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
+    uniform_cpu<Float>(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
 
-    uniform_gpu<float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
-    uniform_cpu<float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+    uniform_gpu<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    uniform_cpu<Float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
     this->check_results(arr_host_init_1, arr_host_init_2);
     this->check_results(arr_gpu, arr_host);
@@ -183,6 +162,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe
 
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
+    using Float = std::tuple_element_t<0, TestType>;
+
     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
     std::int64_t seed = GENERATE_COPY(1, 777, 999);
 
@@ -197,24 +178,24 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe
     auto arr_gpu_ptr = arr_gpu.get_mutable_data();
     auto arr_host_ptr = arr_host.get_mutable_data();
 
-    auto rng_engine = this->get_engine(seed);
-    auto rng_engine_2 = this->get_engine(seed);
+    auto rng_engine = this->get_dpc_engine(seed);
+    auto rng_engine_2 = this->get_dpc_engine(seed);
 
-    uniform_gpu<float>(this->get_queue(),
+    uniform_gpu<Float>(this->get_queue(),
                        elem_count,
                        arr_device_init_1_ptr,
                        rng_engine,
                        0,
                        elem_count);
-    uniform_gpu<float>(this->get_queue(),
+    uniform_gpu<Float>(this->get_queue(),
                        elem_count,
                        arr_device_init_2_ptr,
                        rng_engine_2,
                        0,
                        elem_count);
 
-    uniform_gpu<float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
-    uniform_cpu<float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+    uniform_gpu<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    uniform_cpu<Float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
     this->check_results(arr_device_init_1, arr_device_init_2);
     this->check_results(arr_gpu, arr_host);
@@ -228,7 +209,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe
 
 //     engine_collection<std::int64_t,engine_method::mcg59> collection(this->get_queue(), 2, seed);
 
-//     auto engine_arr = collection.get_engines();
+//     auto engine_arr = collection.get_dpc_engines();
 
 //     auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count);
 
diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst
index 3d32c4532fb..e931c801890 100644
--- a/docs/source/daal/algorithms/engines/mrg32k3a.rst
+++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst
@@ -1,18 +1,16 @@
-.. ******************************************************************************
-.. * Copyright contributors to the oneDAL project
-.. *
-.. * Licensed under the Apache License, Version 2.0 (the "License");
-.. * you may not use this file except in compliance with the License.
-.. * You may obtain a copy of the License at
-.. *
-.. *     http://www.apache.org/licenses/LICENSE-2.0
-.. *
-.. * Unless required by applicable law or agreed to in writing, software
-.. * distributed under the License is distributed on an "AS IS" BASIS,
-.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-.. * See the License for the specific language governing permissions and
-.. * limitations under the License.
-.. *******************************************************************************/
+.. Copyright contributors to the oneDAL project
+..
+.. Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+..
+..     http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
 
 mrg32k3a
 ========
diff --git a/docs/source/daal/algorithms/engines/philox4x32x10.rst b/docs/source/daal/algorithms/engines/philox4x32x10.rst
index 83e1a4a2c3f..ac50ea80fdb 100644
--- a/docs/source/daal/algorithms/engines/philox4x32x10.rst
+++ b/docs/source/daal/algorithms/engines/philox4x32x10.rst
@@ -1,18 +1,16 @@
-.. ******************************************************************************
-.. * Copyright contributors to the oneDAL project
-.. *
-.. * Licensed under the Apache License, Version 2.0 (the "License");
-.. * you may not use this file except in compliance with the License.
-.. * You may obtain a copy of the License at
-.. *
-.. *     http://www.apache.org/licenses/LICENSE-2.0
-.. *
-.. * Unless required by applicable law or agreed to in writing, software
-.. * distributed under the License is distributed on an "AS IS" BASIS,
-.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-.. * See the License for the specific language governing permissions and
-.. * limitations under the License.
-.. *******************************************************************************/
+.. Copyright contributors to the oneDAL project
+..
+.. Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+..
+..     http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
 
 philox4x32x10
 =============

From 06d9f821da1ebc2f2f69399e0c80fc8fa1fc45e5 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Tue, 17 Dec 2024 06:37:48 -0800
Subject: [PATCH 15/18] add comments and minor renaming

---
 .../vertex_partitioning_default_kernel.hpp    |  10 +-
 .../gpu/train_kernel_hist_impl_dpc.cpp        |  28 ++--
 .../vertex_partitioning_default_kernel.hpp    |   2 +-
 .../objective_function/test/fixture.hpp       |   2 +-
 .../objective_function/test/spmd_fixture.hpp  |   2 +-
 .../optimizers/test/cg_solver_dpc.cpp         |   2 +-
 .../primitives/optimizers/test/fixture.hpp    |   4 +-
 .../optimizers/test/newton_cg_dpc.cpp         |   8 +-
 .../dal/backend/primitives/rng/dpc_engine.hpp |  15 ++
 .../backend/primitives/rng/host_engine.hpp    |  11 ++
 cpp/oneapi/dal/backend/primitives/rng/rng.hpp |  72 ++++----
 .../dal/backend/primitives/rng/rng_dpc.cpp    | 157 +++++++++++-------
 .../backend/primitives/rng/test/rng_dpc.cpp   |  37 ++---
 13 files changed, 200 insertions(+), 150 deletions(-)

diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
index 439fa4665d5..bdcc3f1487a 100644
--- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -91,11 +91,11 @@ std::int32_t most_frequent_element(const std::atomic<std::int32_t> *components,
     std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count);
 
     dal::backend::primitives::host_engine eng;
-    dal::backend::primitives::uniform_cpu<std::int32_t>(samples_count,
-                                                        rnd_vertex_ids,
-                                                        eng,
-                                                        0,
-                                                        vertex_count);
+    dal::backend::primitives::uniform<std::int32_t>(samples_count,
+                                                    rnd_vertex_ids,
+                                                    eng,
+                                                    0,
+                                                    vertex_count);
 
     std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count);
 
diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
index 0e6d4bc3a36..21a9cc440d0 100644
--- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp
@@ -398,11 +398,11 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::gen_initial_tree_or
         for (Index node_idx = 0; node_idx < node_count; ++node_idx) {
             Index* gen_row_idx_global_ptr =
                 selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx;
-            pr::uniform_cpu<Index>(ctx.selected_row_total_count_,
-                                   gen_row_idx_global_ptr,
-                                   rng_engine_list[engine_offset + node_idx],
-                                   0,
-                                   ctx.row_total_count_);
+            pr::uniform<Index>(ctx.selected_row_total_count_,
+                               gen_row_idx_global_ptr,
+                               rng_engine_list[engine_offset + node_idx],
+                               0,
+                               ctx.row_total_count_);
 
             if (ctx.distr_mode_) {
                 Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_;
@@ -485,7 +485,7 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_feature_list(
     auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data();
     if (ctx.selected_ftr_count_ != ctx.column_count_) {
         for (Index node = 0; node < node_count; ++node) {
-            pr::uniform_without_replacement_cpu<Index>(
+            pr::uniform_without_replacement<Index>(
                 ctx.selected_ftr_count_,
                 selected_features_host_ptr + node * ctx.selected_ftr_count_,
                 selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_,
@@ -534,11 +534,11 @@ train_kernel_hist_impl<Float, Bin, Index, Task>::gen_random_thresholds(
 
     // Generate random bins for selected features
     for (Index node = 0; node < node_count; ++node) {
-        pr::uniform_cpu<Float>(ctx.selected_ftr_count_,
-                               random_bins_host_ptr + node * ctx.selected_ftr_count_,
-                               rng_engine_list[tree_map_ptr[node]],
-                               0.0f,
-                               1.0f);
+        pr::uniform<Float>(ctx.selected_ftr_count_,
+                           random_bins_host_ptr + node * ctx.selected_ftr_count_,
+                           rng_engine_list[tree_map_ptr[node]],
+                           0.0f,
+                           1.0f);
     }
     auto event_rnd_generate =
         random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count());
@@ -1658,9 +1658,9 @@ sycl::event train_kernel_hist_impl<Float, Bin, Index, Task>::compute_results(
             const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1);
 
             for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) {
-                pr::shuffle_cpu<Index>(oob_row_count,
-                                       permutation_ptr,
-                                       engine_arr[built_tree_count + tree_idx_in_block]);
+                pr::shuffle<Index>(oob_row_count,
+                                   permutation_ptr,
+                                   engine_arr[built_tree_count + tree_idx_in_block]);
                 const Float oob_err_perm = compute_oob_error_perm(ctx,
                                                                   model_manager,
                                                                   data_host,
diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
index 70ceb84ac6e..e287c3f2f66 100644
--- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp
@@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology<IndexType>& t,
         ld.random_order[index] = index;
     }
     // random shuffle
-    uniform_cpu<std::int32_t>(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
+    uniform<std::int32_t>(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count);
     for (std::int64_t index = 0; index < t._vertex_count; ++index) {
         std::swap(ld.random_order[index], ld.random_order[ld.index[index]]);
     }
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index 6a1247a67c4..6cf2b73ccd6 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -577,7 +577,7 @@ class logloss_test : public te::float_algo_fixture<std::tuple_element_t<0, Param
 
         for (std::int32_t ij = 0; ij < num_checks; ++ij) {
             primitives::host_engine eng(2007 + dim * num_checks + ij);
-            pr::uniform_cpu<float_t>(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
+            pr::uniform<float_t>(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
             auto vec_gpu = vec_host.to_device(this->get_queue());
             auto out_vector =
                 ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
index cf3a2426dd6..63ab0a07c13 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp
@@ -105,7 +105,7 @@ class logloss_spmd_test : public logloss_test<Param> {
             host_engine eng(2007 + dim * num_checks + ij);
             vecs_host[ij] =
                 (ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::host));
-            uniform_cpu<float_t>(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0);
+            uniform<float_t>(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0);
             vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue());
         }
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
index 27af73de1e9..b529836f70e 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp
@@ -44,7 +44,7 @@ class cg_solver_test : public te::float_algo_fixture<Param> {
         b_host_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         primitives::host_engine eng(4014 + n_);
-        primitives::uniform_cpu<float_t>(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
+        primitives::uniform<float_t>(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host_);
 
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
index e941c971302..c188c50983c 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp
@@ -135,8 +135,8 @@ void create_stable_matrix(sycl::queue& queue,
     auto eigen_values = ndarray<Float, 1>::empty(queue, { n }, sycl::usm::alloc::host);
     primitives::host_engine eng(2007 + n);
 
-    primitives::uniform_cpu<Float>(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
-    primitives::uniform_cpu<Float>(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
+    primitives::uniform<Float>(n * n, J.get_mutable_data(), eng, -1.0, 1.0);
+    primitives::uniform<Float>(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig);
 
     // orthogonalize matrix J
     gram_schmidt(J);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index bce7df11d0e..b2ebe9f5bdb 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -58,8 +58,8 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<float_t, 1>::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host);
 
         primitives::host_engine eng(2007 + n);
-        primitives::uniform_cpu<float_t>(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
-        primitives::uniform_cpu<float_t>(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
+        primitives::uniform<float_t>(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0);
+        primitives::uniform<float_t>(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0);
         for (std::int64_t i = 0; i < n_; ++i) {
             float_t val = 0;
             for (std::int64_t j = 0; j < p_; ++j) {
@@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto b_host = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         primitives::engine eng(4014 + n_);
-        uniform_cpu<float_t>(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
+        uniform<float_t>(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0));
 
@@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         auto buffer = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
         for (std::int32_t test_num = 0; test_num < 5; ++test_num) {
-            uniform_cpu<float_t>(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
+            uniform<float_t>(n_, x_host.get_mutable_data(), eng, -1.0, 1.0);
             auto x_gpu = x_host.to_device(this->get_queue());
             auto compute_event_vec = func_->update_x(x_gpu, true, {});
             wait_or_pass(compute_event_vec).wait_and_throw();
diff --git a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
index 164a1578490..9b9745f4cfa 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp
@@ -53,6 +53,21 @@ struct dpc_engine_type<engine_method::philox4x32x10> {
     using type = oneapi::mkl::rng::philox4x32x10;
 };
 
+/// A class that provides a unified interface for random number generation on both CPU and GPU devices.
+///
+/// This class serves as a wrapper for random number generators (RNGs) that supports different engine types,
+/// enabling efficient random number generation on heterogeneous platforms using SYCL. It integrates a host
+/// (CPU) engine and a device (GPU) engine, allowing operations to be executed seamlessly on the appropriate
+/// device.
+///
+/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`.
+///
+/// @param[in] queue The SYCL queue used to manage device operations.
+/// @param[in] seed  The initial seed for the random number generator. Defaults to `777`.
+///
+/// The class provides functionality to skip ahead in the RNG sequence, retrieve engine states, and
+/// manage host and device engines independently. Support for `skip_ahead` on GPU is currently limited for
+/// some engine types.
 template <engine_method EngineType = engine_method::mt2203>
 class dpc_engine {
 public:
diff --git a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
index 36779186413..c4b2c807674 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp
@@ -26,6 +26,17 @@
 
 namespace oneapi::dal::backend::primitives {
 
+/// A class that provides an interface for random number generation on the host (CPU) only.
+///
+/// This class serves as a wrapper for host-based random number generators (RNGs), supporting multiple engine
+/// types for flexible and efficient random number generation on CPU. It abstracts the underlying engine
+/// implementation and provides an interface to manage and retrieve the engine's state.
+///
+/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`.
+///
+/// @param[in] seed  The initial seed for the random number generator. Defaults to `777`.
+///
+/// @note The class only supports host-based RNG and does not require a SYCL queue or device context.
 template <engine_method EngineType = engine_method::mt2203>
 class host_engine {
 public:
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
index b55bfa517a7..83125ba73e7 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp
@@ -27,18 +27,18 @@
 namespace oneapi::dal::backend::primitives {
 
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_cpu(Size count, Type* dst, host_engine<EngineType>& host_engine, Type a, Type b) {
+void uniform(Size count, Type* dst, host_engine<EngineType>& host_engine, Type a, Type b) {
     auto state = host_engine.get_host_engine_state();
     uniform_dispatcher::uniform_by_cpu<Type>(count, dst, state, a, b);
 }
 
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_without_replacement_cpu(Size count,
-                                     Type* dst,
-                                     Type* buffer,
-                                     host_engine<EngineType> host_engine,
-                                     Type a,
-                                     Type b) {
+void uniform_without_replacement(Size count,
+                                 Type* dst,
+                                 Type* buffer,
+                                 host_engine<EngineType> host_engine,
+                                 Type a,
+                                 Type b) {
     auto state = host_engine.get_host_engine_state();
     uniform_dispatcher::uniform_without_replacement_by_cpu<Type>(count, dst, buffer, state, a, b);
 }
@@ -48,7 +48,7 @@ template <typename Type,
           engine_method EngineType,
           typename T = Type,
           typename = std::enable_if_t<std::is_integral_v<T>>>
-void shuffle_cpu(Size count, Type* dst, host_engine<EngineType> host_engine) {
+void shuffle(Size count, Type* dst, host_engine<EngineType> host_engine) {
     auto state = host_engine.get_host_engine_state();
     Type idx[2];
     for (Size i = 0; i < count; ++i) {
@@ -59,7 +59,7 @@ void shuffle_cpu(Size count, Type* dst, host_engine<EngineType> host_engine) {
 
 #ifdef ONEDAL_DATA_PARALLEL
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a, Type b) {
+void uniform(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a, Type b) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
@@ -70,12 +70,12 @@ void uniform_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_, Type a,
 }
 
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_without_replacement_cpu(Size count,
-                                     Type* dst,
-                                     Type* buffer,
-                                     dpc_engine<EngineType>& engine_,
-                                     Type a,
-                                     Type b) {
+void uniform_without_replacement(Size count,
+                                 Type* dst,
+                                 Type* buffer,
+                                 dpc_engine<EngineType>& engine_,
+                                 Type a,
+                                 Type b) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
@@ -90,7 +90,7 @@ template <typename Type,
           engine_method EngineType,
           typename T = Type,
           typename = std::enable_if_t<std::is_integral_v<T>>>
-void shuffle_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
+void shuffle(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
@@ -105,30 +105,30 @@ void shuffle_cpu(Size count, Type* dst, dpc_engine<EngineType>& engine_) {
 }
 
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_gpu(sycl::queue& queue,
-                 Size count,
-                 Type* dst,
-                 dpc_engine<EngineType>& engine_,
-                 Type a,
-                 Type b,
-                 const event_vector& deps = {});
+void uniform(sycl::queue& queue,
+             Size count,
+             Type* dst,
+             dpc_engine<EngineType>& engine_,
+             Type a,
+             Type b,
+             const event_vector& deps = {});
 
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_without_replacement_gpu(sycl::queue& queue,
-                                     Size count,
-                                     Type* dst,
-                                     Type* buffer,
-                                     dpc_engine<EngineType>& engine_,
-                                     Type a,
-                                     Type b,
-                                     const event_vector& deps = {});
+void uniform_without_replacement(sycl::queue& queue,
+                                 Size count,
+                                 Type* dst,
+                                 Type* buffer,
+                                 dpc_engine<EngineType>& engine_,
+                                 Type a,
+                                 Type b,
+                                 const event_vector& deps = {});
 
 template <typename Type, typename Size, engine_method EngineType>
-void shuffle_gpu(sycl::queue& queue,
-                 Size count,
-                 Type* dst,
-                 dpc_engine<EngineType>& engine_,
-                 const event_vector& deps = {});
+void shuffle(sycl::queue& queue,
+             Size count,
+             Type* dst,
+             dpc_engine<EngineType>& engine_,
+             const event_vector& deps = {});
 #endif
 
 }; // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
index 1c162ca0e5b..4ad09c4cc99 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp
@@ -23,13 +23,13 @@ namespace oneapi::dal::backend::primitives {
 namespace bk = oneapi::dal::backend;
 
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_gpu(sycl::queue& queue,
-                 Size count,
-                 Type* dst,
-                 dpc_engine<EngineType>& engine_,
-                 Type a,
-                 Type b,
-                 const event_vector& deps) {
+void uniform(sycl::queue& queue,
+             Size count,
+             Type* dst,
+             dpc_engine<EngineType>& engine_,
+             Type a,
+             Type b,
+             const event_vector& deps) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
     }
@@ -41,14 +41,14 @@ void uniform_gpu(sycl::queue& queue,
 
 //Currently only CPU impl
 template <typename Type, typename Size, engine_method EngineType>
-void uniform_without_replacement_gpu(sycl::queue& queue,
-                                     Size count,
-                                     Type* dst,
-                                     Type* buffer,
-                                     dpc_engine<EngineType>& engine_,
-                                     Type a,
-                                     Type b,
-                                     const event_vector& deps) {
+void uniform_without_replacement(sycl::queue& queue,
+                                 Size count,
+                                 Type* dst,
+                                 Type* buffer,
+                                 dpc_engine<EngineType>& engine_,
+                                 Type a,
+                                 Type b,
+                                 const event_vector& deps) {
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
         throw domain_error(dal::detail::error_messages::unsupported_data_type());
@@ -60,11 +60,11 @@ void uniform_without_replacement_gpu(sycl::queue& queue,
 
 //Currently only CPU impl
 template <typename Type, typename Size, engine_method EngineType>
-void shuffle_gpu(sycl::queue& queue,
-                 Size count,
-                 Type* dst,
-                 dpc_engine<EngineType>& engine_,
-                 const event_vector& deps) {
+void shuffle(sycl::queue& queue,
+             Size count,
+             Type* dst,
+             dpc_engine<EngineType>& engine_,
+             const event_vector& deps) {
     Type idx[2];
     if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) ==
         sycl::usm::alloc::device) {
@@ -79,14 +79,14 @@ void shuffle_gpu(sycl::queue& queue,
     }
 }
 
-#define INSTANTIATE_(F, Size, EngineType)                                    \
-    template ONEDAL_EXPORT void uniform_gpu(sycl::queue& queue,              \
-                                            Size count_,                     \
-                                            F* dst,                          \
-                                            dpc_engine<EngineType>& engine_, \
-                                            F a,                             \
-                                            F b,                             \
-                                            const event_vector& deps);
+#define INSTANTIATE_(F, Size, EngineType)                                \
+    template ONEDAL_EXPORT void uniform(sycl::queue& queue,              \
+                                        Size count_,                     \
+                                        F* dst,                          \
+                                        dpc_engine<EngineType>& engine_, \
+                                        F a,                             \
+                                        F b,                             \
+                                        const event_vector& deps);
 
 #define INSTANTIATE_FLOAT_(Size)                                   \
     INSTANTIATE_(float, Size, engine_method::mt2203)               \
@@ -107,42 +107,71 @@ void shuffle_gpu(sycl::queue& queue,
 INSTANTIATE_FLOAT_(std::int64_t);
 INSTANTIATE_FLOAT_(std::int32_t);
 
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType)                         \
-    template ONEDAL_EXPORT void uniform_without_replacement_gpu(sycl::queue& queue,              \
-                                                                Size count_,                     \
-                                                                F* dst,                          \
-                                                                F* buff,                         \
-                                                                dpc_engine<EngineType>& engine_, \
-                                                                F a,                             \
-                                                                F b,                             \
-                                                                const event_vector& deps);
-
-#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size)                                   \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203)               \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59)                \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a)             \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10)        \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937)              \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203)              \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59)               \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a)            \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10)       \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937)             \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt2203)        \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mcg59)         \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mrg32k3a)      \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::philox4x32x10) \
-    INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt19937)
-
-INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t);
-INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t);
-
-#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                             \
-    template ONEDAL_EXPORT void shuffle_gpu(sycl::queue& queue,              \
-                                            Size count_,                     \
-                                            F* dst,                          \
-                                            dpc_engine<EngineType>& engine_, \
-                                            const event_vector& deps);
+#define INSTANTIATE_uniform_without_replacement(F, Size, EngineType)                         \
+    template ONEDAL_EXPORT void uniform_without_replacement(sycl::queue& queue,              \
+                                                            Size count_,                     \
+                                                            F* dst,                          \
+                                                            F* buff,                         \
+                                                            dpc_engine<EngineType>& engine_, \
+                                                            F a,                             \
+                                                            F b,                             \
+                                                            const event_vector& deps);
+
+#define INSTANTIATE_uniform_without_replacement_FLOAT(Size)                                        \
+    INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt2203)                    \
+        INSTANTIATE_uniform_without_replacement(                                                   \
+            float,                                                                                 \
+            Size,                                                                                  \
+            engine_method::mcg59) INSTANTIATE_uniform_without_replacement(float,                   \
+                                                                          Size,                    \
+                                                                          engine_method::mrg32k3a) \
+            INSTANTIATE_uniform_without_replacement(float, Size, engine_method::philox4x32x10)     \
+                INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt19937)       \
+                    INSTANTIATE_uniform_without_replacement(double, Size, engine_method::mt2203)   \
+                        INSTANTIATE_uniform_without_replacement(double,                            \
+                                                                Size,                              \
+                                                                engine_method::mcg59)              \
+                            INSTANTIATE_uniform_without_replacement(double,                        \
+                                                                    Size,                          \
+                                                                    engine_method::mrg32k3a)       \
+                                INSTANTIATE_uniform_without_replacement(                           \
+                                    double,                                                        \
+                                    Size,                                                          \
+                                    engine_method::philox4x32x10)                                  \
+                                    INSTANTIATE_uniform_without_replacement(                       \
+                                        double,                                                    \
+                                        Size,                                                      \
+                                        engine_method::mt19937)                                    \
+                                        INSTANTIATE_uniform_without_replacement(                   \
+                                            std::int32_t,                                          \
+                                            Size,                                                  \
+                                            engine_method::mt2203)                                 \
+                                            INSTANTIATE_uniform_without_replacement(               \
+                                                std::int32_t,                                      \
+                                                Size,                                              \
+                                                engine_method::mcg59)                              \
+                                                INSTANTIATE_uniform_without_replacement(           \
+                                                    std::int32_t,                                  \
+                                                    Size,                                          \
+                                                    engine_method::mrg32k3a)                       \
+                                                    INSTANTIATE_uniform_without_replacement(       \
+                                                        std::int32_t,                              \
+                                                        Size,                                      \
+                                                        engine_method::philox4x32x10)              \
+                                                        INSTANTIATE_uniform_without_replacement(   \
+                                                            std::int32_t,                          \
+                                                            Size,                                  \
+                                                            engine_method::mt19937)
+
+INSTANTIATE_uniform_without_replacement_FLOAT(std::int64_t);
+INSTANTIATE_uniform_without_replacement_FLOAT(std::int32_t);
+
+#define INSTANTIATE_SHUFFLE(F, Size, EngineType)                         \
+    template ONEDAL_EXPORT void shuffle(sycl::queue& queue,              \
+                                        Size count_,                     \
+                                        F* dst,                          \
+                                        dpc_engine<EngineType>& engine_, \
+                                        const event_vector& deps);
 
 #define INSTANTIATE_SHUFFLE_FLOAT(Size)                                   \
     INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt2203)        \
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 7e07b65f411..2a079f15466 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -120,8 +120,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     auto rng_engine = this->get_dpc_engine(seed);
     auto rng_engine_ = this->get_dpc_engine(seed);
 
-    uniform_cpu<Float>(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
-    uniform_gpu<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
+    uniform<Float>(elem_count, arr_host_ptr, rng_engine, 0, elem_count);
+    uniform<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count);
 
     this->check_results(arr_gpu, arr_host);
 }
@@ -150,11 +150,11 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe
     auto rng_engine = this->get_dpc_engine(seed);
     auto rng_engine_2 = this->get_dpc_engine(seed);
 
-    uniform_cpu<Float>(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
-    uniform_cpu<Float>(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
+    uniform<Float>(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count);
+    uniform<Float>(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count);
 
-    uniform_gpu<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
-    uniform_cpu<Float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+    uniform<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    uniform<Float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
     this->check_results(arr_host_init_1, arr_host_init_2);
     this->check_results(arr_gpu, arr_host);
@@ -181,21 +181,16 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe
     auto rng_engine = this->get_dpc_engine(seed);
     auto rng_engine_2 = this->get_dpc_engine(seed);
 
-    uniform_gpu<Float>(this->get_queue(),
-                       elem_count,
-                       arr_device_init_1_ptr,
-                       rng_engine,
-                       0,
-                       elem_count);
-    uniform_gpu<Float>(this->get_queue(),
-                       elem_count,
-                       arr_device_init_2_ptr,
-                       rng_engine_2,
-                       0,
-                       elem_count);
-
-    uniform_gpu<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
-    uniform_cpu<Float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
+    uniform<Float>(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count);
+    uniform<Float>(this->get_queue(),
+                   elem_count,
+                   arr_device_init_2_ptr,
+                   rng_engine_2,
+                   0,
+                   elem_count);
+
+    uniform<Float>(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count);
+    uniform<Float>(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count);
 
     this->check_results(arr_device_init_1, arr_device_init_2);
     this->check_results(arr_gpu, arr_host);

From 42e00208e8d3113a39176ace7b1e9ae8db2f0933 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Fri, 20 Dec 2024 01:58:17 -0800
Subject: [PATCH 16/18] minor fix

---
 .../dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index b2ebe9f5bdb..b6151ff180a 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -143,7 +143,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
         solution_ = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
         auto b_host = ndarray<float_t, 1>::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host);
 
-        primitives::engine eng(4014 + n_);
+        primitives::host_engine eng(4014 + n_);
         uniform<float_t>(n_, solution_.get_mutable_data(), eng, -1.0, 1.0);
 
         create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0));

From 85d9f02321a94c6b893506532dd945e46185cec0 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Fri, 20 Dec 2024 03:38:26 -0800
Subject: [PATCH 17/18] fix

---
 .../dal/backend/primitives/objective_function/test/fixture.hpp  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index 6cf2b73ccd6..31870cb645f 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -577,7 +577,7 @@ class logloss_test : public te::float_algo_fixture<std::tuple_element_t<0, Param
 
         for (std::int32_t ij = 0; ij < num_checks; ++ij) {
             primitives::host_engine eng(2007 + dim * num_checks + ij);
-            pr::uniform<float_t>(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
+            primitives::uniform<float_t>(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0);
             auto vec_gpu = vec_host.to_device(this->get_queue());
             auto out_vector =
                 ndarray<float_t, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);

From 7393a2648d59f62b7e6454388349fbe2edb26bc8 Mon Sep 17 00:00:00 2001
From: Alexandr-Solovev <aleksandr.solovev@intel.com>
Date: Fri, 20 Dec 2024 06:49:37 -0800
Subject: [PATCH 18/18] minor fix

---
 cpp/daal/include/daal_win.h                   |  6 +--
 .../backend/primitives/rng/test/rng_dpc.cpp   | 41 ++-----------------
 2 files changed, 7 insertions(+), 40 deletions(-)

diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h
index a15ed7db26e..6e86076e275 100755
--- a/cpp/daal/include/daal_win.h
+++ b/cpp/daal/include/daal_win.h
@@ -309,13 +309,13 @@
 #include "algorithms/distributions/bernoulli/bernoulli.h"
 #include "algorithms/distributions/bernoulli/bernoulli_types.h"
 #include "algorithms/engines/engine.h"
-#include "algorithms/engines/engine_family.h"
-#include "algorithms/engines/mt2203/mt2203.h"
-#include "algorithms/engines/mt2203/mt2203_types.h"
 #include "algorithms/engines/mt19937/mt19937.h"
 #include "algorithms/engines/mt19937/mt19937_types.h"
 #include "algorithms/engines/mcg59/mcg59.h"
 #include "algorithms/engines/mcg59/mcg59_types.h"
+#include "algorithms/engines/engine_family.h"
+#include "algorithms/engines/mt2203/mt2203.h"
+#include "algorithms/engines/mt2203/mt2203_types.h"
 #include "algorithms/engines/mrg32k3a/mrg32k3a.h"
 #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h"
 #include "algorithms/engines/philox4x32x10/philox4x32x10.h"
diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
index 2a079f15466..38c09902046 100644
--- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp
@@ -107,6 +107,7 @@ using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k
 
 TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) {
     SKIP_IF(this->get_policy().is_cpu());
+    SKIP_IF(this->not_float64_friendly());
     using Float = std::tuple_element_t<0, TestType>;
 
     std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000);
@@ -131,6 +132,7 @@ using rng_types_skip_ahead_support = COMBINE_TYPES((float, double),
 
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
+    SKIP_IF(this->not_float64_friendly());
     using Float = std::tuple_element_t<0, TestType>;
 
     std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000);
@@ -162,6 +164,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe
 
 TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) {
     SKIP_IF(this->get_policy().is_cpu());
+    SKIP_IF(this->not_float64_friendly());
     using Float = std::tuple_element_t<0, TestType>;
 
     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
@@ -196,42 +199,6 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe
     this->check_results(arr_gpu, arr_host);
 }
 
-//TODO: add engine collection test + host_engine tests
-// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) {
-//     SKIP_IF(this->get_policy().is_cpu());
-//     std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000);
-//     std::int64_t seed = GENERATE_COPY(1, 777, 999);
-
-//     engine_collection<std::int64_t,engine_method::mcg59> collection(this->get_queue(), 2, seed);
-
-//     auto engine_arr = collection.get_dpc_engines();
-
-//     auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count);
-
-//     auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data();
-//     auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data();
-
-//     auto rn_gen = this->get_rng();
-
-//     rn_gen.uniform(this->get_queue(),
-//                    elem_count,
-//                    arr_device_init_1_ptr,
-//                    engine_arr[0],
-//                    0,
-//                    elem_count);
-
-//     rn_gen.uniform(this->get_queue(),
-//                    elem_count,
-//                    arr_device_init_2_ptr,
-//                    engine_arr[1],
-//                    0,
-//                    elem_count);
-
-//     // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, engine_arr[0], 0, elem_count);
-//     // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count);
-
-//     //this->check_results_device(arr_device_init_1, arr_device_init_2);
-//     this->check_results(arr_device_init_1, arr_device_init_2);
-// }
+// TODO: add an engine_collection test and separate host_engine tests
 
 } // namespace oneapi::dal::backend::primitives::test