From 294b2f2749f2311817250801b1c945c6f1fbd5df Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 5 Nov 2024 00:31:13 -0800 Subject: [PATCH 01/18] fixes --- .../vertex_partitioning_default_kernel.hpp | 8 +- .../algo/louvain/backend/cpu/louvain_data.hpp | 6 +- .../vertex_partitioning_default_kernel.hpp | 2 +- .../objective_function/test/fixture.hpp | 8 +- .../objective_function/test/spmd_fixture.hpp | 6 +- .../optimizers/test/cg_solver_dpc.cpp | 8 +- .../primitives/optimizers/test/fixture.hpp | 10 +- .../optimizers/test/newton_cg_dpc.cpp | 14 +- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 25 ++ .../dal/backend/primitives/rng/rng_cpu.hpp | 105 ++++++ .../dal/backend/primitives/rng/rng_dpc.cpp | 172 ++++++++++ .../primitives/rng/rng_engine_collection.hpp | 77 ++--- .../dal/backend/primitives/rng/rng_gpu.hpp | 220 +++++++++++++ .../backend/primitives/rng/test/rng_dpc.cpp | 300 ++++++++++++++++++ 14 files changed, 873 insertions(+), 88 deletions(-) create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng.hpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 4da1866e277..218f7da46bc 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -24,7 +24,7 @@ #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/backend/interop/common.hpp" #include "oneapi/dal/table/homogen.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include 
"oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/detail/threading.hpp" namespace oneapi::dal::preview::connected_components::backend { @@ -90,9 +90,9 @@ std::int32_t most_frequent_element(const std::atomic *components, const std::int64_t &samples_count = 1024) { std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); - dal::backend::primitives::engine eng; - dal::backend::primitives::rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_state(), 0, vertex_count); + dal::backend::primitives::daal_engine eng; + dal::backend::primitives::daal_rng rn_gen; + rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_cpu_engine_state(), 0, vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index d21de8c9627..b016a5bf6e9 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -17,7 +17,7 @@ #pragma once #include "oneapi/dal/backend/memory.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::preview::louvain::backend { using namespace oneapi::dal::preview::detail; @@ -123,8 +123,8 @@ struct louvain_data { // Total link weight in the network value_type m; - engine eng; - rng rn_gen; + daal_engine eng; + daal_rng rn_gen; const std::int64_t vertex_count; const std::int64_t edge_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index 79e294e9f47..7b277d88283 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float 
move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_state(), 0, t._vertex_count); + ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_cpu_engine_state(), 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index fabe919b34e..d673470b042 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -25,7 +25,7 @@ #include "oneapi/dal/table/csr_accessor.hpp" #include "oneapi/dal/detail/debug.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::backend::primitives::test { @@ -572,13 +572,13 @@ class logloss_test : public te::float_algo_fixture rn_gen; + primitives::daal_rng rn_gen; auto vec_host = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { - primitives::engine eng(2007 + dim * num_checks + ij); - rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::daal_engine eng(2007 + dim * num_checks + ij); + rn_gen.uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index e902dd452e1..e2a611c2c98 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ 
b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test { std::int64_t num_checks = 5; std::vector> vecs_host(num_checks), vecs_gpu(num_checks); - rng rn_gen; + daal_rng rn_gen; for (std::int64_t ij = 0; ij < num_checks; ++ij) { - engine eng(2007 + dim * num_checks + ij); + daal_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); - rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng.get_state(), -1.0, 1.0); + rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue()); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index ea320f690a2..36e20f03c11 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -20,7 +20,7 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include namespace oneapi::dal::backend::primitives::test { @@ -43,9 +43,9 @@ class cg_solver_test : public te::float_algo_fixture { x_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, x_host_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::daal_rng rn_gen; + primitives::daal_engine eng(4014 + n_); + rn_gen.uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git 
a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index a6b87b2dcc1..777c0ee68e2 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -21,7 +21,7 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/blas/gemv.hpp" #include "oneapi/dal/backend/primitives/element_wise.hpp" @@ -133,11 +133,11 @@ void create_stable_matrix(sycl::queue& queue, ONEDAL_ASSERT(A.get_dimension(1) == n); auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); + primitives::daal_rng rn_gen; + primitives::daal_engine eng(2007 + n); - rn_gen.uniform(n * n, J.get_mutable_data(), eng.get_state(), -1.0, 1.0); - rn_gen.uniform(n, eigen_values.get_mutable_data(), eng.get_state(), bottom_eig, top_eig); + rn_gen.uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index f473dddf1f7..d4f5ea55fb9 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -22,7 +22,7 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include 
"oneapi/dal/backend/primitives/rng/rng.hpp" #include #include "oneapi/dal/backend/primitives/objective_function.hpp" @@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host); auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); - rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng.get_state(), -10.0, 10.0); - rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng.get_state(), -5.0, 5.0); + primitives::daal_rng rn_gen; + primitives::daal_engine eng(2007 + n); + rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture { auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::rng rn_gen; primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp 
b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp new file mode 100644 index 00000000000..a89ca3d4505 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -0,0 +1,25 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp" + +#ifdef ONEDAL_DATA_PARALLEL + +#include "oneapi/dal/backend/primitives/rng/rng_gpu.hpp" + +#endif diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp new file mode 100644 index 00000000000..a692070551e --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -0,0 +1,105 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include +#include +#include +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +namespace oneapi::dal::backend::primitives { + +enum class engine_list_cpu { mt2203, mcg59, mt19937 }; + +template +class daal_engine { +public: + explicit daal_engine(std::int64_t seed = 777) + : daal_engine_(initialize_daal_engine(seed)), + impl_(dynamic_cast( + daal_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + virtual ~daal_engine() = default; + + void* get_cpu_engine_state() const { + return impl_->getState(); + } + + auto& get_cpu_engine() { + return daal_engine_; + } + +private: + daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { + switch (EngineType) { + case engine_list_cpu::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_list_cpu::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list_cpu::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + daal::algorithms::engines::EnginePtr daal_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +template +class daal_rng { +public: + daal_rng() = default; + ~daal_rng() = default; + + void uniform(Size count, Type* dst, void* state, Type a, Type b) { + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + } + + void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + void* state, + Type a, + Type b) { + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); + } + + template >> + void 
shuffle(Size count, Type* dst, void* state) { + Type idx[2]; + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } +}; + +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp new file mode 100644 index 00000000000..79b5418d9d8 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -0,0 +1,172 @@ +/******************************************************************************* +* Copyright 2022 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" + +namespace oneapi::dal::backend::primitives { + +namespace bk = oneapi::dal::backend; + +template +template +void oneapi_rng::uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + oneapi::mkl::rng::uniform distr(a, b); + auto event = oneapi::mkl::rng::generate(distr, engine_.get_gpu_engine(), count, dst, { deps }); + event.wait_and_throw(); + engine_.skip_ahead_cpu(count); +} + +template +template +void oneapi_rng::uniform_cpu(Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +} + +template +template +void oneapi_rng::uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +template +template +void oneapi_rng::shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + const event_vector& deps) { + Type idx[2]; + + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } +} + +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::uniform_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + oneapi_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_FLOAT_(Size) \ + 
INSTANTIATE_(float, Size, engine_list::mt2203) \ + INSTANTIATE_(float, Size, engine_list::mcg59) \ + INSTANTIATE_(float, Size, engine_list::mt19937) \ + INSTANTIATE_(double, Size, engine_list::mt2203) \ + INSTANTIATE_(double, Size, engine_list::mcg59) \ + INSTANTIATE_(double, Size, engine_list::mt19937) \ + INSTANTIATE_(int, Size, engine_list::mt2203) \ + INSTANTIATE_(int, Size, engine_list::mcg59) \ + INSTANTIATE_(int, Size, engine_list::mt19937) + +INSTANTIATE_FLOAT_(std::int64_t); +INSTANTIATE_FLOAT_(std::int32_t); + +#define INSTANTIATE_CPU(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::uniform_cpu( \ + Size count_, \ + F* dst, \ + oneapi_engine& engine_, \ + F a, \ + F b); + +#define INSTANTIATE_FLOAT_CPU(Size) \ + INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ + INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ + INSTANTIATE_CPU(int, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(int, Size, engine_list::mt19937) + +INSTANTIATE_FLOAT_CPU(std::int64_t); +INSTANTIATE_FLOAT_CPU(std::int32_t); + +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::uniform_without_replacement_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + oneapi_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ 
+ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) + +INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); +INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); + +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::shuffle_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + oneapi_engine& engine_, \ + const event_vector& deps); + +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) + +INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); +INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); + +} // namespace oneapi::dal::backend::primitives \ No newline at end of file diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 09a5a589141..81ce6bf852b 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -16,78 +16,41 @@ #pragma once -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" #include +#include +#include +#include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/table/common.hpp" + namespace oneapi::dal::backend::primitives { -template +#ifdef ONEDAL_DATA_PARALLEL + +template class engine_collection { public: - explicit engine_collection(Size count, std::int64_t seed = 777) + 
engine_collection(sycl::queue& queue, Size count, std::int64_t seed = 777) : count_(count), - engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)), - params_(count), - technique_(daal::algorithms::engines::internal::family), - daal_engine_list_(count) {} - - template - std::vector operator()(Op&& op) { - daal::services::Status status; - for (Size i = 0; i < count_; ++i) { - op(i, params_.nSkip[i]); - } - select_parallelization_technique(technique_); - daal::algorithms::engines::internal::EnginesCollection engine_collection( - engine_, - technique_, - params_, - daal_engine_list_, - &status); - if (!status) { - dal::backend::interop::status_to_exception(status); - } - - std::vector engine_list(count_); + seed_(seed) { + engines_.reserve(count_); for (Size i = 0; i < count_; ++i) { - engine_list[i] = daal_engine_list_[i]; + engines_.push_back(oneapi_engine(queue, seed_)); } - - //copy elision - return engine_list; } -private: - void select_parallelization_technique( - daal::algorithms::engines::internal::ParallelizationTechnique& technique) { - auto daal_engine_impl = - dynamic_cast(engine_.get()); - - daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = { - daal::algorithms::engines::internal::family, - daal::algorithms::engines::internal::leapfrog, - daal::algorithms::engines::internal::skipahead - }; - - for (auto& techn : techniques) { - if (daal_engine_impl->hasSupport(techn)) { - technique = techn; - return; - } - } - - throw domain_error( - dal::detail::error_messages::rng_engine_does_not_support_parallelization_techniques()); + std::vector> get_engines() const { + return engines_; } private: Size count_; - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::Params params_; - daal::algorithms::engines::internal::ParallelizationTechnique technique_; - daal::services::internal::TArray - daal_engine_list_; + std::int64_t seed_; + std::vector> engines_; }; +#endif } // namespace 
oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp new file mode 100644 index 00000000000..6463534caad --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp @@ -0,0 +1,220 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include +namespace mkl = oneapi::mkl; +namespace oneapi::dal::backend::primitives { + +#ifdef ONEDAL_DATA_PARALLEL + +enum class engine_list { mt2203, mcg59, mt19937 }; + +template +struct oneapi_engine_type; + +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mt2203; +}; + +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mcg59; +}; + +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mt19937; +}; + +template +class oneapi_engine { +public: + using onedal_engine_t = typename oneapi_engine_type::type; + + explicit oneapi_engine(sycl::queue& queue, std::int64_t seed = 777) + : q(queue), + daal_engine_(initialize_daal_engine(seed)), + onedal_engine_(initialize_oneapi_engine(queue, seed)), + impl_(dynamic_cast( + daal_engine_.get())) { + if (!impl_) { + 
throw std::domain_error("RNG engine is not supported"); + } + } + + virtual ~oneapi_engine() = default; + + void* get_cpu_engine_state() const { + return impl_->getState(); + } + + auto& get_cpu_engine() { + return daal_engine_; + } + + auto& get_gpu_engine() { + return onedal_engine_; + } + + void skip_ahead_cpu(size_t nSkip) { + daal_engine_->skipAhead(nSkip); + } + + void skip_ahead_gpu(size_t nSkip) { + if constexpr (EngineType == engine_list::mt2203) { + } + else { + skip_ahead(onedal_engine_, nSkip); + } + } + +private: + daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { + switch (EngineType) { + case engine_list::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + onedal_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) { + if constexpr (EngineType == engine_list::mt2203) { + return onedal_engine_t(queue, seed, + 0); // Aligns CPU and GPU results for mt2203 + } + else { + return onedal_engine_t(queue, seed); + } + } + sycl::queue q; + daal::algorithms::engines::EnginePtr daal_engine_; + onedal_engine_t onedal_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +template +class oneapi_rng { +public: + oneapi_rng() = default; + ~oneapi_rng() = default; + + template + void uniform(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + bool distr_mode = false, + const event_vector& deps = {}); + + template + void uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + + template + void uniform_cpu(Size count, Type* dst, oneapi_engine& engine_, Type a, Type b); + template + void 
uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}) {} + + template + void uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buff, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + + template + void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + oneapi_engine& engine_, + Type a, + Type b) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); + } + + template >> + void shuffle(Size count, Type* dst, oneapi_engine& engine_) { + Type idx[2]; + + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } + + template + void shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + const event_vector& deps); + + template >> + void shuffle_cpu(Size count, Type* dst, oneapi_engine& engine_) { + Type idx[2]; + + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } +}; + +#endif +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp new file mode 100644 index 00000000000..8a69f109162 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -0,0 +1,300 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you 
may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/test/engine/common.hpp" +#include "oneapi/dal/test/engine/fixtures.hpp" +#include "oneapi/dal/test/engine/dataframe.hpp" + +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" +namespace oneapi::dal::backend::primitives::test { + +namespace te = dal::test::engine; + +class mt2203 {}; +class mcg59 {}; +class mt19937 {}; + +template +struct engine_map {}; + +template <> +struct engine_map { + constexpr static auto value = engine_list::mt2203; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_list::mcg59; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_list::mt19937; +}; + +template +constexpr auto engine_v = engine_map::value; + +template +class rng_test : public te::policy_fixture { +public: + using Index = std::tuple_element_t<0, TestType>; + using EngineType = std::tuple_element_t<1, TestType>; + static constexpr auto engine_qq = engine_v; + + auto get_rng() const { + oneapi_rng rn_gen; + return rn_gen; + } + + auto get_engine(std::int64_t seed) { + auto rng_engine = oneapi_engine(this->get_queue(), seed); + return rng_engine; + } + + auto allocate_arrays(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto val_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + auto val_host = ndarray::empty({ elem_count }); + + return 
std::make_tuple(val_gpu, val_host); + } + + auto allocate_arrays_shared(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto val_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::shared); + auto val_host = ndarray::empty(q, { elem_count }, sycl::usm::alloc::shared); + + return std::make_tuple(val_gpu, val_host); + } + + auto allocate_arrays_device(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto val_gpu_1 = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + auto val_gpu_2 = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + + return std::make_tuple(val_gpu_1, val_gpu_2); + } + + auto allocate_arrays_host(std::int64_t elem_count) { + auto val_host_1 = ndarray::empty({ elem_count }); + auto val_host_2 = ndarray::empty({ elem_count }); + + return std::make_tuple(val_host_1, val_host_2); + } + + void check_results_host(const ndarray& val_host_1, + const ndarray& val_host_2) { + const Index* val_host_1_ptr = val_host_1.get_data(); + + const Index* val_host_2_ptr = val_host_2.get_data(); + + for (std::int64_t el = 0; el < val_host_1.get_count(); el++) { + REQUIRE(val_host_1_ptr[el] == val_host_2_ptr[el]); + } + } + + void check_results_device(const ndarray& val_gpu_1, + const ndarray& val_gpu_2) { + const auto val_gpu_host_1 = val_gpu_1.to_host(this->get_queue()); + const Index* val_gpu_host_1_ptr = val_gpu_host_1.get_data(); + + const auto val_gpu_host_2 = val_gpu_2.to_host(this->get_queue()); + const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data(); + + for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) { + REQUIRE(val_gpu_host_2_ptr[el] == val_gpu_host_1_ptr[el]); + } + } + + void check_results(const ndarray& val_gpu, const ndarray& val_host) { + const Index* val_host_ptr = val_host.get_data(); + + const auto val_gpu_host = val_gpu.to_host(this->get_queue()); + const Index* val_gpu_host_ptr = val_gpu_host.get_data(); + + for (std::int64_t el = 0; el < val_host.get_count(); el++) { + 
REQUIRE(val_gpu_host_ptr[el] == val_host_ptr[el]); + } + } +}; + +using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59)); + +TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { + SKIP_IF(this->get_policy().is_cpu()); + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); + std::int64_t seed = GENERATE_COPY(777, 999); + + auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rn_gen = this->get_rng(); + auto rng_engine = this->get_engine(seed); + auto rng_engine_ = this->get_engine(seed); + + rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); + rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + + this->check_results(arr_gpu, arr_host); +} + +using rng_types_skip = COMBINE_TYPES((float), (mcg59)); + +// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = +// GENERATE_COPY(10, 1000, 300000, 15000, 1000000, 100000000, 6100000000, 1LL * 64 * 1000000); +// std::int64_t seed = GENERATE_COPY(777); + +// auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); +// auto arr_gpu_ptr = arr_gpu.get_mutable_data(); +// auto arr_host_ptr = arr_host.get_mutable_data(); + +// auto rn_gen = this->get_rng(); +// auto rng_engine = this->get_engine(seed); +// auto rng_engine_ = this->get_engine(seed); + +// BENCHMARK("Uniform dispatcher HOST arr" + std::to_string(elem_count)) { +// rn_gen.uniform(this->get_queue(), elem_count, arr_host_ptr, rng_engine, 0, elem_count); +// }; +// BENCHMARK("Uniform dispatcher GPU arr" + std::to_string(elem_count)) { +// rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); +// }; + +// auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count); +// auto arr_gpu_ptr_ = 
arr_gpu_.get_mutable_data(); +// auto arr_host_ptr_ = arr_host_.get_mutable_data(); + +// auto rn_gen_ = this->get_rng(); +// auto rng_engine_1 = this->get_engine(seed); +// auto rng_engine_2 = this->get_engine(seed); +// BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { +// rn_gen_.uniform_gpu(this->get_queue(), +// elem_count, +// arr_gpu_ptr_, +// rng_engine_1, +// 0, +// elem_count); +// }; + +// BENCHMARK("Uniform HOST arr" + std::to_string(elem_count)) { +// rn_gen_.uniform(elem_count, arr_host_ptr_, rng_engine_2, 0, elem_count); +// }; +// } + +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) { + SKIP_IF(this->get_policy().is_cpu()); + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); + std::int64_t seed = GENERATE_COPY(777, 999); + + auto [arr_host_init_1, arr_host_init_2] = this->allocate_arrays_host(elem_count); + auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); + auto arr_host_init_1_ptr = arr_host_init_1.get_mutable_data(); + auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data(); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rn_gen = this->get_rng(); + auto rng_engine = this->get_engine(seed); + auto rng_engine_2 = this->get_engine(seed); + + rn_gen.uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); + rn_gen.uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + + rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + + this->check_results_host(arr_host_init_1, arr_host_init_2); + this->check_results(arr_gpu, arr_host); +} + +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) { + SKIP_IF(this->get_policy().is_cpu()); + std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); + std::int64_t seed = GENERATE_COPY(1, 777, 999); + + 
auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_device(elem_count); + auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); + auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); + auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rn_gen = this->get_rng(); + auto rng_engine = this->get_engine(seed); + auto rng_engine_2 = this->get_engine(seed); + + rn_gen.uniform_gpu(this->get_queue(), + elem_count, + arr_device_init_1_ptr, + rng_engine, + 0, + elem_count); + rn_gen.uniform_gpu(this->get_queue(), + elem_count, + arr_device_init_2_ptr, + rng_engine_2, + 0, + elem_count); + + rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + + this->check_results_device(arr_device_init_1, arr_device_init_2); + this->check_results(arr_gpu, arr_host); +} + +// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); +// std::int64_t seed = GENERATE_COPY(1, 777, 999); + +// engine_collection collection(this->get_queue(), 2, seed); + +// auto engine_arr = collection.get_engines(); + +// auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count); + +// auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); +// auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + +// auto rn_gen = this->get_rng(); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_1_ptr, +// engine_arr[0], +// 0, +// elem_count); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_2_ptr, +// engine_arr[1], +// 0, +// elem_count); + +// // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, 
engine_arr[0], 0, elem_count); +// // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count); + +// //this->check_results_device(arr_device_init_1, arr_device_init_2); +// this->check_results(arr_device_init_1, arr_device_init_2); +// } + +} // namespace oneapi::dal::backend::primitives::test From 81d7dfe7100a714152fe3203d5c193796ed1a68f Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 5 Nov 2024 07:12:50 -0800 Subject: [PATCH 02/18] minor fixes --- .../backend/gpu/train_kernel_hist_impl.hpp | 2 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 23 +++--- .../dal/backend/primitives/rng/rng_cpu.hpp | 17 +++++ .../primitives/rng/rng_engine_collection.hpp | 72 ++++++++++++++++++- 4 files changed, 100 insertions(+), 14 deletions(-) diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 6d1c4362309..84e1d8f620f 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -50,7 +50,7 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::engine; + using rng_engine_t = pr::daal_engine; using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index 9fac38d25b0..10197bf0c43 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -396,12 +396,12 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or Index* const node_list_ptr = node_list_host.get_mutable_data(); 
for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - pr::rng rn_gen; + pr::daal_rng rn_gen; Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; rn_gen.uniform(ctx.selected_row_total_count_, gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx].get_state(), + rng_engine_list[engine_offset + node_idx].get_cpu_engine_state(), 0, ctx.row_total_count_); @@ -483,15 +483,15 @@ train_kernel_hist_impl::gen_feature_list( auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_); - pr::rng rn_gen; + pr::daal_rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform_without_replacement( + rn_gen.uniform_without_replacement_cpu( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), + rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(), 0, ctx.column_count_); } @@ -524,7 +524,7 @@ train_kernel_hist_impl::gen_random_thresholds( auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_); - pr::rng rn_gen; + pr::daal_rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); // Create arrays for random generated bins @@ -539,7 +539,7 @@ train_kernel_hist_impl::gen_random_thresholds( for (Index node = 0; node < node_count; ++node) { rn_gen.uniform(ctx.selected_ftr_count_, random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), + rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(), 0.0f, 1.0f); } @@ -1660,12 +1660,13 @@ sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); - pr::rng rn_gen; + pr::daal_rng rn_gen; for (Index column_idx = 0; 
column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block].get_state()); + rn_gen.shuffle( + oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state()); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index a692070551e..7ea7ae9266d 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -40,6 +40,23 @@ class daal_engine { } } + explicit daal_engine(const daal::algorithms::engines::EnginePtr& eng) : daal_engine_(eng) { + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + } + + daal_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { + daal_engine_ = eng; + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + + return *this; + } + virtual ~daal_engine() = default; void* get_cpu_engine_state() const { diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 81ce6bf852b..1d058be6025 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -30,10 +30,78 @@ namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -template +template class engine_collection { public: - engine_collection(sycl::queue& queue, Size count, std::int64_t seed = 777) + explicit engine_collection(Size count, std::int64_t seed = 777) + : count_(count), + engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)), + params_(count), + 
technique_(daal::algorithms::engines::internal::family), + daal_engine_list_(count) {} + + template + std::vector> operator()(Op&& op) { + daal::services::Status status; + for (Size i = 0; i < count_; ++i) { + op(i, params_.nSkip[i]); + } + select_parallelization_technique(technique_); + daal::algorithms::engines::internal::EnginesCollection engine_collection( + engine_, + technique_, + params_, + daal_engine_list_, + &status); + if (!status) { + dal::backend::interop::status_to_exception(status); + } + + std::vector> engine_list(count_); + for (Size i = 0; i < count_; ++i) { + engine_list[i] = daal_engine_list_[i]; + } + + //copy elision + return engine_list; + } + +private: + void select_parallelization_technique( + daal::algorithms::engines::internal::ParallelizationTechnique& technique) { + auto daal_engine_impl = + dynamic_cast(engine_.get()); + + daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = { + daal::algorithms::engines::internal::family, + daal::algorithms::engines::internal::leapfrog, + daal::algorithms::engines::internal::skipahead + }; + + for (auto& techn : techniques) { + if (daal_engine_impl->hasSupport(techn)) { + technique = techn; + return; + } + } + + throw domain_error( + dal::detail::error_messages::rng_engine_does_not_support_parallelization_techniques()); + } + +private: + Size count_; + daal::algorithms::engines::EnginePtr engine_; + daal::algorithms::engines::internal::Params params_; + daal::algorithms::engines::internal::ParallelizationTechnique technique_; + daal::services::internal::TArray + daal_engine_list_; +}; + +template +class engine_collection_oneapi { +public: + engine_collection_oneapi(sycl::queue& queue, Size count, std::int64_t seed = 777) : count_(count), seed_(seed) { engines_.reserve(count_); From acb6e4cb237c7e0b4d6d9034a7267640679bc170 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Wed, 13 Nov 2024 04:40:47 -0800 Subject: [PATCH 03/18] adding mrg32k3a engine --- 
.../algorithms/engines/mrg32k3a/mrg32k3a.h | 182 ++++++++++++++++++ .../engines/mrg32k3a/mrg32k3a_types.h | 64 ++++++ cpp/daal/include/daal.h | 2 + cpp/daal/include/daal_win.h | 2 + .../algorithms/engines/mrg32k3a/mrg32k3a.cpp | 58 ++++++ .../mrg32k3a/mrg32k3a_batch_container.h | 68 +++++++ .../engines/mrg32k3a/mrg32k3a_batch_impl.h | 116 +++++++++++ .../mrg32k3a_dense_default_batch_fpt_cpu.cpp | 47 +++++ ...k3a_dense_default_batch_fpt_dispatcher.cpp | 30 +++ .../engines/mrg32k3a/mrg32k3a_impl.i | 49 +++++ .../engines/mrg32k3a/mrg32k3a_kernel.h | 58 ++++++ .../algorithms/engines/mt2203/mt2203_kernel.h | 4 +- cpp/daal/src/externals/service_rng_mkl.h | 1 + cpp/daal/src/externals/service_rng_openrng.h | 1 + cpp/daal/src/externals/service_rng_ref.h | 3 +- .../dal/backend/primitives/rng/rng_cpu.hpp | 5 +- .../dal/backend/primitives/rng/rng_dpc.cpp | 10 + .../primitives/rng/rng_engine_collection.hpp | 1 + .../dal/backend/primitives/rng/rng_gpu.hpp | 9 +- .../backend/primitives/rng/test/rng_dpc.cpp | 10 +- docs/source/daal/algorithms/engines/index.rst | 1 + .../daal/algorithms/engines/mrg32k3a.rst | 63 ++++++ makefile.lst | 4 +- 23 files changed, 780 insertions(+), 8 deletions(-) create mode 100644 cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h create mode 100644 cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i create mode 100644 cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h create mode 100644 
docs/source/daal/algorithms/engines/mrg32k3a.rst diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h new file mode 100644 index 00000000000..df6c1edf414 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -0,0 +1,182 @@ +/* file: mrg32k3a.h */ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the mrg32k3a engine in the batch processing mode +//-- +*/ + +#ifndef __MRG32K3A_H__ +#define __MRG32K3A_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/engine.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +/** + * @defgroup engines_mrg32k3a_batch Batch + * @ingroup engines_mrg32k3a + * @{ + */ +namespace interface1 +{ +/** + * + * \brief Provides methods to run implementations of the mrg32k3a engine.
+ * This class is associated with the \ref mrg32k3a::interface1::Batch "mrg32k3a::Batch" class + * and supports the method of mrg32k3a engine computation in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of mrg32k3a engine, double or float + * \tparam method Computation method of the engine, mrg32k3a::Method + * \tparam cpu Version of the cpu-specific implementation of the engine, daal::CpuType + */ +template +class BatchContainer : public daal::algorithms::AnalysisContainerIface +{ +public: + /** + * Constructs a container for the mrg32k3a engine with a specified environment + * in the batch processing mode + * \param[in] daalEnv Environment object + */ + BatchContainer(daal::services::Environment::env * daalEnv); + ~BatchContainer(); + /** + * Computes the result of the mrg32k3a engine in the batch processing mode + * + * \return Status of computations + */ + services::Status compute() DAAL_C11_OVERRIDE; +}; + +/** + * + * \brief Provides methods for mrg32k3a engine computations in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of mrg32k3a engine, double or float + * \tparam method Computation method of the engine, mrg32k3a::Method + * + * \par Enumerations + * - mrg32k3a::Method Computation methods for the mrg32k3a engine + * + * \par References + * - \ref engines::interface1::Input "engines::Input" class + * - \ref engines::interface1::Result "engines::Result" class + */ +template +class DAAL_EXPORT Batch : public engines::BatchBase +{ +public: + typedef engines::BatchBase super; + + typedef typename super::InputType InputType; + typedef typename super::ResultType ResultType; + + /** + * Creates mrg32k3a engine + * \param[in] seed Initial condition for mrg32k3a engine + * + * \return Pointer to mrg32k3a engine + */ + static services::SharedPtr > create(size_t seed = 777); + + /** + * Returns method of the engine + * \return Method of the engine + */ 
+ virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; } + + /** + * Returns the structure that contains results of mrg32k3a engine + * \return Structure that contains results of mrg32k3a engine + */ + ResultPtr getResult() { return _result; } + + /** + * Registers user-allocated memory to store results of mrg32k3a engine + * \param[in] result Structure to store results of mrg32k3a engine + * + * \return Status of computations + */ + services::Status setResult(const ResultPtr & result) + { + DAAL_CHECK(result, services::ErrorNullResult) + _result = result; + _res = _result.get(); + return services::Status(); + } + + /** + * Returns a pointer to the newly allocated mrg32k3a engine + * with a copy of input objects and parameters of this mrg32k3a engine + * \return Pointer to the newly allocated engine + */ + services::SharedPtr > clone() const { return services::SharedPtr >(cloneImpl()); } + + /** + * Allocates memory to store the result of the mrg32k3a engine + * + * \return Status of computations + */ + virtual services::Status allocateResult() DAAL_C11_OVERRIDE + { + services::Status s = this->_result->template allocate(&(this->input), NULL, (int)method); + this->_res = this->_result.get(); + return s; + } + +protected: + Batch(size_t seed = 777) { initialize(); } + + Batch(const Batch & other) : super(other) { initialize(); } + + virtual Batch * cloneImpl() const DAAL_C11_OVERRIDE { return new Batch(*this); } + + void initialize() + { + Analysis::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env); + _in = &input; + _result.reset(new ResultType()); + } + +private: + ResultPtr _result; + + Batch & operator=(const Batch &); +}; +typedef services::SharedPtr > mrg32k3aPtr; +typedef services::SharedPtr > mrg32k3aConstPtr; + +} // namespace interface1 +using interface1::BatchContainer; +using interface1::Batch; +using interface1::mrg32k3aPtr; +using interface1::mrg32k3aConstPtr; +/** @} */ +} // namespace 
mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal +#endif diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h new file mode 100644 index 00000000000..77ca9656418 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h @@ -0,0 +1,64 @@ +/* file: mrg32k3a_types.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a engine. +//-- +*/ + +#ifndef __MRG32K3A_TYPES_H__ +#define __MRG32K3A_TYPES_H__ + +#include "algorithms/algorithm.h" +#include "services/daal_defines.h" +#include "data_management/data/numeric_table.h" +#include "data_management/data/homogen_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +/** + * @defgroup engines_mrg32k3a mrg32k3a Engine + * \copydoc daal::algorithms::engines::mrg32k3a + * @ingroup engines + * @{ + */ +/** + * \brief Contains classes for mrg32k3a engine + */ +namespace mrg32k3a +{ +/** + * + * Available methods to compute mrg32k3a engine + */ +enum Method +{ + defaultDense = 0 /*!< Default: performance-oriented method. 
*/ +}; + +} // namespace mrg32k3a +/** @} */ +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h index 881a6c39fbe..443f237e051 100755 --- a/cpp/daal/include/daal.h +++ b/cpp/daal/include/daal.h @@ -301,6 +301,8 @@ #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" #include "algorithms/engines/engine_family.h" #include "algorithms/engines/mt2203/mt2203.h" #include "algorithms/engines/mt2203/mt2203_types.h" diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h index e17eff16796..3e64c38660f 100755 --- a/cpp/daal/include/daal_win.h +++ b/cpp/daal/include/daal_win.h @@ -313,6 +313,8 @@ #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" #include "algorithms/engines/engine_family.h" #include "algorithms/engines/mt2203/mt2203.h" #include "algorithms/engines/mt2203/mt2203_types.h" diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp new file mode 100644 index 00000000000..288cb0506ee --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp @@ -0,0 +1,58 @@ +/* file: mrg32k3a.cpp */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a engine +//-- + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/externals/service_dispatch.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +using namespace daal::services; +using namespace mrg32k3a::internal; + +template +SharedPtr > Batch::create(size_t seed) +{ + SharedPtr > engPtr; +#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) engPtr.reset(new BatchImpl(__VA_ARGS__)); + + DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed); + +#undef DAAL_CREATE_ENGINE_CPU + return engPtr; +} + +template class Batch; +template class Batch; + +} // namespace interface1 +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h new file mode 100644 index 00000000000..1fb8f9ca991 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h @@ -0,0 +1,68 @@ +/* file: mrg32k3a_batch_container.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a calculation algorithm container. +//-- +*/ + +#ifndef __mrg32k3a_BATCH_CONTAINER_H__ +#define __mrg32k3a_BATCH_CONTAINER_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +template +BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) +{ + __DAAL_INITIALIZE_KERNELS(internal::mrg32k3aKernel, algorithmFPType, method); +} + +template +BatchContainer::~BatchContainer() +{ + __DAAL_DEINITIALIZE_KERNELS(); +} + +template +services::Status BatchContainer::compute() +{ + daal::services::Environment::env & env = *_env; + engines::Result * result = static_cast(_res); + NumericTable * resultTable = result->get(engines::randomNumbers).get(); + + __DAAL_CALL_KERNEL(env, internal::mrg32k3aKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable); +} + +} // namespace interface1 +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h new file mode 100644 index 00000000000..07dc07b9b3a --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -0,0 +1,116 @@ +/* file: 
mrg32k3a_batch_impl.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the class defining the mrg32k3a engine +//-- +*/ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/engines/engine_batch_impl.h" +#include "src/externals/service_rng.h" +#include "src/data_management/service_numeric_table.h" + +static const int leapfrogMethodErrcode = -1002; +static const int skipAheadMethodErrcode = -1003; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +template +class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch, public algorithms::engines::internal::BatchBaseImpl +{ +public: + typedef algorithms::engines::mrg32k3a::interface1::Batch super1; + typedef algorithms::engines::internal::BatchBaseImpl super2; + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_mrg32k3a), super2(seed) {} + + void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } + + int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); } + + services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + 
services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE + { + int errcode = baseRng.leapfrog(threadNum, nThreads); + services::Status s; + if (errcode == leapfrogMethodErrcode) + s.add(ErrorLeapfrogUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE + { + int errcode = baseRng.skipAhead(nSkip); + services::Status s; + if (errcode == skipAheadMethodErrcode) + s.add(ErrorSkipAheadUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + virtual BatchImpl * cloneImpl() const DAAL_C11_OVERRIDE + { + return new BatchImpl(*this); + } + + bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE + { + switch (technique) + { + case engines::internal::family: return false; + case engines::internal::skipahead: return true; + case engines::internal::leapfrog: return true; + } + return false; + } + + ~BatchImpl() {} + +protected: + BatchImpl(const BatchImpl & other) : super1(other), super2(other), baseRng(other.baseRng) {} + + daal::internal::BaseRNGsInst baseRng; +}; + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp new file mode 100644 index 00000000000..2af52dd0443 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp @@ -0,0 +1,47 @@ +/* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */ +/******************************************************************************* 
+* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a calculation functions. +//-- + +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +template class BatchContainer; +} // namespace interface1 + +namespace internal +{ +template class mrg32k3aKernel; +} // namespace internal + +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp new file mode 100644 index 00000000000..482486e243f --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp @@ -0,0 +1,30 @@ +/* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a calculation algorithm dispatcher. +//-- + +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h" + +namespace daal +{ +namespace algorithms +{ +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::mrg32k3a::BatchContainer, batch, DAAL_FPTYPE, engines::mrg32k3a::defaultDense) +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i new file mode 100644 index 00000000000..5e359ecaaa3 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i @@ -0,0 +1,49 @@ +/* file: mrg32k3a_impl.i */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a algorithm +//-- +*/ + +#ifndef __mrg32k3a_IMPL_I__ +#define __mrg32k3a_IMPL_I__ + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +template +Status mrg32k3aKernel::compute(NumericTable * resultTensor) +{ + return Status(); +} + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h new file mode 100644 index 00000000000..3959576ccbe --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h @@ -0,0 +1,58 @@ +/* file: mrg32k3a_kernel.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Declaration of template function that calculate mrg32k3as. 
+//-- + +#ifndef __mrg32k3a_KERNEL_H__ +#define __mrg32k3a_KERNEL_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/kernel.h" +#include "data_management/data/numeric_table.h" + +using namespace daal::services; +using namespace daal::data_management; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +/** + * \brief Kernel for mrg32k3a calculation + */ +template +class mrg32k3aKernel : public Kernel +{ +public: + Status compute(NumericTable * resultTable); +}; + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h index b7de119367f..e588a02c8fb 100644 --- a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h +++ b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h @@ -19,8 +19,8 @@ // Declaration of template function that calculate mt2203s. 
//-- -#ifndef __MCG59_KERNEL_H__ -#define __MCG59_KERNEL_H__ +#ifndef __MT2203_KERNEL_H__ +#define __MT2203_KERNEL_H__ #include "algorithms/engines/mt2203/mt2203.h" #include "src/algorithms/kernel.h" diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h index b2dcd81b78b..a911b2e5d8d 100644 --- a/cpp/daal/src/externals/service_rng_mkl.h +++ b/cpp/daal/src/externals/service_rng_mkl.h @@ -32,6 +32,7 @@ #define __DAAL_BRNG_MT2203 VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 +#define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h index dd70c644606..96c567b7366 100644 --- a/cpp/daal/src/externals/service_rng_openrng.h +++ b/cpp/daal/src/externals/service_rng_openrng.h @@ -25,6 +25,7 @@ #define __DAAL_BRNG_MT2203 VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 +#define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h index fc56fcf6205..7eafa70fb43 100644 --- a/cpp/daal/src/externals/service_rng_ref.h +++ b/cpp/daal/src/externals/service_rng_ref.h @@ -39,7 +39,8 @@ #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 - + //tmp + #define __DAAL_BRNG_MRG32K3A (1 << 20) * 4 //VSL_BRNG_MRG32K3A #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD 
#define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index 7ea7ae9266d..b9488da808b 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include #include @@ -26,7 +27,7 @@ #include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::backend::primitives { -enum class engine_list_cpu { mt2203, mcg59, mt19937 }; +enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a }; template class daal_engine { @@ -74,6 +75,8 @@ class daal_engine { return daal::algorithms::engines::mt2203::Batch<>::create(seed); case engine_list_cpu::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list_cpu::mrg32k3a: + return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); case engine_list_cpu::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 79b5418d9d8..82ff48edab9 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -95,12 +95,15 @@ void oneapi_rng::shuffle_gpu(sycl::queue& queue, #define INSTANTIATE_FLOAT_(Size) \ INSTANTIATE_(float, Size, engine_list::mt2203) \ INSTANTIATE_(float, Size, engine_list::mcg59) \ + INSTANTIATE_(float, Size, engine_list::mrg32k3a) \ INSTANTIATE_(float, Size, engine_list::mt19937) \ INSTANTIATE_(double, Size, engine_list::mt2203) \ INSTANTIATE_(double, Size, engine_list::mcg59) \ + INSTANTIATE_(double, Size, engine_list::mrg32k3a) \ INSTANTIATE_(double, Size, 
engine_list::mt19937) \ INSTANTIATE_(int, Size, engine_list::mt2203) \ INSTANTIATE_(int, Size, engine_list::mcg59) \ + INSTANTIATE_(int, Size, engine_list::mrg32k3a) \ INSTANTIATE_(int, Size, engine_list::mt19937) INSTANTIATE_FLOAT_(std::int64_t); @@ -117,12 +120,15 @@ INSTANTIATE_FLOAT_(std::int32_t); #define INSTANTIATE_FLOAT_CPU(Size) \ INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a) \ INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a) \ INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ INSTANTIATE_CPU(int, Size, engine_list::mt2203) \ INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a) \ INSTANTIATE_CPU(int, Size, engine_list::mt19937) INSTANTIATE_FLOAT_CPU(std::int64_t); @@ -142,12 +148,15 @@ INSTANTIATE_FLOAT_CPU(std::int32_t); #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, 
engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); @@ -164,6 +173,7 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); #define INSTANTIATE_SHUFFLE_FLOAT(Size) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 1d058be6025..dd7bffd68d7 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include "oneapi/dal/table/common.hpp" diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp index 6463534caad..9cf27a2f4ee 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include @@ -26,7 +27,7 @@ namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -enum class engine_list { mt2203, mcg59, mt19937 }; +enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a}; template struct oneapi_engine_type; @@ -46,6 +47,11 @@ struct oneapi_engine_type { using type = oneapi::mkl::rng::mt19937; }; +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mrg32k3a; +}; + template class oneapi_engine { public: @@ -94,6 +100,7 @@ class oneapi_engine { case engine_list::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); 
case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); case engine_list::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 8a69f109162..6219b8c32fe 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -26,6 +26,7 @@ namespace te = dal::test::engine; class mt2203 {}; class mcg59 {}; +class mrg32k3a {}; class mt19937 {}; template @@ -41,6 +42,11 @@ struct engine_map { constexpr static auto value = engine_list::mcg59; }; +template <> +struct engine_map { + constexpr static auto value = engine_list::mrg32k3a; +}; + template <> struct engine_map { constexpr static auto value = engine_list::mt19937; @@ -133,7 +139,7 @@ class rng_test : public te::policy_fixture { } }; -using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59)); +using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a)); TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { SKIP_IF(this->get_policy().is_cpu()); @@ -154,7 +160,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { this->check_results(arr_gpu, arr_host); } -using rng_types_skip = COMBINE_TYPES((float), (mcg59)); +using rng_types_skip = COMBINE_TYPES((float), (mcg59, mrg32k3a)); // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { // SKIP_IF(this->get_policy().is_cpu()); diff --git a/docs/source/daal/algorithms/engines/index.rst b/docs/source/daal/algorithms/engines/index.rst index e73aef6d991..1c476178dc9 100644 --- a/docs/source/daal/algorithms/engines/index.rst +++ 
b/docs/source/daal/algorithms/engines/index.rst @@ -113,4 +113,5 @@ These methods are represented with member functions of classes that represent fu mt19937.rst mcg59.rst + mrg32k3a.rst mt2203.rst diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst new file mode 100644 index 00000000000..ce8ca0ec0cc --- /dev/null +++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst @@ -0,0 +1,63 @@ +.. ****************************************************************************** +.. * Copyright 2020 Intel Corporation +.. * +.. * Licensed under the Apache License, Version 2.0 (the "License"); +.. * you may not use this file except in compliance with the License. +.. * You may obtain a copy of the License at +.. * +.. * http://www.apache.org/licenses/LICENSE-2.0 +.. * +.. * Unless required by applicable law or agreed to in writing, software +.. * distributed under the License is distributed on an "AS IS" BASIS, +.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. * See the License for the specific language governing permissions and +.. * limitations under the License. +.. *******************************************************************************/ + +mrg32k3a +======== + +The engine is based on the combined multiple recursive generator MRG32k3a with two components of order 3. + +.. rubric:: Subsequence selection methods support + +skipAhead (nskip) + Supported +leapfrog (threadIdx, nThreads) + Supported + +Batch Processing +**************** + +mrg32k3a engine needs the initial condition (``seed``) for state initialization. +The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors. + +.. rubric:: Algorithm Parameters + +mrg32k3a engine has the following parameters: + +.. tabularcolumns:: |\Y{0.2}|\Y{0.2}|\Y{0.6}| + +.. 
list-table:: Algorithm Parameters for mrg32k3a engine (Batch Processing) + :header-rows: 1 + :widths: 10 20 30 + :align: left + :class: longtable + + * - Parameter + - Default Value + - Description + * - ``algorithmFPType`` + - ``float`` + - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``. + * - ``method`` + - ``defaultDense`` + - Performance-oriented computation method; the only method supported by the algorithm. + * - ``seed`` + - + - :math:`777` for a scalar seed + - NA for a vector seed + - Initial condition for state initialization, scalar or vector: + + - Scalar, value of ``size_t`` type + - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p` diff --git a/makefile.lst b/makefile.lst index 92dc52ff521..db26829caef 100755 --- a/makefile.lst +++ b/makefile.lst @@ -65,7 +65,7 @@ multiclassclassifier += classifier k_nearest_neighbors += engines classifier logistic_regression += classifier optimization_solver objective_function engines implicit_als += engines distributions -engines += engines/mt19937 engines/mcg59 engines/mt2203 +engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/mt2203 distributions += distributions/bernoulli distributions/normal distributions/uniform tsne += @@ -95,6 +95,7 @@ CORE.ALGORITHMS.FULL := \ elastic_net \ engines \ engines/mcg59 \ + engines/mrg32k3a \ engines/mt19937 \ engines/mt2203 \ em \ @@ -309,6 +310,7 @@ JJ.ALGORITHMS := adaboost elastic_net/prediction \ engines \ engines/mcg59 \ + engines/mrg32k3a \ engines/mt19937 \ engines/mt2203 \ em_gmm \ From 58d98b093e61cf4dcff4a4f205603234a485870d Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Wed, 13 Nov 2024 07:22:26 -0800 Subject: [PATCH 04/18] fix fro mrg32k --- .../engines/mrg32k3a/mrg32k3a_batch_impl.h | 2 +- .../dal/backend/primitives/rng/rng_dpc.cpp | 14 ++++----- .../backend/primitives/rng/test/rng_dpc.cpp | 30 ++++--------------- 3 files changed, 14 insertions(+), 32 deletions(-) 
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h index 07dc07b9b3a..bbe3cf2dcf9 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -45,7 +45,7 @@ class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch super1; typedef algorithms::engines::internal::BatchBaseImpl super2; - BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_mrg32k3a), super2(seed) {} + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_MRG32K3A), super2(seed) {} void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 82ff48edab9..029fec3896c 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -148,15 +148,15 @@ INSTANTIATE_FLOAT_CPU(std::int32_t); #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, 
engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); @@ -170,10 +170,10 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); oneapi_engine& engine_, \ const event_vector& deps); -#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 6219b8c32fe..dd16eb3d3dc 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -110,7 +110,7 @@ class rng_test : public te::policy_fixture { const Index* val_host_2_ptr = val_host_2.get_data(); for (std::int64_t el = 0; el < val_host_1.get_count(); el++) { - REQUIRE(val_host_1_ptr[el] == val_host_2_ptr[el]); + REQUIRE(abs(val_host_1_ptr[el] - val_host_2_ptr[el]) < 1); } } @@ -123,7 +123,7 @@ class rng_test : public te::policy_fixture { const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data(); for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) { - REQUIRE(val_gpu_host_2_ptr[el] == val_gpu_host_1_ptr[el]); + REQUIRE(abs(val_gpu_host_2_ptr[el] - val_gpu_host_1_ptr[el]) < 1); } } @@ -134,7 +134,7 @@ class 
rng_test : public te::policy_fixture { const Index* val_gpu_host_ptr = val_gpu_host.get_data(); for (std::int64_t el = 0; el < val_host.get_count(); el++) { - REQUIRE(val_gpu_host_ptr[el] == val_host_ptr[el]); + REQUIRE(abs(val_gpu_host_ptr[el] - val_host_ptr[el]) < 1); } } }; @@ -160,36 +160,21 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { this->check_results(arr_gpu, arr_host); } -using rng_types_skip = COMBINE_TYPES((float), (mcg59, mrg32k3a)); +using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a)); // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { // SKIP_IF(this->get_policy().is_cpu()); // std::int64_t elem_count = -// GENERATE_COPY(10, 1000, 300000, 15000, 1000000, 100000000, 6100000000, 1LL * 64 * 1000000); +// GENERATE_COPY(6100000000, 1LL * 64 * 1000000); // std::int64_t seed = GENERATE_COPY(777); -// auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); -// auto arr_gpu_ptr = arr_gpu.get_mutable_data(); -// auto arr_host_ptr = arr_host.get_mutable_data(); - -// auto rn_gen = this->get_rng(); -// auto rng_engine = this->get_engine(seed); -// auto rng_engine_ = this->get_engine(seed); - -// BENCHMARK("Uniform dispatcher HOST arr" + std::to_string(elem_count)) { -// rn_gen.uniform(this->get_queue(), elem_count, arr_host_ptr, rng_engine, 0, elem_count); -// }; -// BENCHMARK("Uniform dispatcher GPU arr" + std::to_string(elem_count)) { -// rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); -// }; // auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count); // auto arr_gpu_ptr_ = arr_gpu_.get_mutable_data(); -// auto arr_host_ptr_ = arr_host_.get_mutable_data(); // auto rn_gen_ = this->get_rng(); // auto rng_engine_1 = this->get_engine(seed); -// auto rng_engine_2 = this->get_engine(seed); + // BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { // rn_gen_.uniform_gpu(this->get_queue(), // elem_count, @@ -199,9 +184,6 @@ using 
rng_types_skip = COMBINE_TYPES((float), (mcg59, mrg32k3a)); // elem_count); // }; -// BENCHMARK("Uniform HOST arr" + std::to_string(elem_count)) { -// rn_gen_.uniform(elem_count, arr_host_ptr_, rng_engine_2, 0, elem_count); -// }; // } TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) { From 67ed2f6cb77434291e7537cfcccf128405850578 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Thu, 14 Nov 2024 06:10:52 -0800 Subject: [PATCH 05/18] add philox --- .../engines/philox4x32x10/philox4x32x10.h | 182 ++++++++++++++++++ .../philox4x32x10/philox4x32x10_types.h | 64 ++++++ cpp/daal/include/daal.h | 2 + cpp/daal/include/daal_win.h | 2 + .../engines/philox4x32x10/philox4x32x10.cpp | 58 ++++++ .../philox4x32x10_batch_container.h | 68 +++++++ .../philox4x32x10/philox4x32x10_batch_impl.h | 116 +++++++++++ ...lox4x32x10_dense_default_batch_fpt_cpu.cpp | 47 +++++ ...x10_dense_default_batch_fpt_dispatcher.cpp | 30 +++ .../philox4x32x10/philox4x32x10_impl.i | 49 +++++ .../philox4x32x10/philox4x32x10_kernel.h | 58 ++++++ cpp/daal/src/externals/service_rng_mkl.h | 1 + cpp/daal/src/externals/service_rng_openrng.h | 1 + cpp/daal/src/externals/service_rng_ref.h | 1 + .../dal/backend/primitives/rng/rng_cpu.hpp | 5 +- .../dal/backend/primitives/rng/rng_dpc.cpp | 12 +- .../primitives/rng/rng_engine_collection.hpp | 1 + .../dal/backend/primitives/rng/rng_gpu.hpp | 9 +- .../backend/primitives/rng/test/rng_dpc.cpp | 10 +- docs/source/daal/algorithms/engines/index.rst | 1 + makefile.lst | 4 +- 21 files changed, 715 insertions(+), 6 deletions(-) create mode 100644 cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h create mode 100644 cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h create mode 100644 
cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i create mode 100644 cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h new file mode 100644 index 00000000000..09eae5a7cd8 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h @@ -0,0 +1,182 @@ +/* file: philox4x32x10.h */ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ +/* +//++ +// Implementation of the philox4x32x10 engine in the batch processing mode +//-- +*/ + +#ifndef __PHILOX4X32X10_H__ +#define __PHILOX4X32X10_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" +#include "algorithms/engines/engine.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +/** + * @defgroup engines_philox4x32x10_batch Batch + * @ingroup engines_philox4x32x10 + * @{ + */ +namespace interface1 +{ +/** + * + * \brief Provides methods to run implementations of the philox4x32x10 engine. + * This class is associated with the \ref philox4x32x10::interface1::Batch "philox4x32x10::Batch" class + * and supports the method of philox4x32x10 engine computation in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of philox4x32x10 engine, double or float + * \tparam method Computation method of the engine, philox4x32x10::Method + * \tparam cpu Version of the cpu-specific implementation of the engine, daal::CpuType + */ +template +class BatchContainer : public daal::algorithms::AnalysisContainerIface +{ +public: + /** + * Constructs a container for the philox4x32x10 engine with a specified environment + * in the batch processing mode + * \param[in] daalEnv Environment object + */ + BatchContainer(daal::services::Environment::env * daalEnv); + ~BatchContainer(); + /** + * Computes the result of the philox4x32x10 engine in the batch processing mode + * + * \return Status of computations + */ + services::Status compute() DAAL_C11_OVERRIDE; +}; + +/** + * + * \brief Provides methods for philox4x32x10 engine computations in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of philox4x32x10 engine, double or float + * \tparam method Computation method of the engine, philox4x32x10::Method + * + * \par 
Enumerations + * - philox4x32x10::Method Computation methods for the philox4x32x10 engine + * + * \par References + * - \ref engines::interface1::Input "engines::Input" class + * - \ref engines::interface1::Result "engines::Result" class + */ +template +class DAAL_EXPORT Batch : public engines::BatchBase +{ +public: + typedef engines::BatchBase super; + + typedef typename super::InputType InputType; + typedef typename super::ResultType ResultType; + + /** + * Creates philox4x32x10 engine + * \param[in] seed Initial condition for philox4x32x10 engine + * + * \return Pointer to philox4x32x10 engine + */ + static services::SharedPtr > create(size_t seed = 777); + + /** + * Returns method of the engine + * \return Method of the engine + */ + virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; } + + /** + * Returns the structure that contains results of philox4x32x10 engine + * \return Structure that contains results of philox4x32x10 engine + */ + ResultPtr getResult() { return _result; } + + /** + * Registers user-allocated memory to store results of philox4x32x10 engine + * \param[in] result Structure to store results of philox4x32x10 engine + * + * \return Status of computations + */ + services::Status setResult(const ResultPtr & result) + { + DAAL_CHECK(result, services::ErrorNullResult) + _result = result; + _res = _result.get(); + return services::Status(); + } + + /** + * Returns a pointer to the newly allocated philox4x32x10 engine + * with a copy of input objects and parameters of this philox4x32x10 engine + * \return Pointer to the newly allocated engine + */ + services::SharedPtr > clone() const { return services::SharedPtr >(cloneImpl()); } + + /** + * Allocates memory to store the result of the philox4x32x10 engine + * + * \return Status of computations + */ + virtual services::Status allocateResult() DAAL_C11_OVERRIDE + { + services::Status s = this->_result->template allocate(&(this->input), NULL, (int)method); + this->_res = 
this->_result.get(); + return s; + } + +protected: + Batch(size_t seed = 777) { initialize(); } + + Batch(const Batch & other) : super(other) { initialize(); } + + virtual Batch * cloneImpl() const DAAL_C11_OVERRIDE { return new Batch(*this); } + + void initialize() + { + Analysis::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env); + _in = &input; + _result.reset(new ResultType()); + } + +private: + ResultPtr _result; + + Batch & operator=(const Batch &); +}; +typedef services::SharedPtr > philox4x32x10Ptr; +typedef services::SharedPtr > philox4x32x10ConstPtr; + +} // namespace interface1 +using interface1::BatchContainer; +using interface1::Batch; +using interface1::philox4x32x10Ptr; +using interface1::philox4x32x10ConstPtr; +/** @} */ +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal +#endif diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h new file mode 100644 index 00000000000..d3da7ff32a9 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h @@ -0,0 +1,64 @@ +/* file: philox4x32x10_types.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 engine. +//-- +*/ + +#ifndef __PHILOX4X32X10_TYPES_H__ +#define __PHILOX4X32X10_TYPES_H__ + +#include "algorithms/algorithm.h" +#include "services/daal_defines.h" +#include "data_management/data/numeric_table.h" +#include "data_management/data/homogen_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +/** + * @defgroup engines_philox4x32x10 philox4x32x10 Engine + * \copydoc daal::algorithms::engines::philox4x32x10 + * @ingroup engines + * @{ + */ +/** + * \brief Contains classes for philox4x32x10 engine + */ +namespace philox4x32x10 +{ +/** + * + * Available methods to compute philox4x32x10 engine + */ +enum Method +{ + defaultDense = 0 /*!< Default: performance-oriented method. */ +}; + +} // namespace philox4x32x10 +/** @} */ +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h index 443f237e051..375d9c0b3b8 100755 --- a/cpp/daal/include/daal.h +++ b/cpp/daal/include/daal.h @@ -303,6 +303,8 @@ #include "algorithms/engines/mcg59/mcg59_types.h" #include "algorithms/engines/mrg32k3a/mrg32k3a.h" #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" #include "algorithms/engines/engine_family.h" #include "algorithms/engines/mt2203/mt2203.h" #include "algorithms/engines/mt2203/mt2203_types.h" diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h index 3e64c38660f..87b1155a0a4 100755 --- a/cpp/daal/include/daal_win.h +++ b/cpp/daal/include/daal_win.h @@ -315,6 +315,8 @@ #include "algorithms/engines/mcg59/mcg59_types.h" #include "algorithms/engines/mrg32k3a/mrg32k3a.h" #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include 
"algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" #include "algorithms/engines/engine_family.h" #include "algorithms/engines/mt2203/mt2203.h" #include "algorithms/engines/mt2203/mt2203_types.h" diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp new file mode 100644 index 00000000000..78b1014663a --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp @@ -0,0 +1,58 @@ +/* file: philox4x32x10.cpp */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 engine +//-- + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/externals/service_dispatch.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +using namespace daal::services; +using namespace philox4x32x10::internal; + +template +SharedPtr > Batch::create(size_t seed) +{ + SharedPtr > engPtr; +#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) 
engPtr.reset(new BatchImpl(__VA_ARGS__)); + + DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed); + +#undef DAAL_CREATE_ENGINE_CPU + return engPtr; +} + +template class Batch; +template class Batch; + +} // namespace interface1 +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h new file mode 100644 index 00000000000..fcffa11e0d7 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h @@ -0,0 +1,68 @@ +/* file: philox4x32x10_batch_container.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 calculation algorithm container. 
+//-- +*/ + +#ifndef __philox4x32x10_BATCH_CONTAINER_H__ +#define __philox4x32x10_BATCH_CONTAINER_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +template +BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) +{ + __DAAL_INITIALIZE_KERNELS(internal::philox4x32x10Kernel, algorithmFPType, method); +} + +template +BatchContainer::~BatchContainer() +{ + __DAAL_DEINITIALIZE_KERNELS(); +} + +template +services::Status BatchContainer::compute() +{ + daal::services::Environment::env & env = *_env; + engines::Result * result = static_cast(_res); + NumericTable * resultTable = result->get(engines::randomNumbers).get(); + + __DAAL_CALL_KERNEL(env, internal::philox4x32x10Kernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable); +} + +} // namespace interface1 +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h new file mode 100644 index 00000000000..8495fb3b883 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -0,0 +1,116 @@ +/* file: philox4x32x10_batch_impl.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the class defining the philox4x32x10 engine +//-- +*/ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/engines/engine_batch_impl.h" +#include "src/externals/service_rng.h" +#include "src/data_management/service_numeric_table.h" + +static const int leapfrogMethodErrcode = -1002; +static const int skipAheadMethodErrcode = -1003; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +template +class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch, public algorithms::engines::internal::BatchBaseImpl +{ +public: + typedef algorithms::engines::philox4x32x10::interface1::Batch super1; + typedef algorithms::engines::internal::BatchBaseImpl super2; + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_PHILOX4X32X10), super2(seed) {} + + void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } + + int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); } + + services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status leapfrogImpl(size_t threadNum, 
size_t nThreads) DAAL_C11_OVERRIDE + { + int errcode = baseRng.leapfrog(threadNum, nThreads); + services::Status s; + if (errcode == leapfrogMethodErrcode) + s.add(ErrorLeapfrogUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE + { + int errcode = baseRng.skipAhead(nSkip); + services::Status s; + if (errcode == skipAheadMethodErrcode) + s.add(ErrorSkipAheadUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + virtual BatchImpl * cloneImpl() const DAAL_C11_OVERRIDE + { + return new BatchImpl(*this); + } + + bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE + { + switch (technique) + { + case engines::internal::family: return false; + case engines::internal::skipahead: return true; + case engines::internal::leapfrog: return true; + } + return false; + } + + ~BatchImpl() {} + +protected: + BatchImpl(const BatchImpl & other) : super1(other), super2(other), baseRng(other.baseRng) {} + + daal::internal::BaseRNGsInst baseRng; +}; + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp new file mode 100644 index 00000000000..712bd3f7300 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp @@ -0,0 +1,47 @@ +/* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 calculation functions. +//-- + +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +template class BatchContainer; +} // namespace interface1 + +namespace internal +{ +template class philox4x32x10Kernel; +} // namespace internal + +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp new file mode 100644 index 00000000000..225d9f02da1 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp @@ -0,0 +1,30 @@ +/* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 calculation algorithm dispatcher. +//-- + +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h" + +namespace daal +{ +namespace algorithms +{ +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::philox4x32x10::BatchContainer, batch, DAAL_FPTYPE, engines::philox4x32x10::defaultDense) +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i new file mode 100644 index 00000000000..6c113d179c8 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i @@ -0,0 +1,49 @@ +/* file: philox4x32x10_impl.i */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 algorithm +//-- +*/ + +#ifndef __philox4x32x10_IMPL_I__ +#define __philox4x32x10_IMPL_I__ + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +template +Status philox4x32x10Kernel::compute(NumericTable * resultTensor) +{ + return Status(); +} + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h new file mode 100644 index 00000000000..28b689a9ab8 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h @@ -0,0 +1,58 @@ +/* file: philox4x32x10_kernel.h */ +/******************************************************************************* +* Copyright 2014 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Declaration of the template class that implements the philox4x32x10 engine kernel. 
+//-- + +#ifndef __philox4x32x10_KERNEL_H__ +#define __philox4x32x10_KERNEL_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/kernel.h" +#include "data_management/data/numeric_table.h" + +using namespace daal::services; +using namespace daal::data_management; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +/** + * \brief Kernel for philox4x32x10 calculation + */ +template +class philox4x32x10Kernel : public Kernel +{ +public: + Status compute(NumericTable * resultTable); +}; + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h index a911b2e5d8d..83edae913e2 100644 --- a/cpp/daal/src/externals/service_rng_mkl.h +++ b/cpp/daal/src/externals/service_rng_mkl.h @@ -33,6 +33,7 @@ #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 #define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A +#define __DAAL_BRNG_PHILOX4X32X10 VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h index 96c567b7366..3d1b9833a52 100644 --- a/cpp/daal/src/externals/service_rng_openrng.h +++ b/cpp/daal/src/externals/service_rng_openrng.h @@ -26,6 +26,7 @@ #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 #define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A +#define __DAAL_BRNG_PHILOX4X32X10 VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF diff 
--git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h index 7eafa70fb43..3890d0bdf2b 100644 --- a/cpp/daal/src/externals/service_rng_ref.h +++ b/cpp/daal/src/externals/service_rng_ref.h @@ -41,6 +41,7 @@ #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 //tmp #define __DAAL_BRNG_MRG32K3A (1 << 20) * 4 //VSL_BRNG_MRG32K3A + #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 4 //VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index b9488da808b..6b517b73c17 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include #include @@ -27,7 +28,7 @@ #include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::backend::primitives { -enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a }; +enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; template class daal_engine { @@ -77,6 +78,8 @@ class daal_engine { return daal::algorithms::engines::mcg59::Batch<>::create(seed); case engine_list_cpu::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_list_cpu::philox4x32x10: + return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); case engine_list_cpu::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 029fec3896c..7e8a69eba98 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp 
+++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -95,15 +95,18 @@ void oneapi_rng::shuffle_gpu(sycl::queue& queue, #define INSTANTIATE_FLOAT_(Size) \ INSTANTIATE_(float, Size, engine_list::mt2203) \ INSTANTIATE_(float, Size, engine_list::mcg59) \ - INSTANTIATE_(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_list::philox4x32x10) \ INSTANTIATE_(float, Size, engine_list::mt19937) \ INSTANTIATE_(double, Size, engine_list::mt2203) \ INSTANTIATE_(double, Size, engine_list::mcg59) \ INSTANTIATE_(double, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(double, Size, engine_list::philox4x32x10) \ INSTANTIATE_(double, Size, engine_list::mt19937) \ INSTANTIATE_(int, Size, engine_list::mt2203) \ INSTANTIATE_(int, Size, engine_list::mcg59) \ INSTANTIATE_(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_(int, Size, engine_list::mt19937) INSTANTIATE_FLOAT_(std::int64_t); @@ -121,14 +124,17 @@ INSTANTIATE_FLOAT_(std::int32_t); INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10) \ INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a) \ + INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10) \ INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ INSTANTIATE_CPU(int, Size, engine_list::mt2203) \ INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_CPU(int, Size, engine_list::mt19937) INSTANTIATE_FLOAT_CPU(std::int64_t); @@ -149,14 +155,17 @@ INSTANTIATE_FLOAT_CPU(std::int32_t); 
INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::philox4x32x10) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); @@ -174,6 +183,7 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index dd7bffd68d7..1f0f5c65225 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ 
b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include "oneapi/dal/table/common.hpp" diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp index 9cf27a2f4ee..a90b66c49a7 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include @@ -27,7 +28,7 @@ namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a}; +enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10}; template struct oneapi_engine_type; @@ -52,6 +53,11 @@ struct oneapi_engine_type { using type = oneapi::mkl::rng::mrg32k3a; }; +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::philox4x32x10; +}; + template class oneapi_engine { public: @@ -101,6 +107,7 @@ class oneapi_engine { return daal::algorithms::engines::mt2203::Batch<>::create(seed); case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_list::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); case engine_list::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index dd16eb3d3dc..bdefc472a58 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -28,6 +28,7 @@ class mt2203 {}; class mcg59 {}; 
class mrg32k3a {}; class mt19937 {}; +class philox4x32x10 {}; template struct engine_map {}; @@ -47,6 +48,11 @@ struct engine_map { constexpr static auto value = engine_list::mrg32k3a; }; +template <> +struct engine_map { + constexpr static auto value = engine_list::philox4x32x10; +}; + template <> struct engine_map { constexpr static auto value = engine_list::mt19937; @@ -139,7 +145,7 @@ class rng_test : public te::policy_fixture { } }; -using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a)); +using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { SKIP_IF(this->get_policy().is_cpu()); @@ -160,7 +166,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { this->check_results(arr_gpu, arr_host); } -using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a)); +using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10)); // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { // SKIP_IF(this->get_policy().is_cpu()); diff --git a/docs/source/daal/algorithms/engines/index.rst b/docs/source/daal/algorithms/engines/index.rst index 1c476178dc9..34113a6dd38 100644 --- a/docs/source/daal/algorithms/engines/index.rst +++ b/docs/source/daal/algorithms/engines/index.rst @@ -114,4 +114,5 @@ These methods are represented with member functions of classes that represent fu mt19937.rst mcg59.rst mrg32k3a.rst + philox4x32x10.rst mt2203.rst diff --git a/makefile.lst b/makefile.lst index db26829caef..b042ede80a7 100755 --- a/makefile.lst +++ b/makefile.lst @@ -65,7 +65,7 @@ multiclassclassifier += classifier k_nearest_neighbors += engines classifier logistic_regression += classifier optimization_solver objective_function engines implicit_als += engines distributions -engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/mt2203 +engines += 
engines/mt19937 engines/mcg59 engines/mrg32k3a engines/philox4x32x10 engines/mt2203 distributions += distributions/bernoulli distributions/normal distributions/uniform tsne += @@ -96,6 +96,7 @@ CORE.ALGORITHMS.FULL := \ engines \ engines/mcg59 \ engines/mrg32k3a \ + engines/philox4x32x10 \ engines/mt19937 \ engines/mt2203 \ em \ @@ -311,6 +312,7 @@ JJ.ALGORITHMS := adaboost engines \ engines/mcg59 \ engines/mrg32k3a \ + engines/philox4x32x10 \ engines/mt19937 \ engines/mt2203 \ em_gmm \ From 4941b236ff57f3922a80551118e7ffa0e2969cef Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Fri, 15 Nov 2024 02:20:22 -0800 Subject: [PATCH 06/18] a lot of fixes with rng --- .../algorithms/engines/mrg32k3a/mrg32k3a.h | 3 +- .../philox4x32x10/philox4x32x10_batch_impl.h | 2 +- cpp/daal/src/externals/service_rng_ref.h | 5 +- .../vertex_partitioning_default_kernel.hpp | 2 +- .../backend/gpu/train_kernel_hist_impl.hpp | 2 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 8 +- .../algo/louvain/backend/cpu/louvain_data.hpp | 2 +- .../vertex_partitioning_default_kernel.hpp | 2 +- .../optimizers/test/newton_cg_dpc.cpp | 4 +- .../dal/backend/primitives/rng/rng_cpu.hpp | 34 +++--- .../dal/backend/primitives/rng/rng_dpc.cpp | 16 +-- .../dal/backend/primitives/rng/rng_engine.hpp | 101 ----------------- .../primitives/rng/rng_engine_collection.hpp | 16 +-- .../dal/backend/primitives/rng/rng_gpu.hpp | 54 +++++---- .../dal/backend/primitives/rng/rng_types.hpp | 27 +++++ .../backend/primitives/rng/test/rng_dpc.cpp | 104 +++++++----------- 16 files changed, 134 insertions(+), 248 deletions(-) delete mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h index df6c1edf414..c35eb6a9f09 100644 --- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h +++ 
b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -17,7 +17,8 @@ /* //++ -// Implementation of the Mersenne Twister engine in the batch processing mode +// Implementation of the 32-bit combined multiple recursive generator with two components of order 3 +// in the batch processing mode. //-- */ diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h index 8495fb3b883..45e7f759729 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -96,7 +96,7 @@ class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch *components, dal::backend::primitives::daal_engine eng; dal::backend::primitives::daal_rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_cpu_engine_state(), 0, vertex_count); + rn_gen.uniform(samples_count, rnd_vertex_ids, eng, 0, vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 84e1d8f620f..1db5e078773 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -50,7 +50,7 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::daal_engine; + using rng_engine_t = pr::daal_engine; using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp 
index 10197bf0c43..42355b6caf5 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -401,7 +401,7 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; rn_gen.uniform(ctx.selected_row_total_count_, gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx].get_cpu_engine_state(), + rng_engine_list[engine_offset + node_idx], 0, ctx.row_total_count_); @@ -491,7 +491,7 @@ train_kernel_hist_impl::gen_feature_list( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(), + rng_engine_list[tree_map_ptr[node]], 0, ctx.column_count_); } @@ -539,7 +539,7 @@ train_kernel_hist_impl::gen_random_thresholds( for (Index node = 0; node < node_count; ++node) { rn_gen.uniform(ctx.selected_ftr_count_, random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(), + rng_engine_list[tree_map_ptr[node]], 0.0f, 1.0f); } @@ -1666,7 +1666,7 @@ sycl::event train_kernel_hist_impl::compute_results( rn_gen.shuffle( oob_row_count, permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state()); + engine_arr[built_tree_count + tree_idx_in_block]); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index b016a5bf6e9..d2751b3840b 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -123,7 +123,7 @@ struct louvain_data { // Total link weight in the network value_type m; - daal_engine eng; + daal_engine eng; daal_rng 
rn_gen; const std::int64_t vertex_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index 7b277d88283..ff78f06f833 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_cpu_engine_state(), 0, t._vertex_count); + ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index d4f5ea55fb9..b24a59386c7 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture { auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::rng rn_gen; primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0); + rn_gen.uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0); + rn_gen.uniform(n_, 
x_host.get_mutable_data(), eng, -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index 6b517b73c17..2cb18c72c1f 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -16,21 +16,15 @@ #pragma once -#include -#include -#include -#include -#include #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include #include #include #include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" namespace oneapi::dal::backend::primitives { -enum class engine_list_cpu { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; - -template +template class daal_engine { public: explicit daal_engine(std::int64_t seed = 777) @@ -72,15 +66,15 @@ class daal_engine { private: daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { switch (EngineType) { - case engine_list_cpu::mt2203: + case engine_list::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); - case engine_list_cpu::mcg59: + case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); - case engine_list_cpu::mrg32k3a: + case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); - case engine_list_cpu::philox4x32x10: + case engine_list::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); - case engine_list_cpu::mt19937: + case engine_list::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); } @@ -96,16 +90,20 @@ class daal_rng { daal_rng() = default; ~daal_rng() = default; - void uniform(Size count, Type* dst, void* state, Type a, Type b) { + 
template + void uniform(Size count, Type* dst, daal_engine daal_engine, Type a, Type b) { + auto state = daal_engine.get_cpu_engine_state(); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } + template void uniform_without_replacement_cpu(Size count, Type* dst, Type* buffer, - void* state, + daal_engine daal_engine, Type a, Type b) { + auto state = daal_engine.get_cpu_engine_state(); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, @@ -114,10 +112,10 @@ class daal_rng { b); } - template >> - void shuffle(Size count, Type* dst, void* state) { + template >> + void shuffle(Size count, Type* dst, daal_engine daal_engine) { Type idx[2]; - + auto state = daal_engine.get_cpu_engine_state(); for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 7e8a69eba98..68a8eabaa0a 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -27,7 +27,7 @@ template void oneapi_rng::uniform_gpu(sycl::queue& queue, Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b, const event_vector& deps) { @@ -41,7 +41,7 @@ template template void oneapi_rng::uniform_cpu(Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b) { void* state = engine_.get_cpu_engine_state(); @@ -55,7 +55,7 @@ void oneapi_rng::uniform_without_replacement_gpu(sycl::queue& queue, Size count, Type* dst, Type* buffer, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b, const event_vector& deps) { @@ -69,7 +69,7 @@ template void oneapi_rng::shuffle_gpu(sycl::queue& queue, Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, const event_vector& deps) { Type idx[2]; @@ -87,7 +87,7 @@ void 
oneapi_rng::shuffle_gpu(sycl::queue& queue, sycl::queue& queue, \ Size count_, \ F* dst, \ - oneapi_engine& engine_, \ + onedal_engine& engine_, \ F a, \ F b, \ const event_vector& deps); @@ -116,7 +116,7 @@ INSTANTIATE_FLOAT_(std::int32_t); template ONEDAL_EXPORT void oneapi_rng::uniform_cpu( \ Size count_, \ F* dst, \ - oneapi_engine& engine_, \ + onedal_engine& engine_, \ F a, \ F b); @@ -146,7 +146,7 @@ INSTANTIATE_FLOAT_CPU(std::int32_t); Size count_, \ F* dst, \ F* buff, \ - oneapi_engine& engine_, \ + onedal_engine& engine_, \ F a, \ F b, \ const event_vector& deps); @@ -176,7 +176,7 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); sycl::queue& queue, \ Size count_, \ F* dst, \ - oneapi_engine& engine_, \ + onedal_engine& engine_, \ const event_vector& deps); #define INSTANTIATE_SHUFFLE_FLOAT(Size) \ diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp deleted file mode 100644 index c8ca3b13ce9..00000000000 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#pragma once - -#include - -#include "oneapi/dal/backend/primitives/rng/utils.hpp" - -namespace oneapi::dal::backend::primitives { - -template -class rng { -public: - rng() = default; - ~rng() = default; - - void uniform(Size count, Type* dst, void* state, Type a, Type b) { - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } - - void uniform_without_replacement(Size count, - Type* dst, - Type* buffer, - void* state, - Type a, - Type b) { - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); - } - - template >> - void shuffle(Size count, Type* dst, void* state) { - Type idx[2]; - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } - -private: - daal::internal::RNGsInst daal_rng_; -}; - -class engine { -public: - explicit engine(std::int64_t seed = 777) - : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) { - impl_ = dynamic_cast(engine_.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) { - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - virtual ~engine() = default; - - engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { - engine_ = eng; - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - - return *this; - } - - void* get_state() const { - return impl_->getState(); - } - -private: - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::BatchBaseImpl* impl_; -}; - -} // namespace oneapi::dal::backend::primitives diff --git 
a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 1f0f5c65225..76c56f61f7c 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -20,12 +20,8 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include -#include -#include -#include -#include -#include #include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" #include "oneapi/dal/table/common.hpp" namespace oneapi::dal::backend::primitives { @@ -43,7 +39,7 @@ class engine_collection { daal_engine_list_(count) {} template - std::vector> operator()(Op&& op) { + std::vector> operator()(Op&& op) { daal::services::Status status; for (Size i = 0; i < count_; ++i) { op(i, params_.nSkip[i]); @@ -59,7 +55,7 @@ class engine_collection { dal::backend::interop::status_to_exception(status); } - std::vector> engine_list(count_); + std::vector> engine_list(count_); for (Size i = 0; i < count_; ++i) { engine_list[i] = daal_engine_list_[i]; } @@ -108,18 +104,18 @@ class engine_collection_oneapi { seed_(seed) { engines_.reserve(count_); for (Size i = 0; i < count_; ++i) { - engines_.push_back(oneapi_engine(queue, seed_)); + engines_.push_back(onedal_engine(queue, seed_)); } } - std::vector> get_engines() const { + std::vector> get_engines() const { return engines_; } private: Size count_; std::int64_t seed_; - std::vector> engines_; + std::vector> engines_; }; #endif diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp index a90b66c49a7..a68df41c541 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp @@ -16,57 +16,51 @@ #pragma once -#include -#include -#include -#include -#include #include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include 
"oneapi/dal/backend/primitives/rng/rng_types.hpp" #include namespace mkl = oneapi::mkl; namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10}; - template -struct oneapi_engine_type; +struct onedal_engine_type; template <> -struct oneapi_engine_type { +struct onedal_engine_type { using type = oneapi::mkl::rng::mt2203; }; template <> -struct oneapi_engine_type { +struct onedal_engine_type { using type = oneapi::mkl::rng::mcg59; }; template <> -struct oneapi_engine_type { +struct onedal_engine_type { using type = oneapi::mkl::rng::mt19937; }; template <> -struct oneapi_engine_type { +struct onedal_engine_type { using type = oneapi::mkl::rng::mrg32k3a; }; template <> -struct oneapi_engine_type { +struct onedal_engine_type { using type = oneapi::mkl::rng::philox4x32x10; }; template -class oneapi_engine { +class onedal_engine { public: - using onedal_engine_t = typename oneapi_engine_type::type; + using onedal_engine_t = typename onedal_engine_type::type; - explicit oneapi_engine(sycl::queue& queue, std::int64_t seed = 777) + explicit onedal_engine(sycl::queue& queue, std::int64_t seed = 777) : q(queue), daal_engine_(initialize_daal_engine(seed)), - onedal_engine_(initialize_oneapi_engine(queue, seed)), + onedal_engine_(initialize_onedal_engine(queue, seed)), impl_(dynamic_cast( daal_engine_.get())) { if (!impl_) { @@ -74,7 +68,7 @@ class oneapi_engine { } } - virtual ~oneapi_engine() = default; + virtual ~onedal_engine() = default; void* get_cpu_engine_state() const { return impl_->getState(); @@ -93,6 +87,7 @@ class oneapi_engine { } void skip_ahead_gpu(size_t nSkip) { + // Will be fixed in the next oneMKL release. 
if constexpr (EngineType == engine_list::mt2203) { } else { @@ -114,10 +109,10 @@ class oneapi_engine { } } - onedal_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) { + onedal_engine_t initialize_onedal_engine(sycl::queue& queue, std::int64_t seed) { if constexpr (EngineType == engine_list::mt2203) { return onedal_engine_t(queue, seed, - 0); // Aligns CPU and GPU results for mt2203 + 0); // Aligns CPU and GPU results for mt2203, impacts the performance. } else { return onedal_engine_t(queue, seed); @@ -139,7 +134,7 @@ class oneapi_rng { void uniform(sycl::queue& queue, Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b, bool distr_mode = false, @@ -149,18 +144,19 @@ class oneapi_rng { void uniform_gpu(sycl::queue& queue, Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b, const event_vector& deps = {}); template - void uniform_cpu(Size count, Type* dst, oneapi_engine& engine_, Type a, Type b); + void uniform_cpu(Size count, Type* dst, onedal_engine& engine_, Type a, Type b); + template void uniform_without_replacement(sycl::queue& queue, Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b, const event_vector& deps = {}) {} @@ -170,7 +166,7 @@ class oneapi_rng { Size count, Type* dst, Type* buff, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b, const event_vector& deps = {}); @@ -179,7 +175,7 @@ class oneapi_rng { void uniform_without_replacement_cpu(Size count, Type* dst, Type* buffer, - oneapi_engine& engine_, + onedal_engine& engine_, Type a, Type b) { void* state = engine_.get_cpu_engine_state(); @@ -195,7 +191,7 @@ class oneapi_rng { template >> - void shuffle(Size count, Type* dst, oneapi_engine& engine_) { + void shuffle(Size count, Type* dst, onedal_engine& engine_) { Type idx[2]; void* state = engine_.get_cpu_engine_state(); @@ -211,13 +207,13 @@ class oneapi_rng { void shuffle_gpu(sycl::queue& 
queue, Size count, Type* dst, - oneapi_engine& engine_, + onedal_engine& engine_, const event_vector& deps); template >> - void shuffle_cpu(Size count, Type* dst, oneapi_engine& engine_) { + void shuffle_cpu(Size count, Type* dst, onedal_engine& engine_) { Type idx[2]; void* state = engine_.get_cpu_engine_state(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp new file mode 100644 index 00000000000..7c1691e5e85 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp @@ -0,0 +1,27 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace oneapi::dal::backend::primitives { +enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; +} diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index bdefc472a58..d3303dc61f6 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -64,83 +64,42 @@ constexpr auto engine_v = engine_map::value; template class rng_test : public te::policy_fixture { public: - using Index = std::tuple_element_t<0, TestType>; + using DataType = std::tuple_element_t<0, TestType>; using EngineType = std::tuple_element_t<1, TestType>; static constexpr auto engine_qq = engine_v; auto get_rng() const { - oneapi_rng rn_gen; + oneapi_rng rn_gen; return rn_gen; } auto get_engine(std::int64_t seed) { - auto rng_engine = oneapi_engine(this->get_queue(), seed); + auto rng_engine = onedal_engine(this->get_queue(), seed); return rng_engine; } - auto allocate_arrays(std::int64_t elem_count) { - auto& q = this->get_queue(); - auto val_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); - auto val_host = ndarray::empty({ elem_count }); - - return std::make_tuple(val_gpu, val_host); - } + auto allocate_array_host(std::int64_t elem_count) { + auto arr_host = ndarray::empty({ elem_count }); - auto allocate_arrays_shared(std::int64_t elem_count) { - auto& q = this->get_queue(); - auto val_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::shared); - auto val_host = ndarray::empty(q, { elem_count }, sycl::usm::alloc::shared); - - return std::make_tuple(val_gpu, val_host); + return arr_host; } - auto allocate_arrays_device(std::int64_t elem_count) { + auto allocate_array_device(std::int64_t elem_count) { auto& q = this->get_queue(); - auto val_gpu_1 = 
ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); - auto val_gpu_2 = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); - - return std::make_tuple(val_gpu_1, val_gpu_2); - } - - auto allocate_arrays_host(std::int64_t elem_count) { - auto val_host_1 = ndarray::empty({ elem_count }); - auto val_host_2 = ndarray::empty({ elem_count }); - - return std::make_tuple(val_host_1, val_host_2); - } - - void check_results_host(const ndarray& val_host_1, - const ndarray& val_host_2) { - const Index* val_host_1_ptr = val_host_1.get_data(); + auto arr_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); - const Index* val_host_2_ptr = val_host_2.get_data(); - - for (std::int64_t el = 0; el < val_host_1.get_count(); el++) { - REQUIRE(abs(val_host_1_ptr[el] - val_host_2_ptr[el]) < 1); - } + return arr_gpu; } - void check_results_device(const ndarray& val_gpu_1, - const ndarray& val_gpu_2) { - const auto val_gpu_host_1 = val_gpu_1.to_host(this->get_queue()); - const Index* val_gpu_host_1_ptr = val_gpu_host_1.get_data(); + void check_results(const ndarray& arr_1, const ndarray& arr_2) { + const auto arr_1_host = arr_1.to_host(this->get_queue()); + const DataType* val_arr_1_host_ptr = arr_1_host.get_data(); - const auto val_gpu_host_2 = val_gpu_2.to_host(this->get_queue()); - const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data(); + const auto arr_2_host = arr_2.to_host(this->get_queue()); + const DataType* val_arr_2_host_ptr = arr_2_host.get_data(); - for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) { - REQUIRE(abs(val_gpu_host_2_ptr[el] - val_gpu_host_1_ptr[el]) < 1); - } - } - - void check_results(const ndarray& val_gpu, const ndarray& val_host) { - const Index* val_host_ptr = val_host.get_data(); - - const auto val_gpu_host = val_gpu.to_host(this->get_queue()); - const Index* val_gpu_host_ptr = val_gpu_host.get_data(); - - for (std::int64_t el = 0; el < val_host.get_count(); el++) { - REQUIRE(abs(val_gpu_host_ptr[el] - 
val_host_ptr[el]) < 1); + for (std::int64_t el = 0; el < arr_2_host.get_count(); el++) { + REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 1); } } }; @@ -152,7 +111,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); std::int64_t seed = GENERATE_COPY(777, 999); - auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); @@ -166,8 +126,9 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { this->check_results(arr_gpu, arr_host); } -using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10)); +using rng_types_skip_ahead_support = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10)); +//Just for perf tests // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { // SKIP_IF(this->get_policy().is_cpu()); // std::int64_t elem_count = @@ -192,13 +153,17 @@ using rng_types_skip = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4 // } -TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) { +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); std::int64_t seed = GENERATE_COPY(777, 999); - auto [arr_host_init_1, arr_host_init_2] = this->allocate_arrays_host(elem_count); - auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); + auto arr_host_init_1 = this->allocate_array_host(elem_count); + auto arr_host_init_2 = this->allocate_array_host(elem_count); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + auto arr_host_init_1_ptr = 
arr_host_init_1.get_mutable_data(); auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data(); auto arr_gpu_ptr = arr_gpu.get_mutable_data(); @@ -214,17 +179,21 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) { rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); - this->check_results_host(arr_host_init_1, arr_host_init_2); + this->check_results(arr_host_init_1, arr_host_init_2); this->check_results(arr_gpu, arr_host); } -TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) { +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); std::int64_t seed = GENERATE_COPY(1, 777, 999); - auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_device(elem_count); - auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); + auto arr_device_init_1 = this->allocate_array_device(elem_count); + auto arr_device_init_2 = this->allocate_array_device(elem_count); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); auto arr_gpu_ptr = arr_gpu.get_mutable_data(); @@ -250,10 +219,11 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) { rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); - this->check_results_device(arr_device_init_1, arr_device_init_2); + this->check_results(arr_device_init_1, arr_device_init_2); this->check_results(arr_gpu, arr_host); } +//TODO: add engine collection test + daal_engine tests // 
TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { // SKIP_IF(this->get_policy().is_cpu()); // std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); From c55bcce40a96d788e3a332610e169ca2f7509874 Mon Sep 17 00:00:00 2001 From: "Solovev, Aleksandr" Date: Mon, 18 Nov 2024 13:17:38 +0100 Subject: [PATCH 07/18] fixes --- .../algorithms/engines/mrg32k3a/mrg32k3a.h | 5 ++-- .../engines/mrg32k3a/mrg32k3a_types.h | 6 ++-- .../engines/philox4x32x10/philox4x32x10.h | 4 ++- .../philox4x32x10/philox4x32x10_types.h | 6 ++-- cpp/daal/include/daal.h | 6 ++-- cpp/daal/include/daal_win.h | 6 ++-- .../algorithms/engines/mrg32k3a/mrg32k3a.cpp | 8 ++++-- .../mrg32k3a/mrg32k3a_batch_container.h | 3 +- .../engines/mrg32k3a/mrg32k3a_batch_impl.h | 5 ++-- .../mrg32k3a_dense_default_batch_fpt_cpu.cpp | 3 +- ...k3a_dense_default_batch_fpt_dispatcher.cpp | 3 +- .../engines/mrg32k3a/mrg32k3a_impl.i | 5 ++-- .../engines/mrg32k3a/mrg32k3a_kernel.h | 5 ++-- .../engines/philox4x32x10/philox4x32x10.cpp | 6 ++-- .../philox4x32x10_batch_container.h | 3 +- .../philox4x32x10/philox4x32x10_batch_impl.h | 3 +- ...lox4x32x10_dense_default_batch_fpt_cpu.cpp | 3 +- ...x10_dense_default_batch_fpt_dispatcher.cpp | 3 +- .../philox4x32x10/philox4x32x10_impl.i | 5 ++-- .../philox4x32x10/philox4x32x10_kernel.h | 5 ++-- cpp/daal/src/externals/service_rng.h | 28 +++++++++++-------- cpp/daal/src/externals/service_rng_ref.h | 1 + .../dal/backend/primitives/rng/rng_dpc.cpp | 2 +- .../dal/backend/primitives/rng/rng_types.hpp | 2 ++ 24 files changed, 78 insertions(+), 48 deletions(-) diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h index c35eb6a9f09..b794813a227 100644 --- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -1,6 +1,7 @@ /* file: mrg32k3a.h */ 
/******************************************************************************* * Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +18,8 @@ /* //++ -// Implementation of the 32-bit combined multiple recursive generator with two components of order 3 -// in the batch processing mode. +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. //-- */ diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h index 77ca9656418..a6b9d699f77 100644 --- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h @@ -1,6 +1,7 @@ /* file: mrg32k3a_types.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,8 @@ /* //++ -// Implementation of mrg32k3a engine. +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. 
//-- */ diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h index 09eae5a7cd8..ec82723f1f8 100644 --- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h @@ -1,6 +1,7 @@ /* file: philox4x32x10.h */ /******************************************************************************* * Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,8 @@ /* //++ -// Implementation of the Mersenne Twister engine in the batch processing mode +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. //-- */ diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h index d3da7ff32a9..74d2e884670 100644 --- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h @@ -1,6 +1,7 @@ /* file: philox4x32x10_types.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,8 @@ /* //++ -// Implementation of philox4x32x10 engine. +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. 
//-- */ diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h index 375d9c0b3b8..f49625f8939 100755 --- a/cpp/daal/include/daal.h +++ b/cpp/daal/include/daal.h @@ -297,6 +297,9 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" @@ -305,9 +308,6 @@ #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" #include "algorithms/engines/philox4x32x10/philox4x32x10.h" #include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" -#include "algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/dbscan/dbscan_types.h" #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h index 87b1155a0a4..a15ed7db26e 100755 --- a/cpp/daal/include/daal_win.h +++ b/cpp/daal/include/daal_win.h @@ -309,6 +309,9 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" @@ -317,9 +320,6 @@ #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" #include "algorithms/engines/philox4x32x10/philox4x32x10.h" #include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" -#include 
"algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/dbscan/dbscan_types.h" #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp index 288cb0506ee..8f10b1e1e87 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp @@ -1,6 +1,7 @@ /* file: mrg32k3a.cpp */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +17,13 @@ *******************************************************************************/ //++ -// Implementation of mrg32k3a engine +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. 
//-- #include "algorithms/engines/mrg32k3a/mrg32k3a.h" -#include "src/externals/service_dispatch.h" #include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h" +#include "src/externals/service_dispatch.h" namespace daal { diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h index 1fb8f9ca991..31126c4300f 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h @@ -1,6 +1,7 @@ /* file: mrg32k3a_batch_container.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h index bbe3cf2dcf9..251caf0d3de 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -1,6 +1,7 @@ /* file: mrg32k3a_batch_impl.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,7 @@ /* //++ -// Implementation of the class defining the mrg32k3a engine +// Implementation of the class defining the mrg32k3a engine. 
//-- */ diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp index 2af52dd0443..1d3820053bd 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp @@ -1,6 +1,7 @@ /* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp index 482486e243f..1b3f3c618e9 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp @@ -1,6 +1,7 @@ /* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i index 5e359ecaaa3..06d670f1f7a 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i @@ -1,6 +1,7 @@ /* file: mrg32k3a_impl.i */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,7 @@ /* //++ -// Implementation of mrg32k3a algorithm +// Implementation of mrg32k3a algorithm. //-- */ diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h index 3959576ccbe..86b8d929aae 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h @@ -1,6 +1,7 @@ /* file: mrg32k3a_kernel.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +17,7 @@ *******************************************************************************/ //++ -// Declaration of template function that calculate mrg32k3as. +// Declaration of a template function for calculating values using the MRG32k3a generator. 
//-- #ifndef __mrg32k3a_KERNEL_H__ diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp index 78b1014663a..969c135a875 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp @@ -1,6 +1,7 @@ /* file: philox4x32x10.cpp */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +17,8 @@ *******************************************************************************/ //++ -// Implementation of philox4x32x10 engine +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. //-- #include "algorithms/engines/philox4x32x10/philox4x32x10.h" diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h index fcffa11e0d7..7a721c4f1a8 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h @@ -1,6 +1,7 @@ /* file: philox4x32x10_batch_container.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h index 45e7f759729..fdbc4bd97f0 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -1,6 +1,7 @@ /* file: philox4x32x10_batch_impl.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp index 712bd3f7300..e1ed7b4d896 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp @@ -1,6 +1,7 @@ /* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp index 225d9f02da1..1f79b94c762 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp @@ -1,6 +1,7 @@ /* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i index 6c113d179c8..9e2dc9f6b99 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i @@ -1,6 +1,7 @@ /* file: philox4x32x10_impl.i */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +18,7 @@ /* //++ -// Implementation of philox4x32x10 algorithm +// Implementation of philox4x32x10 algorithm. 
//-- */ diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h index 28b689a9ab8..47333a6c78f 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h @@ -1,6 +1,7 @@ /* file: philox4x32x10_kernel.h */ /******************************************************************************* -* Copyright 2014 Intel Corporation +* Copyright 2024 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +17,7 @@ *******************************************************************************/ //++ -// Declaration of template function that calculate philox4x32x10s. +// Declaration of a template function for generating values using the Philox4x32-10 engine. //-- #ifndef __philox4x32x10_KERNEL_H__ diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h index d03c0f39abf..5974343a865 100644 --- a/cpp/daal/src/externals/service_rng.h +++ b/cpp/daal/src/externals/service_rng.h @@ -115,7 +115,12 @@ class RNGs int uniformWithoutReplacement(const SizeType n, DstType * r, void * state, const Type a, const Type b, const int method = __DAAL_RNG_METHOD_UNIFORM_STD) { - Type * buffer = (Type *)daal_malloc(sizeof(Type) * n); + SizeType sequence_size = abs(b-a); + Type * buffer = (Type *)daal_malloc(sizeof(Type) * sequence_size); + for (SizeType i = 0; i < sequence_size; i++) + { + buffer[i]=i; + } int errorcode = uniformWithoutReplacement(n, r, buffer, state, a, b, method); daal_free(buffer); return errorcode; @@ -126,19 +131,18 @@ class RNGs const int method = __DAAL_RNG_METHOD_UNIFORM_STD) { int errorcode = 0; + SizeType sequence_size = abs(b-a); + DstType swapIdx; + for (SizeType i = 0; i < n; i++) + { + errorcode 
= uniform(1, &swapIdx, state, i, n - 1, method); + auto tmp = buffer[i]; + buffer[i] = buffer[swapIdx]; + buffer[swapIdx] = tmp; + } for (SizeType i = 0; i < n; i++) { - errorcode = uniform(1, buffer + i, state, a + i, b, method); - int value = buffer[i]; - - for (SizeType j = i; j > 0; j--) - { - if (value == buffer[j - 1]) - { - value = (DstType)(j - 1 + a); - } - } - r[i] = value; + r[i] = buffer[i]; } return errorcode; } diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h index 0445614b3e3..590f0bb2aae 100644 --- a/cpp/daal/src/externals/service_rng_ref.h +++ b/cpp/daal/src/externals/service_rng_ref.h @@ -41,6 +41,7 @@ #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 #define __DAAL_BRNG_MRG32K3A (1 << 20) * 3 //VSL_BRNG_MRG32K3A #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10 + #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 68a8eabaa0a..c509db057cc 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -189,4 +189,4 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); -} // namespace oneapi::dal::backend::primitives \ No newline at end of file +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp index 7c1691e5e85..d502e9282ee 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp @@ -23,5 +23,7 @@ #include namespace oneapi::dal::backend::primitives { + enum class engine_list { mt2203, 
mcg59, mt19937, mrg32k3a, philox4x32x10 }; + } From 806a74ce28e09b86545015614acd909c6b7c8524 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Mon, 18 Nov 2024 06:06:51 -0800 Subject: [PATCH 08/18] clang + fisher yates --- .../algorithms/engines/mrg32k3a/mrg32k3a.cpp | 2 +- .../engines/mrg32k3a/mrg32k3a_batch_impl.h | 3 +- .../engines/philox4x32x10/philox4x32x10.cpp | 2 +- .../philox4x32x10/philox4x32x10_batch_impl.h | 3 +- cpp/daal/src/externals/service_rng.h | 32 ++++--- cpp/daal/src/externals/service_rng_ref.h | 10 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 7 +- .../dal/backend/primitives/rng/rng_cpu.hpp | 7 +- .../dal/backend/primitives/rng/rng_dpc.cpp | 96 +++++++++---------- .../dal/backend/primitives/rng/rng_gpu.hpp | 12 ++- .../backend/primitives/rng/test/rng_dpc.cpp | 56 ++++++----- 11 files changed, 125 insertions(+), 105 deletions(-) diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp index 8f10b1e1e87..fe015c85428 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp @@ -17,7 +17,7 @@ *******************************************************************************/ //++ -// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator // with two components of order 3, optimized for batch processing. 
//-- diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h index 251caf0d3de..0ff55f39b62 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -41,7 +41,8 @@ namespace mrg32k3a namespace internal { template -class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch, public algorithms::engines::internal::BatchBaseImpl +class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch, + public algorithms::engines::internal::BatchBaseImpl { public: typedef algorithms::engines::mrg32k3a::interface1::Batch super1; diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp index 969c135a875..c103a4ae068 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp @@ -17,7 +17,7 @@ *******************************************************************************/ //++ -// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) // that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. 
//-- diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h index fdbc4bd97f0..f6a9f35e268 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -41,7 +41,8 @@ namespace philox4x32x10 namespace internal { template -class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch, public algorithms::engines::internal::BatchBaseImpl +class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch, + public algorithms::engines::internal::BatchBaseImpl { public: typedef algorithms::engines::philox4x32x10::interface1::Batch super1; diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h index 5974343a865..ba95f8f7324 100644 --- a/cpp/daal/src/externals/service_rng.h +++ b/cpp/daal/src/externals/service_rng.h @@ -115,12 +115,7 @@ class RNGs int uniformWithoutReplacement(const SizeType n, DstType * r, void * state, const Type a, const Type b, const int method = __DAAL_RNG_METHOD_UNIFORM_STD) { - SizeType sequence_size = abs(b-a); - Type * buffer = (Type *)daal_malloc(sizeof(Type) * sequence_size); - for (SizeType i = 0; i < sequence_size; i++) - { - buffer[i]=i; - } + Type * buffer = (Type *)daal_malloc(sizeof(Type) * 1); int errorcode = uniformWithoutReplacement(n, r, buffer, state, a, b, method); daal_free(buffer); return errorcode; @@ -130,19 +125,28 @@ class RNGs int uniformWithoutReplacement(const SizeType n, DstType * r, Type * buffer, void * state, const Type a, const Type b, const int method = __DAAL_RNG_METHOD_UNIFORM_STD) { - int errorcode = 0; - SizeType sequence_size = abs(b-a); - DstType swapIdx; + int errorcode = 0; + SizeType sequence_size = abs(b - a); + if (sequence_size < n) + { + return -1; + } + Type * buffer_ = (Type *)daal_malloc(sizeof(Type) * sequence_size); + for 
(SizeType i = 0; i < sequence_size; i++) + { + buffer_[i] = i; + } + Type swapIdx; for (SizeType i = 0; i < n; i++) { - errorcode = uniform(1, &swapIdx, state, i, n - 1, method); - auto tmp = buffer[i]; - buffer[i] = buffer[swapIdx]; - buffer[swapIdx] = tmp; + errorcode = uniform(1, &swapIdx, state, i, sequence_size, method); + int index = int(swapIdx); + + std::swap(buffer_[i], buffer_[index]); } for (SizeType i = 0; i < n; i++) { - r[i] = buffer[i]; + r[i] = buffer_[i]; } return errorcode; } diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h index 590f0bb2aae..6b7aa53359e 100644 --- a/cpp/daal/src/externals/service_rng_ref.h +++ b/cpp/daal/src/externals/service_rng_ref.h @@ -36,11 +36,11 @@ #include // RNGs - #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 - #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 - #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 - #define __DAAL_BRNG_MRG32K3A (1 << 20) * 3 //VSL_BRNG_MRG32K3A - #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10 + #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 + #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 + #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 + #define __DAAL_BRNG_MRG32K3A (1 << 20) * 3 //VSL_BRNG_MRG32K3A + #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index 42355b6caf5..c0ee89c4d64 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -1663,10 +1663,9 @@ sycl::event train_kernel_hist_impl::compute_results( pr::daal_rng 
rn_gen; for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle( - oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block]); + rn_gen.shuffle(oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block]); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index 2cb18c72c1f..6c602aa6612 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -68,8 +68,7 @@ class daal_engine { switch (EngineType) { case engine_list::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); - case engine_list::mcg59: - return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); case engine_list::philox4x32x10: @@ -112,7 +111,9 @@ class daal_rng { b); } - template >> + template >> void shuffle(Size count, Type* dst, daal_engine daal_engine) { Type idx[2]; auto state = daal_engine.get_cpu_engine_state(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index c509db057cc..1fa9e36a679 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -92,21 +92,21 @@ void oneapi_rng::shuffle_gpu(sycl::queue& queue, F b, \ const event_vector& deps); -#define INSTANTIATE_FLOAT_(Size) \ - INSTANTIATE_(float, Size, engine_list::mt2203) \ - INSTANTIATE_(float, Size, engine_list::mcg59) \ - INSTANTIATE_(float, Size, engine_list::mrg32k3a) \ - INSTANTIATE_(float, Size, engine_list::philox4x32x10) \ - INSTANTIATE_(float, Size, engine_list::mt19937) \ - INSTANTIATE_(double, Size, 
engine_list::mt2203) \ - INSTANTIATE_(double, Size, engine_list::mcg59) \ - INSTANTIATE_(double, Size, engine_list::mrg32k3a) \ - INSTANTIATE_(double, Size, engine_list::philox4x32x10) \ - INSTANTIATE_(double, Size, engine_list::mt19937) \ - INSTANTIATE_(int, Size, engine_list::mt2203) \ - INSTANTIATE_(int, Size, engine_list::mcg59) \ - INSTANTIATE_(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_(int, Size, engine_list::philox4x32x10) \ +#define INSTANTIATE_FLOAT_(Size) \ + INSTANTIATE_(float, Size, engine_list::mt2203) \ + INSTANTIATE_(float, Size, engine_list::mcg59) \ + INSTANTIATE_(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_list::philox4x32x10) \ + INSTANTIATE_(float, Size, engine_list::mt19937) \ + INSTANTIATE_(double, Size, engine_list::mt2203) \ + INSTANTIATE_(double, Size, engine_list::mcg59) \ + INSTANTIATE_(double, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(double, Size, engine_list::philox4x32x10) \ + INSTANTIATE_(double, Size, engine_list::mt19937) \ + INSTANTIATE_(int, Size, engine_list::mt2203) \ + INSTANTIATE_(int, Size, engine_list::mcg59) \ + INSTANTIATE_(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_(int, Size, engine_list::mt19937) INSTANTIATE_FLOAT_(std::int64_t); @@ -120,21 +120,21 @@ INSTANTIATE_FLOAT_(std::int32_t); F a, \ F b); -#define INSTANTIATE_FLOAT_CPU(Size) \ - INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ - INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ - INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a) \ - INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10) \ - INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ - INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ - INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ - INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a) \ - INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10) \ - INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ - INSTANTIATE_CPU(int, Size, 
engine_list::mt2203) \ - INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ - INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10) \ +#define INSTANTIATE_FLOAT_CPU(Size) \ + INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10) \ + INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ + INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a) \ + INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10) \ + INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ + INSTANTIATE_CPU(int, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_CPU(int, Size, engine_list::mt19937) INSTANTIATE_FLOAT_CPU(std::int64_t); @@ -151,21 +151,21 @@ INSTANTIATE_FLOAT_CPU(std::int32_t); F b, \ const event_vector& deps); -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \ +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ + 
INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); @@ -179,10 +179,10 @@ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); onedal_engine& engine_, \ const event_vector& deps); -#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ - INSTANTIATE_SHUFFLE(int, Size, 
engine_list::mcg59) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::philox4x32x10) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp index a68df41c541..8e62ca88c1b 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp @@ -101,8 +101,10 @@ class onedal_engine { case engine_list::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); - case engine_list::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); - case engine_list::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); + case engine_list::mrg32k3a: + return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_list::philox4x32x10: + return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); case engine_list::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); @@ -111,8 +113,10 @@ class onedal_engine { onedal_engine_t initialize_onedal_engine(sycl::queue& queue, std::int64_t seed) { if constexpr (EngineType == engine_list::mt2203) { - return onedal_engine_t(queue, seed, - 0); // Aligns CPU and GPU results for mt2203, impacts the performance. + return onedal_engine_t( + queue, + seed, + 0); // Aligns CPU and GPU results for mt2203, impacts the performance. 
} else { return onedal_engine_t(queue, seed); diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index d3303dc61f6..719fe429411 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -73,6 +73,16 @@ class rng_test : public te::policy_fixture { return rn_gen; } + auto get_daal_rng() const { + daal_rng rn_gen; + return rn_gen; + } + + auto get_daal_engine(std::int64_t seed) { + auto rng_engine = daal_engine(seed); + return rng_engine; + } + auto get_engine(std::int64_t seed) { auto rng_engine = onedal_engine(this->get_queue(), seed); return rng_engine; @@ -126,32 +136,32 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { this->check_results(arr_gpu, arr_host); } -using rng_types_skip_ahead_support = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10)); +using rng_types_skip_ahead_support = COMBINE_TYPES((float), + (mt19937, mcg59, mrg32k3a, philox4x32x10)); //Just for perf tests -// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { -// SKIP_IF(this->get_policy().is_cpu()); -// std::int64_t elem_count = -// GENERATE_COPY(6100000000, 1LL * 64 * 1000000); -// std::int64_t seed = GENERATE_COPY(777); - - -// auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count); -// auto arr_gpu_ptr_ = arr_gpu_.get_mutable_data(); - -// auto rn_gen_ = this->get_rng(); -// auto rng_engine_1 = this->get_engine(seed); - -// BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { -// rn_gen_.uniform_gpu(this->get_queue(), -// elem_count, -// arr_gpu_ptr_, -// rng_engine_1, -// 0, -// elem_count); -// }; +TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) { + SKIP_IF(this->get_policy().is_cpu()); + std::int64_t elem_count = GENERATE_COPY(10000); + std::int64_t seed = GENERATE_COPY(777); -// } + auto arr_host = 
this->allocate_array_host(elem_count); + auto arr_host_ptr_ = arr_host.get_mutable_data(); + + auto arr_host_fake = this->allocate_array_host(1); + auto arr_host_ptr_fake = arr_host_fake.get_mutable_data(); + auto rn_gen_ = this->get_daal_rng(); + auto rng_engine_1 = this->get_daal_engine(seed); + + BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { + rn_gen_.uniform_without_replacement_cpu(elem_count, + arr_host_ptr_, + arr_host_ptr_fake, + rng_engine_1, + 0, + elem_count); + }; +} TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); From cc85e37f1adb45066551995232a374a2c9aed1f7 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 19 Nov 2024 03:22:40 -0800 Subject: [PATCH 09/18] refactoring --- .../rng/{rng_cpu.hpp => engine_cpu.hpp} | 41 ----- .../rng/{rng_gpu.hpp => engine_gpu.hpp} | 106 +------------ cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 132 +++++++++++++++- .../dal/backend/primitives/rng/rng_dpc.cpp | 142 +++++++----------- .../backend/primitives/rng/test/rng_dpc.cpp | 11 +- 5 files changed, 193 insertions(+), 239 deletions(-) rename cpp/oneapi/dal/backend/primitives/rng/{rng_cpu.hpp => engine_cpu.hpp} (65%) rename cpp/oneapi/dal/backend/primitives/rng/{rng_gpu.hpp => engine_gpu.hpp} (51%) diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp similarity index 65% rename from cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp rename to cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp index 6c602aa6612..e8286f83051 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp @@ -83,45 +83,4 @@ class daal_engine { daal::algorithms::engines::internal::BatchBaseImpl* impl_; }; -template -class daal_rng { -public: - daal_rng() = default; - ~daal_rng() = default; - - template - void uniform(Size count, Type* dst, daal_engine daal_engine, 
Type a, Type b) { - auto state = daal_engine.get_cpu_engine_state(); - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } - - template - void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - daal_engine daal_engine, - Type a, - Type b) { - auto state = daal_engine.get_cpu_engine_state(); - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); - } - - template >> - void shuffle(Size count, Type* dst, daal_engine daal_engine) { - Type idx[2]; - auto state = daal_engine.get_cpu_engine_state(); - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } -}; - } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp similarity index 51% rename from cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp rename to cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp index 8e62ca88c1b..242f71cea65 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_gpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp @@ -95,6 +95,10 @@ class onedal_engine { } } + sycl::queue& get_queue() { + return q; + } + private: daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { switch (EngineType) { @@ -128,107 +132,5 @@ class onedal_engine { daal::algorithms::engines::internal::BatchBaseImpl* impl_; }; -template -class oneapi_rng { -public: - oneapi_rng() = default; - ~oneapi_rng() = default; - - template - void uniform(sycl::queue& queue, - Size count, - Type* dst, - onedal_engine& engine_, - Type a, - Type b, - bool distr_mode = false, - const event_vector& deps = {}); - - template - void uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - onedal_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); - - template - void uniform_cpu(Size count, Type* dst, 
onedal_engine& engine_, Type a, Type b); - - template - void uniform_without_replacement(sycl::queue& queue, - Size count, - Type* dst, - onedal_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}) {} - - template - void uniform_without_replacement_gpu(sycl::queue& queue, - Size count, - Type* dst, - Type* buff, - onedal_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); - - template - void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - onedal_engine& engine_, - Type a, - Type b) { - void* state = engine_.get_cpu_engine_state(); - engine_.skip_ahead_gpu(count); - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); - } - - template >> - void shuffle(Size count, Type* dst, onedal_engine& engine_) { - Type idx[2]; - - void* state = engine_.get_cpu_engine_state(); - engine_.skip_ahead_gpu(count); - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } - - template - void shuffle_gpu(sycl::queue& queue, - Size count, - Type* dst, - onedal_engine& engine_, - const event_vector& deps); - - template >> - void shuffle_cpu(Size count, Type* dst, onedal_engine& engine_) { - Type idx[2]; - - void* state = engine_.get_cpu_engine_state(); - engine_.skip_ahead_gpu(count); - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } -}; - #endif } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index a89ca3d4505..b93729dbdf7 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -16,10 +16,138 @@ #pragma once -#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp" +#include "oneapi/dal/backend/primitives/rng/engine_cpu.hpp" #ifdef 
ONEDAL_DATA_PARALLEL -#include "oneapi/dal/backend/primitives/rng/rng_gpu.hpp" +#include "oneapi/dal/backend/primitives/rng/engine_gpu.hpp" #endif + +namespace oneapi::dal::backend::primitives { +template +class rng { +public: + rng() = default; + ~rng() = default; + + template + void uniform_cpu(Size count, Type* dst, daal_engine daal_engine, Type a, Type b) { + auto state = daal_engine.get_cpu_engine_state(); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + } + +#ifdef ONEDAL_DATA_PARALLEL + template + void uniform_cpu(Size count, Type* dst, onedal_engine& engine_, Type a, Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + auto state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + } +#endif + + template + void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + daal_engine daal_engine, + Type a, + Type b) { + auto state = daal_engine.get_cpu_engine_state(); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); + } +#ifdef ONEDAL_DATA_PARALLEL + template + void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + onedal_engine& engine_, + Type a, + Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); + } +#endif + + template >> + void shuffle_cpu(Size count, Type* dst, daal_engine daal_engine) { + Type idx[2]; + auto state = daal_engine.get_cpu_engine_state(); + for (Size i = 0; i < count; ++i) { + 
uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } + +#ifdef ONEDAL_DATA_PARALLEL + template >> + void shuffle_cpu(Size count, Type* dst, onedal_engine& engine_) { + Type idx[2]; + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } +#endif + +#ifdef ONEDAL_DATA_PARALLEL + template + void uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + onedal_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + + template + void uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + onedal_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + + template + void shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + onedal_engine& engine_, + const event_vector& deps = {}); +}; + +#endif + +}; // namespace oneapi::dal::backend::primitives \ No newline at end of file diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 1fa9e36a679..2e3a0c962c8 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -24,55 +24,55 @@ namespace bk = oneapi::dal::backend; template template -void oneapi_rng::uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - onedal_engine& engine_, - Type a, - Type b, - const event_vector& deps) { +void rng::uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + onedal_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == 
sycl::usm::alloc::host) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } oneapi::mkl::rng::uniform distr(a, b); auto event = oneapi::mkl::rng::generate(distr, engine_.get_gpu_engine(), count, dst, { deps }); event.wait_and_throw(); engine_.skip_ahead_cpu(count); } +//Currently only CPU impl template template -void oneapi_rng::uniform_cpu(Size count, - Type* dst, - onedal_engine& engine_, - Type a, - Type b) { - void* state = engine_.get_cpu_engine_state(); - engine_.skip_ahead_gpu(count); - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); -} - -template -template -void oneapi_rng::uniform_without_replacement_gpu(sycl::queue& queue, - Size count, - Type* dst, - Type* buffer, - onedal_engine& engine_, - Type a, - Type b, - const event_vector& deps) { +void rng::uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + onedal_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } void* state = engine_.get_cpu_engine_state(); engine_.skip_ahead_gpu(count); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); } +//Currently only CPU impl template template -void oneapi_rng::shuffle_gpu(sycl::queue& queue, - Size count, - Type* dst, - onedal_engine& engine_, - const event_vector& deps) { +void rng::shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + onedal_engine& engine_, + const event_vector& deps) { Type idx[2]; - + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } void* state = engine_.get_cpu_engine_state(); engine_.skip_ahead_gpu(count); @@ -82,15 +82,14 @@ void oneapi_rng::shuffle_gpu(sycl::queue& queue, } } -#define 
INSTANTIATE_(F, Size, EngineType) \ - template ONEDAL_EXPORT void oneapi_rng::uniform_gpu( \ - sycl::queue& queue, \ - Size count_, \ - F* dst, \ - onedal_engine& engine_, \ - F a, \ - F b, \ - const event_vector& deps); +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::uniform_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + onedal_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); #define INSTANTIATE_FLOAT_(Size) \ INSTANTIATE_(float, Size, engine_list::mt2203) \ @@ -112,43 +111,15 @@ void oneapi_rng::shuffle_gpu(sycl::queue& queue, INSTANTIATE_FLOAT_(std::int64_t); INSTANTIATE_FLOAT_(std::int32_t); -#define INSTANTIATE_CPU(F, Size, EngineType) \ - template ONEDAL_EXPORT void oneapi_rng::uniform_cpu( \ - Size count_, \ - F* dst, \ - onedal_engine& engine_, \ - F a, \ - F b); - -#define INSTANTIATE_FLOAT_CPU(Size) \ - INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ - INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ - INSTANTIATE_CPU(float, Size, engine_list::mrg32k3a) \ - INSTANTIATE_CPU(float, Size, engine_list::philox4x32x10) \ - INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ - INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ - INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ - INSTANTIATE_CPU(double, Size, engine_list::mrg32k3a) \ - INSTANTIATE_CPU(double, Size, engine_list::philox4x32x10) \ - INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ - INSTANTIATE_CPU(int, Size, engine_list::mt2203) \ - INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ - INSTANTIATE_CPU(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_CPU(int, Size, engine_list::philox4x32x10) \ - INSTANTIATE_CPU(int, Size, engine_list::mt19937) - -INSTANTIATE_FLOAT_CPU(std::int64_t); -INSTANTIATE_FLOAT_CPU(std::int32_t); - -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ - template ONEDAL_EXPORT void oneapi_rng::uniform_without_replacement_gpu( \ - sycl::queue& queue, \ - Size count_, \ - F* dst, \ - 
F* buff, \ - onedal_engine& engine_, \ - F a, \ - F b, \ +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::uniform_without_replacement_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + onedal_engine& engine_, \ + F a, \ + F b, \ const event_vector& deps); #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ @@ -171,13 +142,12 @@ INSTANTIATE_FLOAT_CPU(std::int32_t); INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); -#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ - template ONEDAL_EXPORT void oneapi_rng::shuffle_gpu( \ - sycl::queue& queue, \ - Size count_, \ - F* dst, \ - onedal_engine& engine_, \ - const event_vector& deps); +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::shuffle_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + onedal_engine& engine_, \ + const event_vector& deps); #define INSTANTIATE_SHUFFLE_FLOAT(Size) \ INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 719fe429411..411894bdad4 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -69,12 +69,7 @@ class rng_test : public te::policy_fixture { static constexpr auto engine_qq = engine_v; auto get_rng() const { - oneapi_rng rn_gen; - return rn_gen; - } - - auto get_daal_rng() const { - daal_rng rn_gen; + rng rn_gen; return rn_gen; } @@ -150,8 +145,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_s auto arr_host_fake = this->allocate_array_host(1); auto arr_host_ptr_fake = arr_host_fake.get_mutable_data(); - auto rn_gen_ = this->get_daal_rng(); - auto rng_engine_1 = this->get_daal_engine(seed); + auto rn_gen_ = this->get_rng(); + 
auto rng_engine_1 = this->get_engine(seed); BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { rn_gen_.uniform_without_replacement_cpu(elem_count, From 852669fa4cc58828c4d703ffa1c8eda1a4769551 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 19 Nov 2024 05:18:57 -0800 Subject: [PATCH 10/18] fixes --- .../vertex_partitioning_default_kernel.hpp | 4 +-- .../gpu/train_kernel_hist_impl_dpc.cpp | 34 +++++++++---------- .../algo/louvain/backend/cpu/louvain_data.hpp | 2 +- .../vertex_partitioning_default_kernel.hpp | 2 +- .../objective_function/test/fixture.hpp | 4 +-- .../objective_function/test/spmd_fixture.hpp | 2 +- .../optimizers/test/cg_solver_dpc.cpp | 4 +-- .../primitives/optimizers/test/fixture.hpp | 6 ++-- .../optimizers/test/newton_cg_dpc.cpp | 10 +++--- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 3 +- 10 files changed, 35 insertions(+), 36 deletions(-) diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 55087df26af..c33575c472d 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -91,8 +91,8 @@ std::int32_t most_frequent_element(const std::atomic *components, std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); dal::backend::primitives::daal_engine eng; - dal::backend::primitives::daal_rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng, 0, vertex_count); + dal::backend::primitives::rng rn_gen; + rn_gen.uniform_cpu(samples_count, rnd_vertex_ids, eng, 0, vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp 
b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index c0ee89c4d64..193f731ffd4 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -396,14 +396,14 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or Index* const node_list_ptr = node_list_host.get_mutable_data(); for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - pr::daal_rng rn_gen; + pr::rng rn_gen; Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - rn_gen.uniform(ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx], - 0, - ctx.row_total_count_); + rn_gen.uniform_cpu(ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_list[engine_offset + node_idx], + 0, + ctx.row_total_count_); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; @@ -483,7 +483,7 @@ train_kernel_hist_impl::gen_feature_list( auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_); - pr::daal_rng rn_gen; + pr::rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { for (Index node = 0; node < node_count; ++node) { @@ -524,7 +524,7 @@ train_kernel_hist_impl::gen_random_thresholds( auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_); - pr::daal_rng rn_gen; + pr::rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); // Create arrays for random generated bins @@ -537,11 +537,11 @@ train_kernel_hist_impl::gen_random_thresholds( // Generate random bins for selected features for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform(ctx.selected_ftr_count_, - random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]], - 0.0f, - 1.0f); + 
rn_gen.uniform_cpu(ctx.selected_ftr_count_, + random_bins_host_ptr + node * ctx.selected_ftr_count_, + rng_engine_list[tree_map_ptr[node]], + 0.0f, + 1.0f); } auto event_rnd_generate = random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count()); @@ -1660,12 +1660,12 @@ sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); - pr::daal_rng rn_gen; + pr::rng rn_gen; for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block]); + rn_gen.shuffle_cpu(oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block]); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index d2751b3840b..98d4bf60047 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -124,7 +124,7 @@ struct louvain_data { value_type m; daal_engine eng; - daal_rng rn_gen; + rng rn_gen; const std::int64_t vertex_count; const std::int64_t edge_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index ff78f06f833..e758d769a01 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); + ld.rn_gen.uniform_cpu(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t 
index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index d673470b042..c0bd5049153 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -572,13 +572,13 @@ class logloss_test : public te::float_algo_fixture rn_gen; + primitives::rng rn_gen; auto vec_host = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { primitives::daal_engine eng(2007 + dim * num_checks + ij); - rn_gen.uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index e2a611c2c98..f90aa3d8a87 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -100,7 +100,7 @@ class logloss_spmd_test : public logloss_test { std::int64_t num_checks = 5; std::vector> vecs_host(num_checks), vecs_gpu(num_checks); - daal_rng rn_gen; + rng rn_gen; for (std::int64_t ij = 0; ij < num_checks; ++ij) { daal_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index 36e20f03c11..56d7f8c5c23 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ 
b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -43,9 +43,9 @@ class cg_solver_test : public te::float_algo_fixture { x_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::daal_rng rn_gen; + primitives::rng rn_gen; primitives::daal_engine eng(4014 + n_); - rn_gen.uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index 777c0ee68e2..1c82e2c8ac9 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -133,11 +133,11 @@ void create_stable_matrix(sycl::queue& queue, ONEDAL_ASSERT(A.get_dimension(1) == n); auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); - primitives::daal_rng rn_gen; + primitives::rng rn_gen; primitives::daal_engine eng(2007 + n); - rn_gen.uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0); - rn_gen.uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); + rn_gen.uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index b24a59386c7..1358c1b8826 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture { 
ndarray::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host); auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); - primitives::daal_rng rn_gen; + primitives::rng rn_gen; primitives::daal_engine eng(2007 + n); - rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); - rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); + rn_gen.uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + rn_gen.uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture { auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::rng rn_gen; primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform_cpu(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform_cpu(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index b93729dbdf7..462ee2a3ada 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -146,8 +146,7 @@ class rng { Type* dst, onedal_engine& engine_, const event_vector& deps = {}); -}; - #endif +}; }; // namespace 
oneapi::dal::backend::primitives \ No newline at end of file From 06f188580d3c89c789b66054f3464ecf2ac86194 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Mon, 16 Dec 2024 03:58:30 -0800 Subject: [PATCH 11/18] comments fixes --- .../algorithms/engines/mrg32k3a/mrg32k3a.h | 1 - .../engines/mrg32k3a/mrg32k3a_types.h | 1 - .../engines/philox4x32x10/philox4x32x10.h | 1 - .../philox4x32x10/philox4x32x10_types.h | 1 - .../algorithms/engines/mrg32k3a/mrg32k3a.cpp | 1 - .../mrg32k3a/mrg32k3a_batch_container.h | 1 - .../engines/mrg32k3a/mrg32k3a_batch_impl.h | 1 - .../mrg32k3a_dense_default_batch_fpt_cpu.cpp | 1 - ...k3a_dense_default_batch_fpt_dispatcher.cpp | 1 - .../engines/mrg32k3a/mrg32k3a_impl.i | 5 +- .../engines/mrg32k3a/mrg32k3a_kernel.h | 5 +- .../engines/philox4x32x10/philox4x32x10.cpp | 1 - .../philox4x32x10_batch_container.h | 5 +- .../philox4x32x10/philox4x32x10_batch_impl.h | 1 - ...lox4x32x10_dense_default_batch_fpt_cpu.cpp | 1 - ...x10_dense_default_batch_fpt_dispatcher.cpp | 1 - .../philox4x32x10/philox4x32x10_impl.i | 5 +- .../philox4x32x10/philox4x32x10_kernel.h | 5 +- cpp/daal/src/externals/service_rng.h | 34 ++--- .../vertex_partitioning_default_kernel.hpp | 2 +- .../backend/gpu/train_kernel_hist_impl.hpp | 16 +-- .../gpu/train_kernel_hist_impl_dpc.cpp | 16 +-- .../algo/louvain/backend/cpu/louvain_data.hpp | 2 +- .../objective_function/test/fixture.hpp | 2 +- .../objective_function/test/spmd_fixture.hpp | 2 +- .../optimizers/test/cg_solver_dpc.cpp | 2 +- .../primitives/optimizers/test/fixture.hpp | 2 +- .../optimizers/test/newton_cg_dpc.cpp | 2 +- .../rng/{engine_gpu.hpp => dpc_engine.hpp} | 68 +++++----- .../rng/{engine_cpu.hpp => host_engine.hpp} | 41 +++--- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 52 ++++---- .../dal/backend/primitives/rng/rng_dpc.cpp | 118 +++++++++--------- .../primitives/rng/rng_engine_collection.hpp | 26 ++-- .../dal/backend/primitives/rng/rng_types.hpp | 2 +- .../backend/primitives/rng/test/rng_dpc.cpp | 66 
+++++----- .../daal/algorithms/engines/mrg32k3a.rst | 3 +- 36 files changed, 236 insertions(+), 258 deletions(-) rename cpp/oneapi/dal/backend/primitives/rng/{engine_gpu.hpp => dpc_engine.hpp} (62%) rename cpp/oneapi/dal/backend/primitives/rng/{engine_cpu.hpp => host_engine.hpp} (70%) diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h index b794813a227..518d26e01f1 100644 --- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -1,6 +1,5 @@ /* file: mrg32k3a.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h index a6b9d699f77..8d697dfd72a 100644 --- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h @@ -1,6 +1,5 @@ /* file: mrg32k3a_types.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h index ec82723f1f8..e57798be50a 100644 --- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h @@ -1,6 +1,5 @@ /* file: philox4x32x10.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the 
Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h index 74d2e884670..778b81f4ec9 100644 --- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h @@ -1,6 +1,5 @@ /* file: philox4x32x10_types.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp index fe015c85428..c550d81dec6 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a.cpp @@ -1,6 +1,5 @@ /* file: mrg32k3a.cpp */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h index 31126c4300f..ce83f554026 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h @@ -1,6 +1,5 @@ /* file: mrg32k3a_batch_container.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h 
index 0ff55f39b62..469ec92a0ab 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -1,6 +1,5 @@ /* file: mrg32k3a_batch_impl.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp index 1d3820053bd..529c4af2635 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp @@ -1,6 +1,5 @@ /* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp index 1b3f3c618e9..fd78108df73 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp @@ -1,6 +1,5 @@ /* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i 
b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i index 06d670f1f7a..f8f12b2deea 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i @@ -1,6 +1,5 @@ /* file: mrg32k3a_impl.i */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,8 +21,8 @@ //-- */ -#ifndef __mrg32k3a_IMPL_I__ -#define __mrg32k3a_IMPL_I__ +#ifndef __MRG32K3A_IMPL_I__ +#define __MRG32K3A_IMPL_I__ namespace daal { diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h index 86b8d929aae..80c9fbe44d9 100644 --- a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h @@ -1,6 +1,5 @@ /* file: mrg32k3a_kernel.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,8 +19,8 @@ // Declaration of a template function for calculating values using the MRG32k3a generator. 
//-- -#ifndef __mrg32k3a_KERNEL_H__ -#define __mrg32k3a_KERNEL_H__ +#ifndef __MRG32K3A_KERNEL_H__ +#define __MRG32K3A_KERNEL_H__ #include "algorithms/engines/mrg32k3a/mrg32k3a.h" #include "src/algorithms/kernel.h" diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp index c103a4ae068..47fb7dae70f 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp @@ -1,6 +1,5 @@ /* file: philox4x32x10.cpp */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h index 7a721c4f1a8..9cb747e95a8 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h @@ -1,6 +1,5 @@ /* file: philox4x32x10_batch_container.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,8 +21,8 @@ //-- */ -#ifndef __philox4x32x10_BATCH_CONTAINER_H__ -#define __philox4x32x10_BATCH_CONTAINER_H__ +#ifndef __PHILOX4X32X10_BATCH_CONTAINER_H__ +#define __PHILOX4X32X10_BATCH_CONTAINER_H__ #include "algorithms/engines/philox4x32x10/philox4x32x10.h" #include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h index 
f6a9f35e268..58e28eb47bf 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -1,6 +1,5 @@ /* file: philox4x32x10_batch_impl.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp index e1ed7b4d896..946517c1d9c 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp @@ -1,6 +1,5 @@ /* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp index 1f79b94c762..1640fc4ec12 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp @@ -1,6 +1,5 @@ /* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); diff 
--git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i index 9e2dc9f6b99..5aa5addc22b 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i @@ -1,6 +1,5 @@ /* file: philox4x32x10_impl.i */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,8 +21,8 @@ //-- */ -#ifndef __philox4x32x10_IMPL_I__ -#define __philox4x32x10_IMPL_I__ +#ifndef __PHILOX4X32X10_IMPL_I__ +#define __PHILOX4X32X10_IMPL_I__ namespace daal { diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h index 47333a6c78f..5870d781abd 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h @@ -1,6 +1,5 @@ /* file: philox4x32x10_kernel.h */ /******************************************************************************* -* Copyright 2024 Intel Corporation * Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,8 +19,8 @@ // Declaration of a template function for generating values using the Philox4x32-10 engine. 
//-- -#ifndef __philox4x32x10_KERNEL_H__ -#define __philox4x32x10_KERNEL_H__ +#ifndef __PHILOX4X32X10_KERNEL_H__ +#define __PHILOX4X32X10_KERNEL_H__ #include "algorithms/engines/philox4x32x10/philox4x32x10.h" #include "src/algorithms/kernel.h" diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h index ba95f8f7324..d03c0f39abf 100644 --- a/cpp/daal/src/externals/service_rng.h +++ b/cpp/daal/src/externals/service_rng.h @@ -115,7 +115,7 @@ class RNGs int uniformWithoutReplacement(const SizeType n, DstType * r, void * state, const Type a, const Type b, const int method = __DAAL_RNG_METHOD_UNIFORM_STD) { - Type * buffer = (Type *)daal_malloc(sizeof(Type) * 1); + Type * buffer = (Type *)daal_malloc(sizeof(Type) * n); int errorcode = uniformWithoutReplacement(n, r, buffer, state, a, b, method); daal_free(buffer); return errorcode; @@ -125,28 +125,20 @@ class RNGs int uniformWithoutReplacement(const SizeType n, DstType * r, Type * buffer, void * state, const Type a, const Type b, const int method = __DAAL_RNG_METHOD_UNIFORM_STD) { - int errorcode = 0; - SizeType sequence_size = abs(b - a); - if (sequence_size < n) - { - return -1; - } - Type * buffer_ = (Type *)daal_malloc(sizeof(Type) * sequence_size); - for (SizeType i = 0; i < sequence_size; i++) - { - buffer_[i] = i; - } - Type swapIdx; - for (SizeType i = 0; i < n; i++) - { - errorcode = uniform(1, &swapIdx, state, i, sequence_size, method); - int index = int(swapIdx); - - std::swap(buffer_[i], buffer_[index]); - } + int errorcode = 0; for (SizeType i = 0; i < n; i++) { - r[i] = buffer_[i]; + errorcode = uniform(1, buffer + i, state, a + i, b, method); + int value = buffer[i]; + + for (SizeType j = i; j > 0; j--) + { + if (value == buffer[j - 1]) + { + value = (DstType)(j - 1 + a); + } + } + r[i] = value; } return errorcode; } diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp 
b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index c33575c472d..bdda9048082 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -90,7 +90,7 @@ std::int32_t most_frequent_element(const std::atomic *components, const std::int64_t &samples_count = 1024) { std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); - dal::backend::primitives::daal_engine eng; + dal::backend::primitives::host_engine eng; dal::backend::primitives::rng rn_gen; rn_gen.uniform_cpu(samples_count, rnd_vertex_ids, eng, 0, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 3106eb537d7..ac04f73d89f 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -50,8 +50,8 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::daal_engine; - using rng_engine_list_t = std::vector; + using rng_engine_t = pr::host_engine; + using rng_engine_method_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; using node_t = node; @@ -79,7 +79,7 @@ class train_kernel_hist_impl { Index class_count) const; sycl::event gen_initial_tree_order(train_context_t& ctx, - rng_engine_list_t& rng_engine_list, + rng_engine_method_t& rng_engine_method, pr::ndarray& node_list, pr::ndarray& tree_order_level, Index engine_offset, @@ -115,12 +115,12 @@ class train_kernel_hist_impl { /// @param[in] ctx a training context structure for a GPU backend /// @param[in] node_count number of nodes on the current level /// 
@param[in] node_vs_tree_map an initial tree order - /// @param[in] rng_engine_list a list of random generator engines + /// @param[in] rng_engine_method a list of random generator engines std::tuple, sycl::event> gen_feature_list( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map, - rng_engine_list_t& rng_engine_list); + rng_engine_method_t& rng_engine_method); /// Generates random thresholds for each node and for each selected feature for node. /// Thresholds are used for a random splitter kernel to split each node. @@ -129,12 +129,12 @@ class train_kernel_hist_impl { /// @param[in] ctx a training context structure for a GPU backend /// @param[in] node_count number of nodes on the current level /// @param[in] node_vs_tree_map an initial tree order - /// @param[in] rng_engine_list a list of random generator engines + /// @param[in] rng_engine_method a list of random generator engines std::tuple, sycl::event> gen_random_thresholds( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map, - rng_engine_list_t& rng_engine_list); + rng_engine_method_t& rng_engine_method); /// Computes initial impurity for each node. 
/// @@ -575,7 +575,7 @@ class train_kernel_hist_impl { pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_list_t& rng_engine_arr, + const rng_engine_method_t& rng_engine_arr, Index tree_idx, Index tree_in_block, Index built_tree_count, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index bdd9b82802d..c846c77a38c 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -368,7 +368,7 @@ void train_kernel_hist_impl::allocate_buffers(const tra template sycl::event train_kernel_hist_impl::gen_initial_tree_order( train_context_t& ctx, - rng_engine_list_t& rng_engine_list, + rng_engine_method_t& rng_engine_method, pr::ndarray& node_list_host, pr::ndarray& tree_order_level, Index engine_offset, @@ -401,7 +401,7 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; rn_gen.uniform_cpu(ctx.selected_row_total_count_, gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx], + rng_engine_method[engine_offset + node_idx], 0, ctx.row_total_count_); @@ -465,7 +465,7 @@ train_kernel_hist_impl::gen_feature_list( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map_list, - rng_engine_list_t& rng_engine_list) { + rng_engine_method_t& rng_engine_method) { ONEDAL_PROFILER_TASK(gen_feature_list, queue_); ONEDAL_ASSERT(node_vs_tree_map_list.get_count() == node_count); @@ -491,7 +491,7 @@ train_kernel_hist_impl::gen_feature_list( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]], + rng_engine_method[tree_map_ptr[node]], 0, ctx.column_count_); } 
@@ -517,7 +517,7 @@ train_kernel_hist_impl::gen_random_thresholds( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map, - rng_engine_list_t& rng_engine_list) { + rng_engine_method_t& rng_engine_method) { ONEDAL_PROFILER_TASK(gen_random_thresholds, queue_); ONEDAL_ASSERT(node_vs_tree_map.get_count() == node_count); @@ -539,7 +539,7 @@ train_kernel_hist_impl::gen_random_thresholds( for (Index node = 0; node < node_count; ++node) { rn_gen.uniform_cpu(ctx.selected_ftr_count_, random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]], + rng_engine_method[tree_map_ptr[node]], 0.0f, 1.0f); } @@ -1613,7 +1613,7 @@ sycl::event train_kernel_hist_impl::compute_results( pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_list_t& engine_arr, + const rng_engine_method_t& engine_arr, Index tree_idx_in_block, Index tree_in_block_count, Index built_tree_count, @@ -1859,7 +1859,7 @@ train_result train_kernel_hist_impl::operator()( de::check_mul_overflow((ctx.tree_count_ - 1), skip_num); pr::engine_collection collection(ctx.tree_count_, desc.get_seed()); - rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { + rng_engine_method_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { skip = i * skip_num; }); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index 98d4bf60047..bd5773ff093 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -123,7 +123,7 @@ struct louvain_data { // Total link weight in the network value_type m; - daal_engine eng; + host_engine eng; rng rn_gen; const std::int64_t vertex_count; diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index 
f16d5777182..03f751e570b 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -577,7 +577,7 @@ class logloss_test : public te::float_algo_fixture::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { - primitives::daal_engine eng(2007 + dim * num_checks + ij); + primitives::host_engine eng(2007 + dim * num_checks + ij); rn_gen.uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index f90aa3d8a87..985d863b1f5 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -102,7 +102,7 @@ class logloss_spmd_test : public logloss_test { std::vector> vecs_host(num_checks), vecs_gpu(num_checks); rng rn_gen; for (std::int64_t ij = 0; ij < num_checks; ++ij) { - daal_engine eng(2007 + dim * num_checks + ij); + host_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index 56d7f8c5c23..c912a3a99d2 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -44,7 +44,7 @@ class cg_solver_test : public te::float_algo_fixture { b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::rng rn_gen; - primitives::daal_engine eng(4014 + n_); + 
primitives::host_engine eng(4014 + n_); rn_gen.uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index 1c82e2c8ac9..120f65f61f0 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -134,7 +134,7 @@ void create_stable_matrix(sycl::queue& queue, auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); primitives::rng rn_gen; - primitives::daal_engine eng(2007 + n); + primitives::host_engine eng(2007 + n); rn_gen.uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0); rn_gen.uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index 1358c1b8826..d7414924bf6 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -57,7 +57,7 @@ class newton_cg_test : public te::float_algo_fixture { auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); primitives::rng rn_gen; - primitives::daal_engine eng(2007 + n); + primitives::host_engine eng(2007 + n); rn_gen.uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); rn_gen.uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { diff --git a/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp similarity index 62% rename from cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp rename to cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp 
index 242f71cea65..1f13975e8d6 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/engine_gpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp @@ -19,79 +19,80 @@ #include "oneapi/dal/backend/primitives/rng/utils.hpp" #include "oneapi/dal/backend/primitives/rng/rng_types.hpp" #include + namespace mkl = oneapi::mkl; namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -template -struct onedal_engine_type; +template +struct dpc_engine_type; template <> -struct onedal_engine_type { +struct dpc_engine_type { using type = oneapi::mkl::rng::mt2203; }; template <> -struct onedal_engine_type { +struct dpc_engine_type { using type = oneapi::mkl::rng::mcg59; }; template <> -struct onedal_engine_type { +struct dpc_engine_type { using type = oneapi::mkl::rng::mt19937; }; template <> -struct onedal_engine_type { +struct dpc_engine_type { using type = oneapi::mkl::rng::mrg32k3a; }; template <> -struct onedal_engine_type { +struct dpc_engine_type { using type = oneapi::mkl::rng::philox4x32x10; }; -template -class onedal_engine { +template +class dpc_engine { public: - using onedal_engine_t = typename onedal_engine_type::type; + using dpc_engine_t = typename dpc_engine_type::type; - explicit onedal_engine(sycl::queue& queue, std::int64_t seed = 777) + explicit dpc_engine(sycl::queue& queue, std::int64_t seed = 777) : q(queue), - daal_engine_(initialize_daal_engine(seed)), - onedal_engine_(initialize_onedal_engine(queue, seed)), + host_engine_(initialize_host_engine(seed)), + dpc_engine_(initialize_dpc_engine(queue, seed)), impl_(dynamic_cast( - daal_engine_.get())) { + host_engine_.get())) { if (!impl_) { throw std::domain_error("RNG engine is not supported"); } } - virtual ~onedal_engine() = default; + virtual ~dpc_engine() = default; - void* get_cpu_engine_state() const { + void* get_host_engine_state() const { return impl_->getState(); } auto& get_cpu_engine() { - return daal_engine_; + return host_engine_; } auto& get_gpu_engine() { - return 
onedal_engine_; + return dpc_engine_; } void skip_ahead_cpu(size_t nSkip) { - daal_engine_->skipAhead(nSkip); + host_engine_->skipAhead(nSkip); } void skip_ahead_gpu(size_t nSkip) { // Will be fixed in the next oneMKL release. - if constexpr (EngineType == engine_list::mt2203) { + if constexpr (EngineType == engine_method::mt2203) { } else { - skip_ahead(onedal_engine_, nSkip); + skip_ahead(dpc_engine_, nSkip); } } @@ -100,35 +101,36 @@ class onedal_engine { } private: - daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { + daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) { switch (EngineType) { - case engine_list::mt2203: + case engine_method::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); - case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); - case engine_list::mrg32k3a: + case engine_method::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_method::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); - case engine_list::philox4x32x10: + case engine_method::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); - case engine_list::mt19937: + case engine_method::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); } } - onedal_engine_t initialize_onedal_engine(sycl::queue& queue, std::int64_t seed) { - if constexpr (EngineType == engine_list::mt2203) { - return onedal_engine_t( + dpc_engine_t initialize_dpc_engine(sycl::queue& queue, std::int64_t seed) { + if constexpr (EngineType == engine_method::mt2203) { + return dpc_engine_t( queue, seed, 0); // Aligns CPU and GPU results for mt2203, impacts the performance. 
} else { - return onedal_engine_t(queue, seed); + return dpc_engine_t(queue, seed); } } sycl::queue q; - daal::algorithms::engines::EnginePtr daal_engine_; - onedal_engine_t onedal_engine_; + daal::algorithms::engines::EnginePtr host_engine_; + dpc_engine_t dpc_engine_; daal::algorithms::engines::internal::BatchBaseImpl* impl_; }; diff --git a/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp similarity index 70% rename from cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp rename to cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp index e8286f83051..436e032e608 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/engine_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,27 +24,27 @@ #include "oneapi/dal/backend/primitives/rng/rng_types.hpp" namespace oneapi::dal::backend::primitives { -template -class daal_engine { +template +class host_engine { public: - explicit daal_engine(std::int64_t seed = 777) - : daal_engine_(initialize_daal_engine(seed)), + explicit host_engine(std::int64_t seed = 777) + : host_engine_(initialize_host_engine(seed)), impl_(dynamic_cast( - daal_engine_.get())) { + host_engine_.get())) { if (!impl_) { throw std::domain_error("RNG engine is not supported"); } } - explicit daal_engine(const daal::algorithms::engines::EnginePtr& eng) : daal_engine_(eng) { + explicit host_engine(const daal::algorithms::engines::EnginePtr& eng) : host_engine_(eng) { impl_ = dynamic_cast(eng.get()); if (!impl_) { throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); } } - daal_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { - daal_engine_ = eng; + host_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { + host_engine_ = eng; impl_ = dynamic_cast(eng.get()); if (!impl_) { throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); @@ -53,33 +53,34 @@ class daal_engine { return *this; } - virtual ~daal_engine() = default; + virtual ~host_engine() = default; - void* get_cpu_engine_state() const { + void* get_host_engine_state() const { return impl_->getState(); } - auto& get_cpu_engine() { - return daal_engine_; + auto& get_host_engine() { + return host_engine_; } private: - daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { + daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) { switch (EngineType) { - case engine_list::mt2203: + case engine_method::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); - case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); - case engine_list::mrg32k3a: + case engine_method::mcg59: + return 
daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_method::mrg32k3a: return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); - case engine_list::philox4x32x10: + case engine_method::philox4x32x10: return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); - case engine_list::mt19937: + case engine_method::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); } } - daal::algorithms::engines::EnginePtr daal_engine_; + daal::algorithms::engines::EnginePtr host_engine_; daal::algorithms::engines::internal::BatchBaseImpl* impl_; }; diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index 462ee2a3ada..cb235b9e4c3 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -16,11 +16,11 @@ #pragma once -#include "oneapi/dal/backend/primitives/rng/engine_cpu.hpp" +#include "oneapi/dal/backend/primitives/rng/host_engine.hpp" #ifdef ONEDAL_DATA_PARALLEL -#include "oneapi/dal/backend/primitives/rng/engine_gpu.hpp" +#include "oneapi/dal/backend/primitives/rng/dpc_engine.hpp" #endif @@ -31,33 +31,33 @@ class rng { rng() = default; ~rng() = default; - template - void uniform_cpu(Size count, Type* dst, daal_engine daal_engine, Type a, Type b) { - auto state = daal_engine.get_cpu_engine_state(); + template + void uniform_cpu(Size count, Type* dst, host_engine host_engine, Type a, Type b) { + auto state = host_engine.get_host_engine_state(); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } #ifdef ONEDAL_DATA_PARALLEL - template - void uniform_cpu(Size count, Type* dst, onedal_engine& engine_, Type a, Type b) { + template + void uniform_cpu(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw 
domain_error(dal::detail::error_messages::unsupported_data_type()); } - auto state = engine_.get_cpu_engine_state(); + auto state = engine_.get_host_engine_state(); engine_.skip_ahead_gpu(count); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } #endif - template + template void uniform_without_replacement_cpu(Size count, Type* dst, Type* buffer, - daal_engine daal_engine, + host_engine host_engine, Type a, Type b) { - auto state = daal_engine.get_cpu_engine_state(); + auto state = host_engine.get_host_engine_state(); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, @@ -66,18 +66,18 @@ class rng { b); } #ifdef ONEDAL_DATA_PARALLEL - template + template void uniform_without_replacement_cpu(Size count, Type* dst, Type* buffer, - onedal_engine& engine_, + dpc_engine& engine_, Type a, Type b) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); } - void* state = engine_.get_cpu_engine_state(); + void* state = engine_.get_host_engine_state(); engine_.skip_ahead_gpu(count); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, @@ -88,12 +88,12 @@ class rng { } #endif - template >> - void shuffle_cpu(Size count, Type* dst, daal_engine daal_engine) { + void shuffle_cpu(Size count, Type* dst, host_engine host_engine) { Type idx[2]; - auto state = daal_engine.get_cpu_engine_state(); + auto state = host_engine.get_host_engine_state(); for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); @@ -101,16 +101,16 @@ class rng { } #ifdef ONEDAL_DATA_PARALLEL - template >> - void shuffle_cpu(Size count, Type* dst, onedal_engine& engine_) { + void shuffle_cpu(Size count, Type* dst, dpc_engine& engine_) { Type idx[2]; if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw 
domain_error(dal::detail::error_messages::unsupported_data_type()); } - void* state = engine_.get_cpu_engine_state(); + void* state = engine_.get_host_engine_state(); engine_.skip_ahead_gpu(count); for (Size i = 0; i < count; ++i) { @@ -121,30 +121,30 @@ class rng { #endif #ifdef ONEDAL_DATA_PARALLEL - template + template void uniform_gpu(sycl::queue& queue, Size count, Type* dst, - onedal_engine& engine_, + dpc_engine& engine_, Type a, Type b, const event_vector& deps = {}); - template + template void uniform_without_replacement_gpu(sycl::queue& queue, Size count, Type* dst, Type* buffer, - onedal_engine& engine_, + dpc_engine& engine_, Type a, Type b, const event_vector& deps = {}); - template + template void shuffle_gpu(sycl::queue& queue, Size count, Type* dst, - onedal_engine& engine_, + dpc_engine& engine_, const event_vector& deps = {}); #endif }; diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 2e3a0c962c8..cb550a50775 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -23,11 +23,11 @@ namespace oneapi::dal::backend::primitives { namespace bk = oneapi::dal::backend; template -template +template void rng::uniform_gpu(sycl::queue& queue, Size count, Type* dst, - onedal_engine& engine_, + dpc_engine& engine_, Type a, Type b, const event_vector& deps) { @@ -42,12 +42,12 @@ void rng::uniform_gpu(sycl::queue& queue, //Currently only CPU impl template -template +template void rng::uniform_without_replacement_gpu(sycl::queue& queue, Size count, Type* dst, Type* buffer, - onedal_engine& engine_, + dpc_engine& engine_, Type a, Type b, const event_vector& deps) { @@ -55,25 +55,25 @@ void rng::uniform_without_replacement_gpu(sycl::queue& queue, sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); } - void* state = engine_.get_cpu_engine_state(); + void* state = 
engine_.get_host_engine_state(); engine_.skip_ahead_gpu(count); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); } //Currently only CPU impl template -template +template void rng::shuffle_gpu(sycl::queue& queue, Size count, Type* dst, - onedal_engine& engine_, + dpc_engine& engine_, const event_vector& deps) { Type idx[2]; if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); } - void* state = engine_.get_cpu_engine_state(); + void* state = engine_.get_host_engine_state(); engine_.skip_ahead_gpu(count); for (Size i = 0; i < count; ++i) { @@ -82,31 +82,31 @@ void rng::shuffle_gpu(sycl::queue& queue, } } -#define INSTANTIATE_(F, Size, EngineType) \ - template ONEDAL_EXPORT void rng::uniform_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - onedal_engine& engine_, \ - F a, \ - F b, \ +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::uniform_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + F a, \ + F b, \ const event_vector& deps); -#define INSTANTIATE_FLOAT_(Size) \ - INSTANTIATE_(float, Size, engine_list::mt2203) \ - INSTANTIATE_(float, Size, engine_list::mcg59) \ - INSTANTIATE_(float, Size, engine_list::mrg32k3a) \ - INSTANTIATE_(float, Size, engine_list::philox4x32x10) \ - INSTANTIATE_(float, Size, engine_list::mt19937) \ - INSTANTIATE_(double, Size, engine_list::mt2203) \ - INSTANTIATE_(double, Size, engine_list::mcg59) \ - INSTANTIATE_(double, Size, engine_list::mrg32k3a) \ - INSTANTIATE_(double, Size, engine_list::philox4x32x10) \ - INSTANTIATE_(double, Size, engine_list::mt19937) \ - INSTANTIATE_(int, Size, engine_list::mt2203) \ - INSTANTIATE_(int, Size, engine_list::mcg59) \ - INSTANTIATE_(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_(int, Size, engine_list::philox4x32x10) \ - INSTANTIATE_(int, Size, engine_list::mt19937) +#define 
INSTANTIATE_FLOAT_(Size) \ + INSTANTIATE_(float, Size, engine_method::mt2203) \ + INSTANTIATE_(float, Size, engine_method::mcg59) \ + INSTANTIATE_(float, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(float, Size, engine_method::mt19937) \ + INSTANTIATE_(double, Size, engine_method::mt2203) \ + INSTANTIATE_(double, Size, engine_method::mcg59) \ + INSTANTIATE_(double, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(double, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(double, Size, engine_method::mt19937) \ + INSTANTIATE_(int, Size, engine_method::mt2203) \ + INSTANTIATE_(int, Size, engine_method::mcg59) \ + INSTANTIATE_(int, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(int, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(int, Size, engine_method::mt19937) INSTANTIATE_FLOAT_(std::int64_t); INSTANTIATE_FLOAT_(std::int32_t); @@ -117,44 +117,44 @@ INSTANTIATE_FLOAT_(std::int32_t); Size count_, \ F* dst, \ F* buff, \ - onedal_engine& engine_, \ + dpc_engine& engine_, \ F a, \ F b, \ const event_vector& deps); -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, 
engine_list::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt19937) INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); -#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ - template ONEDAL_EXPORT void 
rng::shuffle_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - onedal_engine& engine_, \ +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::shuffle_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ const event_vector& deps); -#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mrg32k3a) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::philox4x32x10) \ - INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(int, Size, engine_method::mt2203) \ + INSTANTIATE_SHUFFLE(int, Size, engine_method::mcg59) \ + INSTANTIATE_SHUFFLE(int, Size, engine_method::mrg32k3a) \ + INSTANTIATE_SHUFFLE(int, Size, engine_method::philox4x32x10) \ + INSTANTIATE_SHUFFLE(int, Size, engine_method::mt19937) INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 76c56f61f7c..e7e19f64c4d 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -36,10 +36,10 @@ class engine_collection { engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)), params_(count), technique_(daal::algorithms::engines::internal::family), - daal_engine_list_(count) {} + host_engine_method_(count) {} template - std::vector> operator()(Op&& op) { + std::vector> operator()(Op&& op) { daal::services::Status status; for (Size i = 0; i < count_; ++i) { op(i, params_.nSkip[i]); @@ -49,25 +49,25 @@ class engine_collection { engine_, technique_, params_, - daal_engine_list_, + host_engine_method_, &status); if (!status) { dal::backend::interop::status_to_exception(status); } - 
std::vector> engine_list(count_); + std::vector> engine_method(count_); for (Size i = 0; i < count_; ++i) { - engine_list[i] = daal_engine_list_[i]; + engine_method[i] = host_engine_method_[i]; } //copy elision - return engine_list; + return engine_method; } private: void select_parallelization_technique( daal::algorithms::engines::internal::ParallelizationTechnique& technique) { - auto daal_engine_impl = + auto host_engine_impl = dynamic_cast(engine_.get()); daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = { @@ -77,7 +77,7 @@ class engine_collection { }; for (auto& techn : techniques) { - if (daal_engine_impl->hasSupport(techn)) { + if (host_engine_impl->hasSupport(techn)) { technique = techn; return; } @@ -93,10 +93,10 @@ class engine_collection { daal::algorithms::engines::internal::Params params_; daal::algorithms::engines::internal::ParallelizationTechnique technique_; daal::services::internal::TArray - daal_engine_list_; + host_engine_method_; }; -template +template class engine_collection_oneapi { public: engine_collection_oneapi(sycl::queue& queue, Size count, std::int64_t seed = 777) @@ -104,18 +104,18 @@ class engine_collection_oneapi { seed_(seed) { engines_.reserve(count_); for (Size i = 0; i < count_; ++i) { - engines_.push_back(onedal_engine(queue, seed_)); + engines_.push_back(dpc_engine(queue, seed_)); } } - std::vector> get_engines() const { + std::vector> get_engines() const { return engines_; } private: Size count_; std::int64_t seed_; - std::vector> engines_; + std::vector> engines_; }; #endif diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp index d502e9282ee..4132fbe557a 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp @@ -24,6 +24,6 @@ namespace oneapi::dal::backend::primitives { -enum class engine_list { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; +enum class 
engine_method { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; } diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 411894bdad4..244ac91c72e 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -35,27 +35,27 @@ struct engine_map {}; template <> struct engine_map { - constexpr static auto value = engine_list::mt2203; + constexpr static auto value = engine_method::mt2203; }; template <> struct engine_map { - constexpr static auto value = engine_list::mcg59; + constexpr static auto value = engine_method::mcg59; }; template <> struct engine_map { - constexpr static auto value = engine_list::mrg32k3a; + constexpr static auto value = engine_method::mrg32k3a; }; template <> struct engine_map { - constexpr static auto value = engine_list::philox4x32x10; + constexpr static auto value = engine_method::philox4x32x10; }; template <> struct engine_map { - constexpr static auto value = engine_list::mt19937; + constexpr static auto value = engine_method::mt19937; }; template @@ -73,13 +73,13 @@ class rng_test : public te::policy_fixture { return rn_gen; } - auto get_daal_engine(std::int64_t seed) { - auto rng_engine = daal_engine(seed); + auto get_host_engine(std::int64_t seed) { + auto rng_engine = host_engine(seed); return rng_engine; } auto get_engine(std::int64_t seed) { - auto rng_engine = onedal_engine(this->get_queue(), seed); + auto rng_engine = dpc_engine(this->get_queue(), seed); return rng_engine; } @@ -134,29 +134,29 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { using rng_types_skip_ahead_support = COMBINE_TYPES((float), (mt19937, mcg59, mrg32k3a, philox4x32x10)); -//Just for perf tests -TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) { - SKIP_IF(this->get_policy().is_cpu()); - std::int64_t elem_count = GENERATE_COPY(10000); - std::int64_t 
seed = GENERATE_COPY(777); - - auto arr_host = this->allocate_array_host(elem_count); - auto arr_host_ptr_ = arr_host.get_mutable_data(); - - auto arr_host_fake = this->allocate_array_host(1); - auto arr_host_ptr_fake = arr_host_fake.get_mutable_data(); - auto rn_gen_ = this->get_rng(); - auto rng_engine_1 = this->get_engine(seed); - - BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { - rn_gen_.uniform_without_replacement_cpu(elem_count, - arr_host_ptr_, - arr_host_ptr_fake, - rng_engine_1, - 0, - elem_count); - }; -} +// //Just for perf tests +// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10000); +// std::int64_t seed = GENERATE_COPY(777); + +// auto arr_host = this->allocate_array_host(elem_count); +// auto arr_host_ptr_ = arr_host.get_mutable_data(); + +// auto arr_host_fake = this->allocate_array_host(1); +// auto arr_host_ptr_fake = arr_host_fake.get_mutable_data(); +// auto rn_gen_ = this->get_rng(); +// auto rng_engine_1 = this->get_engine(seed); + +// BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { +// rn_gen_.uniform_without_replacement_cpu(elem_count, +// arr_host_ptr_, +// arr_host_ptr_fake, +// rng_engine_1, +// 0, +// elem_count); +// }; +// } TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); @@ -228,13 +228,13 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe this->check_results(arr_gpu, arr_host); } -//TODO: add engine collection test + daal_engine tests +//TODO: add engine collection test + host_engine tests // TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { // SKIP_IF(this->get_policy().is_cpu()); // std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); // std::int64_t seed = GENERATE_COPY(1, 777, 999); -// engine_collection 
collection(this->get_queue(), 2, seed); +// engine_collection collection(this->get_queue(), 2, seed); // auto engine_arr = collection.get_engines(); diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst index ce8ca0ec0cc..1da1fd7bc15 100644 --- a/docs/source/daal/algorithms/engines/mrg32k3a.rst +++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst @@ -17,7 +17,8 @@ mrg32k3a ======== -The engine is based on the 59-bit multiplicative congruential generator. +The engine based on a 32-bit combined multiple recursive generator +with two components of order 3, optimized for batch processing. .. rubric:: Subsequence selection methods support From 72755dbd3767d62b320f5ba89b0e86b64068358b Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Mon, 16 Dec 2024 08:38:17 -0800 Subject: [PATCH 12/18] minor fixes --- .../algorithms/engines/mrg32k3a/mrg32k3a.h | 2 +- .../engines/mrg32k3a/mrg32k3a_types.h | 2 +- .../engines/philox4x32x10/philox4x32x10.h | 2 +- .../philox4x32x10/philox4x32x10_types.h | 2 +- .../vertex_partitioning_default_kernel.hpp | 7 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 33 ++- .../algo/louvain/backend/cpu/louvain_data.hpp | 1 - .../vertex_partitioning_default_kernel.hpp | 2 +- .../objective_function/test/fixture.hpp | 3 +- .../objective_function/test/spmd_fixture.hpp | 4 +- .../optimizers/test/cg_solver_dpc.cpp | 3 +- .../primitives/optimizers/test/fixture.hpp | 5 +- .../optimizers/test/newton_cg_dpc.cpp | 12 +- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 212 ++++++++---------- .../dal/backend/primitives/rng/rng_dpc.cpp | 96 ++++---- .../backend/primitives/rng/test/rng_dpc.cpp | 30 +-- .../daal/algorithms/engines/mrg32k3a.rst | 2 +- 17 files changed, 191 insertions(+), 227 deletions(-) diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h index 518d26e01f1..a70c1853e1a 100644 --- 
a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -17,7 +17,7 @@ /* //++ -// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator // with two components of order 3, optimized for batch processing. //-- */ diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h index 8d697dfd72a..8fdc58b98c8 100644 --- a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h @@ -17,7 +17,7 @@ /* //++ -// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator // with two components of order 3, optimized for batch processing. //-- */ diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h index e57798be50a..3a5d0e33180 100644 --- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h @@ -17,7 +17,7 @@ /* //++ -// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) // that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. 
//-- */ diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h index 778b81f4ec9..0c0a92c9b3a 100644 --- a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h @@ -17,7 +17,7 @@ /* //++ -// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) // that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. //-- */ diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index bdda9048082..439fa4665d5 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -91,8 +91,11 @@ std::int32_t most_frequent_element(const std::atomic *components, std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); dal::backend::primitives::host_engine eng; - dal::backend::primitives::rng rn_gen; - rn_gen.uniform_cpu(samples_count, rnd_vertex_ids, eng, 0, vertex_count); + dal::backend::primitives::uniform_cpu(samples_count, + rnd_vertex_ids, + eng, + 0, + vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index c846c77a38c..cd6659d9814 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ 
b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -396,14 +396,13 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or Index* const node_list_ptr = node_list_host.get_mutable_data(); for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - pr::rng rn_gen; Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - rn_gen.uniform_cpu(ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_method[engine_offset + node_idx], - 0, - ctx.row_total_count_); + pr::uniform_cpu(ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_method[engine_offset + node_idx], + 0, + ctx.row_total_count_); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; @@ -483,11 +482,10 @@ train_kernel_hist_impl::gen_feature_list( auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_); - pr::rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform_without_replacement_cpu( + pr::uniform_without_replacement_cpu( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, @@ -524,7 +522,6 @@ train_kernel_hist_impl::gen_random_thresholds( auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_); - pr::rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); // Create arrays for random generated bins @@ -537,11 +534,11 @@ train_kernel_hist_impl::gen_random_thresholds( // Generate random bins for selected features for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform_cpu(ctx.selected_ftr_count_, - random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_method[tree_map_ptr[node]], - 0.0f, - 1.0f); + pr::uniform_cpu(ctx.selected_ftr_count_, 
+ random_bins_host_ptr + node * ctx.selected_ftr_count_, + rng_engine_method[tree_map_ptr[node]], + 0.0f, + 1.0f); } auto event_rnd_generate = random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count()); @@ -1660,12 +1657,10 @@ sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); - pr::rng rn_gen; - for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle_cpu(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block]); + pr::shuffle_cpu(oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block]); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index bd5773ff093..ecd49784378 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -124,7 +124,6 @@ struct louvain_data { value_type m; host_engine eng; - rng rn_gen; const std::int64_t vertex_count; const std::int64_t edge_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index e758d769a01..70ceb84ac6e 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform_cpu(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); + uniform_cpu(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], 
ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index 03f751e570b..6a1247a67c4 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -572,13 +572,12 @@ class logloss_test : public te::float_algo_fixture rn_gen; auto vec_host = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { primitives::host_engine eng(2007 + dim * num_checks + ij); - rn_gen.uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); + pr::uniform_cpu(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index 985d863b1f5..cf3a2426dd6 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test { std::int64_t num_checks = 5; std::vector> vecs_host(num_checks), vecs_gpu(num_checks); - rng rn_gen; + for (std::int64_t ij = 0; ij < num_checks; ++ij) { host_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); - rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); + uniform_cpu(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue()); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp 
b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index c912a3a99d2..27af73de1e9 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -43,9 +43,8 @@ class cg_solver_test : public te::float_algo_fixture { x_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; primitives::host_engine eng(4014 + n_); - rn_gen.uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index 120f65f61f0..e941c971302 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -133,11 +133,10 @@ void create_stable_matrix(sycl::queue& queue, ONEDAL_ASSERT(A.get_dimension(1) == n); auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); - primitives::rng rn_gen; primitives::host_engine eng(2007 + n); - rn_gen.uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0); - rn_gen.uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); + primitives::uniform_cpu(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index d7414924bf6..a4c0c1ebed3 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ 
b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host); auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); - primitives::rng rn_gen; + primitives::host_engine eng(2007 + n); - rn_gen.uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); - rn_gen.uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); + primitives::rnguniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + primitives::rnguniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -142,9 +142,9 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_, n_ }, sycl::usm::alloc::host); solution_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; + primitives::engine eng(4014 + n_); - rn_gen.uniform_cpu(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); + uniform_cpu(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform_cpu(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); + uniform_cpu(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index 
cb235b9e4c3..b935d338ff9 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -25,128 +25,110 @@ #endif namespace oneapi::dal::backend::primitives { -template -class rng { -public: - rng() = default; - ~rng() = default; - - template - void uniform_cpu(Size count, Type* dst, host_engine host_engine, Type a, Type b) { - auto state = host_engine.get_host_engine_state(); - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } -#ifdef ONEDAL_DATA_PARALLEL - template - void uniform_cpu(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { - if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == - sycl::usm::alloc::device) { - throw domain_error(dal::detail::error_messages::unsupported_data_type()); - } - auto state = engine_.get_host_engine_state(); - engine_.skip_ahead_gpu(count); - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +template +void uniform_cpu(Size count, Type* dst, host_engine& host_engine, Type a, Type b) { + auto state = host_engine.get_host_engine_state(); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +} +template +void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + host_engine host_engine, + Type a, + Type b) { + auto state = host_engine.get_host_engine_state(); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +template >> +void shuffle_cpu(Size count, Type* dst, host_engine host_engine) { + Type idx[2]; + auto state = host_engine.get_host_engine_state(); + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); } -#endif +} - template - void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - host_engine host_engine, - Type a, - Type b) { - auto state = host_engine.get_host_engine_state(); - uniform_dispatcher::uniform_without_replacement_by_cpu(count, 
- dst, - buffer, - state, - a, - b); - } #ifdef ONEDAL_DATA_PARALLEL - template - void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - dpc_engine& engine_, - Type a, - Type b) { - if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == - sycl::usm::alloc::device) { - throw domain_error(dal::detail::error_messages::unsupported_data_type()); - } - void* state = engine_.get_host_engine_state(); - engine_.skip_ahead_gpu(count); - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); +template +void uniform_cpu(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); } -#endif - - template >> - void shuffle_cpu(Size count, Type* dst, host_engine host_engine) { - Type idx[2]; - auto state = host_engine.get_host_engine_state(); - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } + auto state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +} + +template +void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); } - -#ifdef ONEDAL_DATA_PARALLEL - template >> - void shuffle_cpu(Size count, Type* dst, dpc_engine& engine_) { - Type idx[2]; - if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == - sycl::usm::alloc::device) { - throw domain_error(dal::detail::error_messages::unsupported_data_type()); - } - void* state = engine_.get_host_engine_state(); - engine_.skip_ahead_gpu(count); - - 
for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } + void* state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +template >> +void shuffle_cpu(Size count, Type* dst, dpc_engine& engine_) { + Type idx[2]; + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); } -#endif + void* state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); -#ifdef ONEDAL_DATA_PARALLEL - template - void uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); - - template - void uniform_without_replacement_gpu(sycl::queue& queue, - Size count, - Type* dst, - Type* buffer, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); - - template - void shuffle_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - const event_vector& deps = {}); + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } +} + +template +void uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + +template +void uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + +template +void shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps = {}); #endif -}; -}; // namespace oneapi::dal::backend::primitives \ No newline at end of file +}; // namespace oneapi::dal::backend::primitives diff --git 
a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index cb550a50775..ec586ec1697 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -22,15 +22,14 @@ namespace oneapi::dal::backend::primitives { namespace bk = oneapi::dal::backend; -template -template -void rng::uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps) { +template +void uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); } @@ -41,16 +40,15 @@ void rng::uniform_gpu(sycl::queue& queue, } //Currently only CPU impl -template -template -void rng::uniform_without_replacement_gpu(sycl::queue& queue, - Size count, - Type* dst, - Type* buffer, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps) { +template +void uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); @@ -61,13 +59,12 @@ void rng::uniform_without_replacement_gpu(sycl::queue& queue, } //Currently only CPU impl -template -template -void rng::shuffle_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - const event_vector& deps) { +template +void shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps) { Type idx[2]; if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { @@ -82,14 +79,14 @@ 
void rng::shuffle_gpu(sycl::queue& queue, } } -#define INSTANTIATE_(F, Size, EngineType) \ - template ONEDAL_EXPORT void rng::uniform_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - dpc_engine& engine_, \ - F a, \ - F b, \ - const event_vector& deps); +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); #define INSTANTIATE_FLOAT_(Size) \ INSTANTIATE_(float, Size, engine_method::mt2203) \ @@ -111,16 +108,15 @@ void rng::shuffle_gpu(sycl::queue& queue, INSTANTIATE_FLOAT_(std::int64_t); INSTANTIATE_FLOAT_(std::int32_t); -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ - template ONEDAL_EXPORT void rng::uniform_without_replacement_gpu( \ - sycl::queue& queue, \ - Size count_, \ - F* dst, \ - F* buff, \ - dpc_engine& engine_, \ - F a, \ - F b, \ - const event_vector& deps); +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform_without_replacement_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); #define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203) \ @@ -142,12 +138,12 @@ INSTANTIATE_FLOAT_(std::int32_t); INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); -#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ - template ONEDAL_EXPORT void rng::shuffle_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - dpc_engine& engine_, \ - const event_vector& deps); +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void shuffle_gpu(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + const event_vector& deps); #define 
INSTANTIATE_SHUFFLE_FLOAT(Size) \ INSTANTIATE_SHUFFLE(int, Size, engine_method::mt2203) \ diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 244ac91c72e..3a96d6780c1 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -68,11 +68,6 @@ class rng_test : public te::policy_fixture { using EngineType = std::tuple_element_t<1, TestType>; static constexpr auto engine_qq = engine_v; - auto get_rng() const { - rng rn_gen; - return rn_gen; - } - auto get_host_engine(std::int64_t seed) { auto rng_engine = host_engine(seed); return rng_engine; @@ -109,7 +104,7 @@ class rng_test : public te::policy_fixture { } }; -using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); +using rng_types = COMBINE_TYPES((float), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { SKIP_IF(this->get_policy().is_cpu()); @@ -121,12 +116,11 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); - auto rn_gen = this->get_rng(); auto rng_engine = this->get_engine(seed); auto rng_engine_ = this->get_engine(seed); - rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); - rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); + uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); this->check_results(arr_gpu, arr_host); } @@ -174,15 +168,14 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); - auto rn_gen = this->get_rng(); auto rng_engine = 
this->get_engine(seed); auto rng_engine_2 = this->get_engine(seed); - rn_gen.uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); - rn_gen.uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); + uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); - rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); - rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); this->check_results(arr_host_init_1, arr_host_init_2); this->check_results(arr_gpu, arr_host); @@ -204,25 +197,24 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); - auto rn_gen = this->get_rng(); auto rng_engine = this->get_engine(seed); auto rng_engine_2 = this->get_engine(seed); - rn_gen.uniform_gpu(this->get_queue(), + uniform_gpu(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); - rn_gen.uniform_gpu(this->get_queue(), + uniform_gpu(this->get_queue(), elem_count, arr_device_init_2_ptr, rng_engine_2, 0, elem_count); - rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); - rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); this->check_results(arr_device_init_1, arr_device_init_2); this->check_results(arr_gpu, arr_host); diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst index 1da1fd7bc15..3d32c4532fb 100644 --- 
a/docs/source/daal/algorithms/engines/mrg32k3a.rst +++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst @@ -1,5 +1,5 @@ .. ****************************************************************************** -.. * Copyright 2020 Intel Corporation +.. * Copyright contributors to the oneDAL project .. * .. * Licensed under the Apache License, Version 2.0 (the "License"); .. * you may not use this file except in compliance with the License. From 76967f3cacfe591d3ebba53dc1780711899626eb Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 17 Dec 2024 02:29:58 -0800 Subject: [PATCH 13/18] minor fixes --- .../engines/mcg59/mcg59_batch_impl.h | 7 +- .../engines/mrg32k3a/mrg32k3a_batch_impl.h | 7 +- .../engines/mt19937/mt19937_batch_impl.h | 7 +- .../engines/mt2203/mt2203_batch_impl.h | 7 +- .../philox4x32x10/philox4x32x10_batch_impl.h | 7 +- cpp/daal/src/externals/service_rng_mkl.h | 4 ++ cpp/daal/src/externals/service_rng_openrng.h | 4 ++ cpp/daal/src/externals/service_rng_ref.h | 16 +++-- .../dal/backend/primitives/rng/rng_dpc.cpp | 2 +- .../backend/primitives/rng/test/rng_dpc.cpp | 2 +- .../daal/algorithms/engines/philox4x32x10.rst | 64 +++++++++++++++++++ 11 files changed, 94 insertions(+), 33 deletions(-) create mode 100644 docs/source/daal/algorithms/engines/philox4x32x10.rst diff --git a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h index 6c3040da615..62f337ba9a0 100644 --- a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h @@ -26,9 +26,6 @@ #include "src/externals/service_rng.h" #include "src/data_management/service_numeric_table.h" -static const int leapfrogMethodErrcode = -1002; -static const int skipAheadMethodErrcode = -1003; - namespace daal { namespace algorithms @@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mcg59::interface1::Batchleapfrog(threadNum, nThreads); services::Status s; - if (errcode == 
leapfrogMethodErrcode) + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) s.add(ErrorLeapfrogUnsupported); else if (errcode) s.add(ErrorIncorrectErrorcodeFromGenerator); @@ -199,7 +196,7 @@ class BatchImpl : public algorithms::engines::mt2203::interface1::BatchskipAhead(nSkip); services::Status s; - if (errcode == skipAheadMethodErrcode) + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) s.add(ErrorSkipAheadUnsupported); else if (errcode) s.add(ErrorIncorrectErrorcodeFromGenerator); diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h index 58e28eb47bf..1f7b40526ac 100644 --- a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -26,9 +26,6 @@ #include "src/externals/service_rng.h" #include "src/data_management/service_numeric_table.h" -static const int leapfrogMethodErrcode = -1002; -static const int skipAheadMethodErrcode = -1003; - namespace daal { namespace algorithms @@ -68,7 +65,7 @@ class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch Date: Tue, 17 Dec 2024 03:46:58 -0800 Subject: [PATCH 14/18] fixes --- .../backend/gpu/train_kernel_hist_impl.hpp | 14 ++-- .../gpu/train_kernel_hist_impl_dpc.cpp | 16 ++-- .../optimizers/test/newton_cg_dpc.cpp | 4 +- .../dal/backend/primitives/rng/dpc_engine.hpp | 2 +- .../backend/primitives/rng/host_engine.hpp | 6 +- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 12 +-- .../dal/backend/primitives/rng/rng_dpc.cpp | 77 +++++++++--------- .../backend/primitives/rng/test/rng_dpc.cpp | 81 +++++++------------ .../daal/algorithms/engines/mrg32k3a.rst | 28 +++---- .../daal/algorithms/engines/philox4x32x10.rst | 28 +++---- 10 files changed, 123 insertions(+), 145 deletions(-) diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp 
b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index ac04f73d89f..f677e69b615 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -51,7 +51,7 @@ class train_kernel_hist_impl { using train_context_t = train_context; using imp_data_t = impurity_data; using rng_engine_t = pr::host_engine; - using rng_engine_method_t = std::vector; + using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; using node_t = node; @@ -79,7 +79,7 @@ class train_kernel_hist_impl { Index class_count) const; sycl::event gen_initial_tree_order(train_context_t& ctx, - rng_engine_method_t& rng_engine_method, + rng_engine_list_t& rng_engine_list, pr::ndarray& node_list, pr::ndarray& tree_order_level, Index engine_offset, @@ -115,12 +115,12 @@ class train_kernel_hist_impl { /// @param[in] ctx a training context structure for a GPU backend /// @param[in] node_count number of nodes on the current level /// @param[in] node_vs_tree_map an initial tree order - /// @param[in] rng_engine_method a list of random generator engines + /// @param[in] rng_engine_list a list of random generator engines std::tuple, sycl::event> gen_feature_list( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map, - rng_engine_method_t& rng_engine_method); + rng_engine_list_t& rng_engine_list); /// Generates random thresholds for each node and for each selected feature for node. /// Thresholds are used for a random splitter kernel to split each node. 
@@ -129,12 +129,12 @@ class train_kernel_hist_impl { /// @param[in] ctx a training context structure for a GPU backend /// @param[in] node_count number of nodes on the current level /// @param[in] node_vs_tree_map an initial tree order - /// @param[in] rng_engine_method a list of random generator engines + /// @param[in] rng_engine_list a list of random generator engines std::tuple, sycl::event> gen_random_thresholds( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map, - rng_engine_method_t& rng_engine_method); + rng_engine_list_t& rng_engine_list); /// Computes initial impurity for each node. /// @@ -575,7 +575,7 @@ class train_kernel_hist_impl { pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_method_t& rng_engine_arr, + const rng_engine_list_t& rng_engine_arr, Index tree_idx, Index tree_in_block, Index built_tree_count, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index cd6659d9814..0e6d4bc3a36 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -368,7 +368,7 @@ void train_kernel_hist_impl::allocate_buffers(const tra template sycl::event train_kernel_hist_impl::gen_initial_tree_order( train_context_t& ctx, - rng_engine_method_t& rng_engine_method, + rng_engine_list_t& rng_engine_list, pr::ndarray& node_list_host, pr::ndarray& tree_order_level, Index engine_offset, @@ -400,7 +400,7 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; pr::uniform_cpu(ctx.selected_row_total_count_, gen_row_idx_global_ptr, - rng_engine_method[engine_offset + node_idx], + rng_engine_list[engine_offset + node_idx], 0, ctx.row_total_count_); @@ -464,7 +464,7 @@ 
train_kernel_hist_impl::gen_feature_list( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map_list, - rng_engine_method_t& rng_engine_method) { + rng_engine_list_t& rng_engine_list) { ONEDAL_PROFILER_TASK(gen_feature_list, queue_); ONEDAL_ASSERT(node_vs_tree_map_list.get_count() == node_count); @@ -489,7 +489,7 @@ train_kernel_hist_impl::gen_feature_list( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_method[tree_map_ptr[node]], + rng_engine_list[tree_map_ptr[node]], 0, ctx.column_count_); } @@ -515,7 +515,7 @@ train_kernel_hist_impl::gen_random_thresholds( const train_context_t& ctx, Index node_count, const pr::ndarray& node_vs_tree_map, - rng_engine_method_t& rng_engine_method) { + rng_engine_list_t& rng_engine_list) { ONEDAL_PROFILER_TASK(gen_random_thresholds, queue_); ONEDAL_ASSERT(node_vs_tree_map.get_count() == node_count); @@ -536,7 +536,7 @@ train_kernel_hist_impl::gen_random_thresholds( for (Index node = 0; node < node_count; ++node) { pr::uniform_cpu(ctx.selected_ftr_count_, random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_method[tree_map_ptr[node]], + rng_engine_list[tree_map_ptr[node]], 0.0f, 1.0f); } @@ -1610,7 +1610,7 @@ sycl::event train_kernel_hist_impl::compute_results( pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_method_t& engine_arr, + const rng_engine_list_t& engine_arr, Index tree_idx_in_block, Index tree_in_block_count, Index built_tree_count, @@ -1854,7 +1854,7 @@ train_result train_kernel_hist_impl::operator()( de::check_mul_overflow((ctx.tree_count_ - 1), skip_num); pr::engine_collection collection(ctx.tree_count_, desc.get_seed()); - rng_engine_method_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { + rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { skip = i * 
skip_num; }); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index a4c0c1ebed3..bce7df11d0e 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -58,8 +58,8 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); primitives::host_engine eng(2007 + n); - primitives::rnguniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); - primitives::rnguniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); + primitives::uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + primitives::uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { diff --git a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp index 1f13975e8d6..164a1578490 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp @@ -88,7 +88,7 @@ class dpc_engine { } void skip_ahead_gpu(size_t nSkip) { - // Will be fixed in the next oneMKL release. + // Will be supported in the next oneMKL release. 
if constexpr (EngineType == engine_method::mt2203) { } else { diff --git a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp index 436e032e608..36779186413 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp @@ -16,12 +16,14 @@ #pragma once +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/rng/utils.hpp" + #include #include #include -#include "oneapi/dal/backend/primitives/rng/rng.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" + namespace oneapi::dal::backend::primitives { template diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index b935d338ff9..b55bfa517a7 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -31,6 +31,7 @@ void uniform_cpu(Size count, Type* dst, host_engine& host_engine, Ty auto state = host_engine.get_host_engine_state(); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } + template void uniform_without_replacement_cpu(Size count, Type* dst, @@ -48,8 +49,8 @@ template >> void shuffle_cpu(Size count, Type* dst, host_engine host_engine) { - Type idx[2]; auto state = host_engine.get_host_engine_state(); + Type idx[2]; for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); @@ -64,8 +65,8 @@ void uniform_cpu(Size count, Type* dst, dpc_engine& engine_, Type a, throw domain_error(dal::detail::error_messages::unsupported_data_type()); } auto state = engine_.get_host_engine_state(); - engine_.skip_ahead_gpu(count); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + engine_.skip_ahead_gpu(count); } template @@ -80,8 +81,8 @@ void uniform_without_replacement_cpu(Size count, throw 
domain_error(dal::detail::error_messages::unsupported_data_type()); } void* state = engine_.get_host_engine_state(); - engine_.skip_ahead_gpu(count); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); + engine_.skip_ahead_gpu(count); } template >> void shuffle_cpu(Size count, Type* dst, dpc_engine& engine_) { - Type idx[2]; if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); } + Type idx[2]; void* state = engine_.get_host_engine_state(); - engine_.skip_ahead_gpu(count); - for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); } + engine_.skip_ahead_gpu(count); } template diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 6fc39290f96..1c162ca0e5b 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -88,23 +88,22 @@ void shuffle_gpu(sycl::queue& queue, F b, \ const event_vector& deps); -#define INSTANTIATE_FLOAT_(Size) \ - INSTANTIATE_(float, Size, engine_method::mt2203) \ - INSTANTIATE_(float, Size, engine_method::mcg59) \ - INSTANTIATE_(float, Size, engine_method::mrg32k3a) \ - INSTANTIATE_(float, Size, engine_method::philox4x32x10) \ - INSTANTIATE_(float, Size, engine_method::mt19937) \ - INSTANTIATE_(double, Size, engine_method::mt2203) \ - INSTANTIATE_(double, Size, engine_method::mcg59) \ - INSTANTIATE_(double, Size, engine_method::mrg32k3a) \ - INSTANTIATE_(double, Size, engine_method::philox4x32x10) \ - INSTANTIATE_(double, Size, engine_method::mt19937) \ - INSTANTIATE_(int, Size, engine_method::mt2203) \ - INSTANTIATE_(int, Size, engine_method::mcg59) \ - INSTANTIATE_(int, Size, engine_method::mrg32k3a) \ - INSTANTIATE_(int, Size, engine_method::philox4x32x10) \ - INSTANTIATE_(int, Size, 
engine_method::mt19937) - +#define INSTANTIATE_FLOAT_(Size) \ + INSTANTIATE_(float, Size, engine_method::mt2203) \ + INSTANTIATE_(float, Size, engine_method::mcg59) \ + INSTANTIATE_(float, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(float, Size, engine_method::mt19937) \ + INSTANTIATE_(double, Size, engine_method::mt2203) \ + INSTANTIATE_(double, Size, engine_method::mcg59) \ + INSTANTIATE_(double, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(double, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(double, Size, engine_method::mt19937) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mt19937) INSTANTIATE_FLOAT_(std::int64_t); INSTANTIATE_FLOAT_(std::int32_t); @@ -118,22 +117,22 @@ INSTANTIATE_FLOAT_(std::int32_t); F b, \ const event_vector& deps); -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, 
engine_method::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_method::mt19937) +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt19937) INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); @@ -145,12 +144,12 @@ 
INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); dpc_engine& engine_, \ const event_vector& deps); -#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ - INSTANTIATE_SHUFFLE(int, Size, engine_method::mt2203) \ - INSTANTIATE_SHUFFLE(int, Size, engine_method::mcg59) \ - INSTANTIATE_SHUFFLE(int, Size, engine_method::mrg32k3a) \ - INSTANTIATE_SHUFFLE(int, Size, engine_method::philox4x32x10) \ - INSTANTIATE_SHUFFLE(int, Size, engine_method::mt19937) +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt19937) INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 466ce350e9b..7e07b65f411 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -66,28 +66,26 @@ class rng_test : public te::policy_fixture { public: using DataType = std::tuple_element_t<0, TestType>; using EngineType = std::tuple_element_t<1, TestType>; - static constexpr auto engine_qq = engine_v; + static constexpr auto engine_test_type = engine_v; auto get_host_engine(std::int64_t seed) { - auto rng_engine = host_engine(seed); + auto rng_engine = host_engine(seed); return rng_engine; } - auto get_engine(std::int64_t seed) { - auto rng_engine = dpc_engine(this->get_queue(), seed); + auto get_dpc_engine(std::int64_t seed) { + auto rng_engine = dpc_engine(this->get_queue(), seed); return rng_engine; } auto allocate_array_host(std::int64_t elem_count) { auto arr_host = ndarray::empty({ elem_count }); - return arr_host; } auto 
allocate_array_device(std::int64_t elem_count) { auto& q = this->get_queue(); auto arr_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); - return arr_gpu; } @@ -99,15 +97,18 @@ class rng_test : public te::policy_fixture { const DataType* val_arr_2_host_ptr = arr_2_host.get_data(); for (std::int64_t el = 0; el < arr_2_host.get_count(); el++) { - REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 1); + // Due to MKL inside generates floats on GPU and doubles on CPU, it makes sense to add minor eps. + REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 0.1); } } }; -using rng_types = COMBINE_TYPES((float), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); +using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); std::int64_t seed = GENERATE_COPY(777, 999); @@ -116,44 +117,22 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); - auto rng_engine = this->get_engine(seed); - auto rng_engine_ = this->get_engine(seed); + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_ = this->get_dpc_engine(seed); - uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); - uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); + uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); this->check_results(arr_gpu, arr_host); } -using rng_types_skip_ahead_support = COMBINE_TYPES((float), +using rng_types_skip_ahead_support = COMBINE_TYPES((float, double), (mt19937, mcg59, mrg32k3a, philox4x32x10)); -// //Just for perf 
tests -// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip_ahead_support) { -// SKIP_IF(this->get_policy().is_cpu()); -// std::int64_t elem_count = GENERATE_COPY(10000); -// std::int64_t seed = GENERATE_COPY(777); - -// auto arr_host = this->allocate_array_host(elem_count); -// auto arr_host_ptr_ = arr_host.get_mutable_data(); - -// auto arr_host_fake = this->allocate_array_host(1); -// auto arr_host_ptr_fake = arr_host_fake.get_mutable_data(); -// auto rn_gen_ = this->get_rng(); -// auto rng_engine_1 = this->get_engine(seed); - -// BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { -// rn_gen_.uniform_without_replacement_cpu(elem_count, -// arr_host_ptr_, -// arr_host_ptr_fake, -// rng_engine_1, -// 0, -// elem_count); -// }; -// } - TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); std::int64_t seed = GENERATE_COPY(777, 999); @@ -168,14 +147,14 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); - auto rng_engine = this->get_engine(seed); - auto rng_engine_2 = this->get_engine(seed); + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_2 = this->get_dpc_engine(seed); - uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); - uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); + uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); - uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); - uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, 
elem_count); + uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); this->check_results(arr_host_init_1, arr_host_init_2); this->check_results(arr_gpu, arr_host); @@ -183,6 +162,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); std::int64_t seed = GENERATE_COPY(1, 777, 999); @@ -197,24 +178,24 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe auto arr_gpu_ptr = arr_gpu.get_mutable_data(); auto arr_host_ptr = arr_host.get_mutable_data(); - auto rng_engine = this->get_engine(seed); - auto rng_engine_2 = this->get_engine(seed); + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_2 = this->get_dpc_engine(seed); - uniform_gpu(this->get_queue(), + uniform_gpu(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); - uniform_gpu(this->get_queue(), + uniform_gpu(this->get_queue(), elem_count, arr_device_init_2_ptr, rng_engine_2, 0, elem_count); - uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); - uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); this->check_results(arr_device_init_1, arr_device_init_2); this->check_results(arr_gpu, arr_host); @@ -228,7 +209,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe // engine_collection collection(this->get_queue(), 2, seed); -// auto engine_arr = collection.get_engines(); +// auto engine_arr = collection.get_dpc_engines(); // auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count); diff --git 
a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst index 3d32c4532fb..e931c801890 100644 --- a/docs/source/daal/algorithms/engines/mrg32k3a.rst +++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst @@ -1,18 +1,16 @@ -.. ****************************************************************************** -.. * Copyright contributors to the oneDAL project -.. * -.. * Licensed under the Apache License, Version 2.0 (the "License"); -.. * you may not use this file except in compliance with the License. -.. * You may obtain a copy of the License at -.. * -.. * http://www.apache.org/licenses/LICENSE-2.0 -.. * -.. * Unless required by applicable law or agreed to in writing, software -.. * distributed under the License is distributed on an "AS IS" BASIS, -.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -.. * See the License for the specific language governing permissions and -.. * limitations under the License. -.. *******************************************************************************/ +.. Copyright contributors to the oneDAL project +.. +.. Licensed under the Apache License, Version 2.0 (the "License"); +.. you may not use this file except in compliance with the License. +.. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, software +.. distributed under the License is distributed on an "AS IS" BASIS, +.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. See the License for the specific language governing permissions and +.. limitations under the License. 
mrg32k3a ======== diff --git a/docs/source/daal/algorithms/engines/philox4x32x10.rst b/docs/source/daal/algorithms/engines/philox4x32x10.rst index 83e1a4a2c3f..ac50ea80fdb 100644 --- a/docs/source/daal/algorithms/engines/philox4x32x10.rst +++ b/docs/source/daal/algorithms/engines/philox4x32x10.rst @@ -1,18 +1,16 @@ -.. ****************************************************************************** -.. * Copyright contributors to the oneDAL project -.. * -.. * Licensed under the Apache License, Version 2.0 (the "License"); -.. * you may not use this file except in compliance with the License. -.. * You may obtain a copy of the License at -.. * -.. * http://www.apache.org/licenses/LICENSE-2.0 -.. * -.. * Unless required by applicable law or agreed to in writing, software -.. * distributed under the License is distributed on an "AS IS" BASIS, -.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -.. * See the License for the specific language governing permissions and -.. * limitations under the License. -.. *******************************************************************************/ +.. Copyright contributors to the oneDAL project +.. +.. Licensed under the Apache License, Version 2.0 (the "License"); +.. you may not use this file except in compliance with the License. +.. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, software +.. distributed under the License is distributed on an "AS IS" BASIS, +.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. See the License for the specific language governing permissions and +.. limitations under the License. 
philox4x32x10 ============= From 06d9f821da1ebc2f2f69399e0c80fc8fa1fc45e5 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 17 Dec 2024 06:37:48 -0800 Subject: [PATCH 15/18] add comments and minor renaming --- .../vertex_partitioning_default_kernel.hpp | 10 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 28 ++-- .../vertex_partitioning_default_kernel.hpp | 2 +- .../objective_function/test/fixture.hpp | 2 +- .../objective_function/test/spmd_fixture.hpp | 2 +- .../optimizers/test/cg_solver_dpc.cpp | 2 +- .../primitives/optimizers/test/fixture.hpp | 4 +- .../optimizers/test/newton_cg_dpc.cpp | 8 +- .../dal/backend/primitives/rng/dpc_engine.hpp | 15 ++ .../backend/primitives/rng/host_engine.hpp | 11 ++ cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 72 ++++---- .../dal/backend/primitives/rng/rng_dpc.cpp | 157 +++++++++++------- .../backend/primitives/rng/test/rng_dpc.cpp | 37 ++--- 13 files changed, 200 insertions(+), 150 deletions(-) diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 439fa4665d5..bdcc3f1487a 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -91,11 +91,11 @@ std::int32_t most_frequent_element(const std::atomic *components, std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); dal::backend::primitives::host_engine eng; - dal::backend::primitives::uniform_cpu(samples_count, - rnd_vertex_ids, - eng, - 0, - vertex_count); + dal::backend::primitives::uniform(samples_count, + rnd_vertex_ids, + eng, + 0, + vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp 
b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index 0e6d4bc3a36..21a9cc440d0 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -398,11 +398,11 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or for (Index node_idx = 0; node_idx < node_count; ++node_idx) { Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - pr::uniform_cpu(ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx], - 0, - ctx.row_total_count_); + pr::uniform(ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_list[engine_offset + node_idx], + 0, + ctx.row_total_count_); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; @@ -485,7 +485,7 @@ train_kernel_hist_impl::gen_feature_list( auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { for (Index node = 0; node < node_count; ++node) { - pr::uniform_without_replacement_cpu( + pr::uniform_without_replacement( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, @@ -534,11 +534,11 @@ train_kernel_hist_impl::gen_random_thresholds( // Generate random bins for selected features for (Index node = 0; node < node_count; ++node) { - pr::uniform_cpu(ctx.selected_ftr_count_, - random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]], - 0.0f, - 1.0f); + pr::uniform(ctx.selected_ftr_count_, + random_bins_host_ptr + node * ctx.selected_ftr_count_, + rng_engine_list[tree_map_ptr[node]], + 0.0f, + 1.0f); } auto event_rnd_generate = random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count()); @@ -1658,9 +1658,9 @@ 
sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - pr::shuffle_cpu(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block]); + pr::shuffle(oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block]); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index 70ceb84ac6e..e287c3f2f66 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - uniform_cpu(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); + uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index 6a1247a67c4..6cf2b73ccd6 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -577,7 +577,7 @@ class logloss_test : public te::float_algo_fixture(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); + pr::uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git 
a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index cf3a2426dd6..63ab0a07c13 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -105,7 +105,7 @@ class logloss_spmd_test : public logloss_test { host_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); - uniform_cpu(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); + uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue()); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index 27af73de1e9..b529836f70e 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -44,7 +44,7 @@ class cg_solver_test : public te::float_algo_fixture { b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::host_engine eng(4014 + n_); - primitives::uniform_cpu(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index e941c971302..c188c50983c 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -135,8 +135,8 @@ void create_stable_matrix(sycl::queue& queue, auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); primitives::host_engine eng(2007 + n); - primitives::uniform_cpu(n * n, 
J.get_mutable_data(), eng, -1.0, 1.0); - primitives::uniform_cpu(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); + primitives::uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index bce7df11d0e..b2ebe9f5bdb 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -58,8 +58,8 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); primitives::host_engine eng(2007 + n); - primitives::uniform_cpu(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); - primitives::uniform_cpu(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); + primitives::uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + primitives::uniform(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture { auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::engine eng(4014 + n_); - uniform_cpu(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); + uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - uniform_cpu(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); + uniform(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); auto 
x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp index 164a1578490..9b9745f4cfa 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp @@ -53,6 +53,21 @@ struct dpc_engine_type { using type = oneapi::mkl::rng::philox4x32x10; }; +/// A class that provides a unified interface for random number generation on both CPU and GPU devices. +/// +/// This class serves as a wrapper for random number generators (RNGs) that supports different engine types, +/// enabling efficient random number generation on heterogeneous platforms using SYCL. It integrates a host +/// (CPU) engine and a device (GPU) engine, allowing operations to be executed seamlessly on the appropriate +/// device. +/// +/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`. +/// +/// @param[in] queue The SYCL queue used to manage device operations. +/// @param[in] seed The initial seed for the random number generator. Defaults to `777`. +/// +/// The class provides functionality to skip ahead in the RNG sequence, retrieve engine states, and +/// manage host and device engines independently. Support for `skip_ahead` on GPU is currently limited for +/// some engine types. template class dpc_engine { public: diff --git a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp index 36779186413..c4b2c807674 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp @@ -26,6 +26,17 @@ namespace oneapi::dal::backend::primitives { +/// A class that provides an interface for random number generation on the host (CPU) only. 
+/// +/// This class serves as a wrapper for host-based random number generators (RNGs), supporting multiple engine +/// types for flexible and efficient random number generation on CPU. It abstracts the underlying engine +/// implementation and provides an interface to manage and retrieve the engine's state. +/// +/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`. +/// +/// @param[in] seed The initial seed for the random number generator. Defaults to `777`. +/// +/// @note The class only supports host-based RNG and does not require a SYCL queue or device context. template class host_engine { public: diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index b55bfa517a7..83125ba73e7 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -27,18 +27,18 @@ namespace oneapi::dal::backend::primitives { template -void uniform_cpu(Size count, Type* dst, host_engine& host_engine, Type a, Type b) { +void uniform(Size count, Type* dst, host_engine& host_engine, Type a, Type b) { auto state = host_engine.get_host_engine_state(); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } template -void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - host_engine host_engine, - Type a, - Type b) { +void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + host_engine host_engine, + Type a, + Type b) { auto state = host_engine.get_host_engine_state(); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); } @@ -48,7 +48,7 @@ template >> -void shuffle_cpu(Size count, Type* dst, host_engine host_engine) { +void shuffle(Size count, Type* dst, host_engine host_engine) { auto state = host_engine.get_host_engine_state(); Type idx[2]; for (Size i = 0; i < count; ++i) { @@ -59,7 +59,7 @@ void shuffle_cpu(Size count, Type* dst, host_engine host_engine) { 
#ifdef ONEDAL_DATA_PARALLEL template -void uniform_cpu(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { +void uniform(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); @@ -70,12 +70,12 @@ void uniform_cpu(Size count, Type* dst, dpc_engine& engine_, Type a, } template -void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - dpc_engine& engine_, - Type a, - Type b) { +void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); @@ -90,7 +90,7 @@ template >> -void shuffle_cpu(Size count, Type* dst, dpc_engine& engine_) { +void shuffle(Size count, Type* dst, dpc_engine& engine_) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); @@ -105,30 +105,30 @@ void shuffle_cpu(Size count, Type* dst, dpc_engine& engine_) { } template -void uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); +void uniform(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); template -void uniform_without_replacement_gpu(sycl::queue& queue, - Size count, - Type* dst, - Type* buffer, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); +void uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); template -void shuffle_gpu(sycl::queue& queue, - Size 
count, - Type* dst, - dpc_engine& engine_, - const event_vector& deps = {}); +void shuffle(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps = {}); #endif }; // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 1c162ca0e5b..4ad09c4cc99 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -23,13 +23,13 @@ namespace oneapi::dal::backend::primitives { namespace bk = oneapi::dal::backend; template -void uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps) { +void uniform(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); } @@ -41,14 +41,14 @@ void uniform_gpu(sycl::queue& queue, //Currently only CPU impl template -void uniform_without_replacement_gpu(sycl::queue& queue, - Size count, - Type* dst, - Type* buffer, - dpc_engine& engine_, - Type a, - Type b, - const event_vector& deps) { +void uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { throw domain_error(dal::detail::error_messages::unsupported_data_type()); @@ -60,11 +60,11 @@ void uniform_without_replacement_gpu(sycl::queue& queue, //Currently only CPU impl template -void shuffle_gpu(sycl::queue& queue, - Size count, - Type* dst, - dpc_engine& engine_, - const event_vector& deps) { +void shuffle(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + 
const event_vector& deps) { Type idx[2]; if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::device) { @@ -79,14 +79,14 @@ void shuffle_gpu(sycl::queue& queue, } } -#define INSTANTIATE_(F, Size, EngineType) \ - template ONEDAL_EXPORT void uniform_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - dpc_engine& engine_, \ - F a, \ - F b, \ - const event_vector& deps); +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); #define INSTANTIATE_FLOAT_(Size) \ INSTANTIATE_(float, Size, engine_method::mt2203) \ @@ -107,42 +107,71 @@ void shuffle_gpu(sycl::queue& queue, INSTANTIATE_FLOAT_(std::int64_t); INSTANTIATE_FLOAT_(std::int32_t); -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ - template ONEDAL_EXPORT void uniform_without_replacement_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - F* buff, \ - dpc_engine& engine_, \ - F a, \ - F b, \ - const event_vector& deps); - -#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_method::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::philox4x32x10) \ - 
INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_method::mt19937) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt2203) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mcg59) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mrg32k3a) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::philox4x32x10) \ - INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(std::int32_t, Size, engine_method::mt19937) - -INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); -INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); - -#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ - template ONEDAL_EXPORT void shuffle_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - dpc_engine& engine_, \ - const event_vector& deps); +#define INSTANTIATE_uniform_without_replacement(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform_without_replacement(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_uniform_without_replacement_FLOAT(Size) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement( \ + float, \ + Size, \ + engine_method::mcg59) INSTANTIATE_uniform_without_replacement(float, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt19937) \ + INSTANTIATE_uniform_without_replacement(double, Size, engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement(double, \ + Size, \ + engine_method::mcg59) \ + INSTANTIATE_uniform_without_replacement(double, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement( \ + double, \ + Size, \ + engine_method::philox4x32x10) \ 
+ INSTANTIATE_uniform_without_replacement( \ + double, \ + Size, \ + engine_method::mt19937) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mcg59) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mt19937) + +INSTANTIATE_uniform_without_replacement_FLOAT(std::int64_t); +INSTANTIATE_uniform_without_replacement_FLOAT(std::int32_t); + +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void shuffle(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + const event_vector& deps); #define INSTANTIATE_SHUFFLE_FLOAT(Size) \ INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt2203) \ diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 7e07b65f411..2a079f15466 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -120,8 +120,8 @@ TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { auto rng_engine = this->get_dpc_engine(seed); auto rng_engine_ = this->get_dpc_engine(seed); - uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); - uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + uniform(elem_count, arr_host_ptr, rng_engine, 0, elem_count); + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); this->check_results(arr_gpu, arr_host); } @@ -150,11 +150,11 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe auto rng_engine = this->get_dpc_engine(seed); auto 
rng_engine_2 = this->get_dpc_engine(seed); - uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); - uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + uniform(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); - uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); - uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); this->check_results(arr_host_init_1, arr_host_init_2); this->check_results(arr_gpu, arr_host); @@ -181,21 +181,16 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe auto rng_engine = this->get_dpc_engine(seed); auto rng_engine_2 = this->get_dpc_engine(seed); - uniform_gpu(this->get_queue(), - elem_count, - arr_device_init_1_ptr, - rng_engine, - 0, - elem_count); - uniform_gpu(this->get_queue(), - elem_count, - arr_device_init_2_ptr, - rng_engine_2, - 0, - elem_count); - - uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); - uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + uniform(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); + uniform(this->get_queue(), + elem_count, + arr_device_init_2_ptr, + rng_engine_2, + 0, + elem_count); + + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); this->check_results(arr_device_init_1, arr_device_init_2); this->check_results(arr_gpu, arr_host); From 42e00208e8d3113a39176ace7b1e9ae8db2f0933 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Fri, 20 Dec 2024 01:58:17 -0800 Subject: [PATCH 16/18] minor fix --- .../dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index b2ebe9f5bdb..b6151ff180a 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -143,7 +143,7 @@ class newton_cg_test : public te::float_algo_fixture { solution_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::engine eng(4014 + n_); + primitives::host_engine eng(4014 + n_); uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); From 85d9f02321a94c6b893506532dd945e46185cec0 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Fri, 20 Dec 2024 03:38:26 -0800 Subject: [PATCH 17/18] fix --- .../dal/backend/primitives/objective_function/test/fixture.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index 6cf2b73ccd6..31870cb645f 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -577,7 +577,7 @@ class logloss_test : public te::float_algo_fixture(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); From 7393a2648d59f62b7e6454388349fbe2edb26bc8 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Fri, 20 Dec 2024 06:49:37 -0800 Subject: [PATCH 18/18] minor fix --- cpp/daal/include/daal_win.h | 6 +-- 
.../backend/primitives/rng/test/rng_dpc.cpp | 41 ++----------------- 2 files changed, 7 insertions(+), 40 deletions(-) diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h index a15ed7db26e..6e86076e275 100755 --- a/cpp/daal/include/daal_win.h +++ b/cpp/daal/include/daal_win.h @@ -309,13 +309,13 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" -#include "algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mrg32k3a/mrg32k3a.h" #include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" #include "algorithms/engines/philox4x32x10/philox4x32x10.h" diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 2a079f15466..38c09902046 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -107,6 +107,7 @@ using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { SKIP_IF(this->get_policy().is_cpu()); + SKIP_IF(this->not_float64_friendly()); using Float = std::tuple_element_t<0, TestType>; std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); @@ -131,6 +132,7 @@ using rng_types_skip_ahead_support = COMBINE_TYPES((float, double), TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { 
SKIP_IF(this->get_policy().is_cpu()); + SKIP_IF(this->not_float64_friendly()); using Float = std::tuple_element_t<0, TestType>; std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); @@ -162,6 +164,7 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahe TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) { SKIP_IF(this->get_policy().is_cpu()); + SKIP_IF(this->not_float64_friendly()); using Float = std::tuple_element_t<0, TestType>; std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); @@ -196,42 +199,6 @@ TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahe this->check_results(arr_gpu, arr_host); } -//TODO: add engine collection test + host_engine tests -// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { -// SKIP_IF(this->get_policy().is_cpu()); -// std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); -// std::int64_t seed = GENERATE_COPY(1, 777, 999); - -// engine_collection collection(this->get_queue(), 2, seed); - -// auto engine_arr = collection.get_dpc_engines(); - -// auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count); - -// auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); -// auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); - -// auto rn_gen = this->get_rng(); - -// rn_gen.uniform(this->get_queue(), -// elem_count, -// arr_device_init_1_ptr, -// engine_arr[0], -// 0, -// elem_count); - -// rn_gen.uniform(this->get_queue(), -// elem_count, -// arr_device_init_2_ptr, -// engine_arr[1], -// 0, -// elem_count); - -// // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, engine_arr[0], 0, elem_count); -// // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count); - -// //this->check_results_device(arr_device_init_1, arr_device_init_2); -// this->check_results(arr_device_init_1, arr_device_init_2); -// } 
+//TODO: add engine collection test + separate host_engine tests } // namespace oneapi::dal::backend::primitives::test