diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt
index ebaf1d7b0ef..dadfb9d476e 100644
--- a/dpnp/CMakeLists.txt
+++ b/dpnp/CMakeLists.txt
@@ -56,6 +56,7 @@ endfunction()
 build_dpnp_cython_ext_with_backend(dparray ${CMAKE_CURRENT_SOURCE_DIR}/dparray.pyx dpnp)
 add_subdirectory(backend)
+add_subdirectory(backend/extensions/blas)
 add_subdirectory(backend/extensions/lapack)
 add_subdirectory(backend/extensions/vm)
 add_subdirectory(backend/extensions/sycl_ext)
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
new file mode 100644
index 00000000000..d19f60c9792
--- /dev/null
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -0,0 +1,83 @@
+# *****************************************************************************
+# Copyright (c) 2016-2023, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+set(python_module_name _blas_impl)
+set(_module_src
+    ${CMAKE_CURRENT_SOURCE_DIR}/blas_py.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gemm.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gemm_batch.cpp
+)
+
+pybind11_add_module(${python_module_name} MODULE ${_module_src})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+
+if (WIN32)
+    if (${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting the option
+        # after the -link option, causing the linker to ignore it.
+        set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel")
+    endif()
+endif()
+
+set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
+
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
+
+if (WIN32)
+    target_compile_options(${python_module_name} PRIVATE
+        /clang:-fno-approx-func
+        /clang:-fno-finite-math-only
+    )
+else()
+    target_compile_options(${python_module_name} PRIVATE
+        -fno-approx-func
+        -fno-finite-math-only
+    )
+endif()
+
+target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
+if (UNIX)
+    # this option is supported on Linux only
+    target_link_options(${python_module_name} PUBLIC -fsycl-link-huge-device-code)
+endif()
+
+if (DPNP_GENERATE_COVERAGE)
+    target_link_options(${python_module_name} PRIVATE -fprofile-instr-generate -fcoverage-mapping)
+endif()
+
+if (MKL_VERSION_2024)
+    target_link_libraries(${python_module_name} PUBLIC MKL::MKL_SYCL::BLAS)
+else()
+    target_link_libraries(${python_module_name} PUBLIC MKL::MKL_DPCPP)
+endif()
+
+install(TARGETS ${python_module_name}
+    DESTINATION "dpnp/backend/extensions/blas"
+)
diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
new file mode 100644
index 00000000000..524f16fcc7d
--- /dev/null
+++ b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -0,0 +1,66 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+// This file defines functions of the dpnp.backend._blas_impl extension
+//
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "gemm.hpp"
+
+namespace blas_ext = dpnp::backend::ext::blas;
+namespace py = pybind11;
+
+// populate dispatch tables
+void init_dispatch_tables(void)
+{
+    blas_ext::init_gemm_batch_dispatch_table();
+    blas_ext::init_gemm_dispatch_table();
+}
+
+PYBIND11_MODULE(_blas_impl, m)
+{
+    init_dispatch_tables();
+
+    {
+        m.def("_gemm", &blas_ext::gemm,
+              "Call `gemm` from the oneMKL BLAS library to return "
+              "the matrix-matrix product with 2-D matrices.",
+              py::arg("sycl_queue"), py::arg("matrixA"), py::arg("matrixB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
+    {
+        m.def("_gemm_batch", &blas_ext::gemm_batch,
+              "Call `gemm_batch` from the oneMKL BLAS library to return "
+              "the matrix-matrix product for a batch of 2-D matrices.",
+              py::arg("sycl_queue"), py::arg("matrixA"), py::arg("matrixB"),
+              py::arg("result"), py::arg("batch_size"), py::arg("stridea"),
+              py::arg("strideb"), py::arg("stridec"),
+              py::arg("depends") = py::list());
+    }
+}
diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp
new file mode 100644
index 00000000000..5526ecd3c1b
--- /dev/null
+++ b/dpnp/backend/extensions/blas/gemm.cpp
@@ -0,0 +1,263 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "gemm.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*gemm_impl_fn_ptr_t)(sycl::queue,
+                                          oneapi::mkl::transpose,
+                                          oneapi::mkl::transpose,
+                                          const std::int64_t,
+                                          const std::int64_t,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          const std::vector<sycl::event> &);
+
+static gemm_impl_fn_ptr_t gemm_dispatch_table[dpctl_td_ns::num_types]
+                                             [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event gemm_impl(sycl::queue exec_q,
+                             oneapi::mkl::transpose transA,
+                             oneapi::mkl::transpose transB,
+                             const std::int64_t m,
+                             const std::int64_t n,
+                             const std::int64_t k,
+                             char *matrixA,
+                             const std::int64_t lda,
+                             char *matrixB,
+                             const std::int64_t ldb,
+                             char *resultC,
+                             const std::int64_t ldc,
+                             const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(matrixA);
+    Tab *b = reinterpret_cast<Tab *>(matrixB);
+    Tc *res = reinterpret_cast<Tc *>(resultC);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event gemm_event;
+    try {
+        gemm_event = mkl_blas::row_major::gemm(
+            exec_q,
+            transA, // Defines the transpose operation for matrix A:
+                    // 'N' indicates no transpose, 'T' for transpose,
+                    // or 'C' for a conjugate transpose.
+            transB, // Same as transA, but for matrix B.
+            m,      // Number of rows in matrices A and C.
+            n,      // Number of columns in matrices B and C.
+            k,      // Number of columns in matrix A and rows in matrix B.
+            Tab(1), // Scaling factor for the product of matrices A and B.
+            a,      // Pointer to matrix A.
+            lda,    // Leading dimension of matrix A, which is the
+                    // stride between successive rows (for row-major
+                    // layout).
+            b,      // Pointer to matrix B.
+            ldb,    // Leading dimension of matrix B, similar to lda.
+            Tab(0), // Scaling factor for matrix C.
+            res,    // Pointer to matrix C, where the result is stored.
+            ldc,    // Leading dimension of matrix C.
+            depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during gemm() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during gemm() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurred
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return gemm_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    gemm(sycl::queue exec_q,
+         dpctl::tensor::usm_ndarray matrixA,
+         dpctl::tensor::usm_ndarray matrixB,
+         dpctl::tensor::usm_ndarray resultC,
+         const std::vector<sycl::event> &depends)
+{
+    const int matrixA_nd = matrixA.get_ndim();
+    const int matrixB_nd = matrixB.get_ndim();
+    const int resultC_nd = resultC.get_ndim();
+
+    if ((matrixA_nd != 2) || (matrixB_nd != 2) || (resultC_nd != 2)) {
+        throw py::value_error("The input matrices must be of 2 dimensions.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(matrixA, resultC)) {
+        throw py::value_error("Input array 1 and output array are overlapping "
+                              "segments of memory");
+    }
+    if (overlap(matrixB, resultC)) {
+        throw py::value_error("Input array 2 and output array are overlapping "
+                              "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    bool is_matrixA_f_contig = matrixA.is_f_contiguous();
+    bool is_matrixB_f_contig = matrixB.is_f_contiguous();
+    bool is_matrixA_c_contig = matrixA.is_c_contiguous();
+    bool is_matrixB_c_contig = matrixB.is_c_contiguous();
+
+    if (!is_matrixA_f_contig and !is_matrixA_c_contig) {
+        throw py::value_error(
+            "Input array 1 is neither c-contiguous nor f-contiguous.");
+    }
+    if (!is_matrixB_f_contig and !is_matrixB_c_contig) {
+        throw py::value_error(
+            "Input array 2 is neither c-contiguous nor f-contiguous.");
+    }
+
+    const py::ssize_t *a_shape = matrixA.get_shape_raw();
+    const py::ssize_t *b_shape = matrixB.get_shape_raw();
+    const py::ssize_t *res_shape = resultC.get_shape_raw();
+
+    if (a_shape[1] != b_shape[0]) {
+        throw py::value_error("The number of columns in A must be equal to "
+                              "the number of rows in B.");
+    }
+
+    oneapi::mkl::transpose transA = is_matrixA_f_contig
+                                        ? oneapi::mkl::transpose::T
+                                        : oneapi::mkl::transpose::N;
+    oneapi::mkl::transpose transB = is_matrixB_f_contig
+                                        ? oneapi::mkl::transpose::T
+                                        : oneapi::mkl::transpose::N;
+
+    const std::int64_t m = a_shape[0];
+    const std::int64_t n = b_shape[1];
+    const std::int64_t k = a_shape[1];
+
+    const std::int64_t lda =
+        (transA == oneapi::mkl::transpose::N) ? a_shape[1] : a_shape[0];
+    const std::int64_t ldb =
+        (transB == oneapi::mkl::transpose::N) ? b_shape[1] : b_shape[0];
+    const std::int64_t ldc = res_shape[1];
+
+    int matrixA_typenum = matrixA.get_typenum();
+    int matrixB_typenum = matrixB.get_typenum();
+    int resultC_typenum = resultC.get_typenum();
+
+    if (matrixA_typenum != matrixB_typenum) {
+        throw py::value_error("matrixA and matrixB must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int matrixAB_type_id = array_types.typenum_to_lookup_id(matrixA_typenum);
+    int resultC_type_id = array_types.typenum_to_lookup_id(resultC_typenum);
+
+    gemm_impl_fn_ptr_t gemm_fn =
+        gemm_dispatch_table[matrixAB_type_id][resultC_type_id];
+    if (gemm_fn == nullptr) {
+        throw py::value_error(
+            "Types of input matrices and result matrix are mismatched.");
+    }
+
+    char *a_typeless_ptr = matrixA.get_data();
+    char *b_typeless_ptr = matrixB.get_data();
+    char *r_typeless_ptr = resultC.get_data();
+
+    sycl::event gemm_ev =
+        gemm_fn(exec_q, transA, transB, m, n, k, a_typeless_ptr, lda,
+                b_typeless_ptr, ldb, r_typeless_ptr, ldc, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {matrixA, matrixB, resultC}, {gemm_ev});
+
+    return std::make_pair(args_ev, gemm_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct GemmContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GemmTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return gemm_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_gemm_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<gemm_impl_fn_ptr_t, GemmContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(gemm_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp
new file mode 100644
index 00000000000..25f78b5b850
--- /dev/null
+++ b/dpnp/backend/extensions/blas/gemm.hpp
@@ -0,0 +1,64 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+extern std::pair<sycl::event, sycl::event>
+    gemm(sycl::queue exec_q,
+         dpctl::tensor::usm_ndarray matrixA,
+         dpctl::tensor::usm_ndarray matrixB,
+         dpctl::tensor::usm_ndarray resultC,
+         const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    gemm_batch(sycl::queue exec_q,
+               dpctl::tensor::usm_ndarray matrixA,
+               dpctl::tensor::usm_ndarray matrixB,
+               dpctl::tensor::usm_ndarray resultC,
+               const std::int64_t batch_size,
+               size_t stridea,
+               size_t strideb,
+               size_t stridec,
+               const std::vector<sycl::event> &depends);
+
+extern void init_gemm_dispatch_table(void);
+extern void init_gemm_batch_dispatch_table(void);
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/gemm_batch.cpp b/dpnp/backend/extensions/blas/gemm_batch.cpp
new file mode 100644
index 00000000000..32f592f6b8a
--- /dev/null
+++ b/dpnp/backend/extensions/blas/gemm_batch.cpp
@@ -0,0 +1,253 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "gemm.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*gemm_batch_impl_fn_ptr_t)(
+    sycl::queue,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    size_t,
+    size_t,
+    size_t,
+    oneapi::mkl::transpose,
+    oneapi::mkl::transpose,
+    char *,
+    char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+static gemm_batch_impl_fn_ptr_t
+    gemm_batch_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event gemm_batch_impl(sycl::queue exec_q,
+                                   const std::int64_t m,
+                                   const std::int64_t n,
+                                   const std::int64_t k,
+                                   const std::int64_t batch_size,
+                                   const std::int64_t lda,
+                                   const std::int64_t ldb,
+                                   const std::int64_t ld_result,
+                                   size_t stridea,
+                                   size_t strideb,
+                                   size_t stridec,
+                                   oneapi::mkl::transpose transA,
+                                   oneapi::mkl::transpose transB,
+                                   char *matrixA,
+                                   char *matrixB,
+                                   char *resultC,
+                                   const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(matrixA);
+    Tab *b = reinterpret_cast<Tab *>(matrixB);
+    Tc *res = reinterpret_cast<Tc *>(resultC);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event gemm_batch_event;
+    try {
+        gemm_batch_event = mkl_blas::row_major::gemm_batch(
+            exec_q,
+            transA, // Defines the transpose operation for matrix A:
+                    // 'N' indicates no transpose, 'T' for transpose,
+                    // or 'C' for a conjugate transpose.
+            transB, // Same as transA, but for matrix B.
+            m,      // Number of rows in matrices A and C.
+            n,      // Number of columns in matrices B and C.
+            k,      // Number of columns in matrix A and rows in matrix B.
+            Tab(1), // Scaling factor for the product of matrices A and B.
+            a,      // Pointer to matrix A.
+            lda,    // Leading dimension of matrix A, which is the
+                    // stride between successive rows (for row-major
+                    // layout).
+            stridea, // Stride between different A matrices.
+            b,       // Pointer to matrix B.
+            ldb,     // Leading dimension of matrix B, similar to lda.
+            strideb, // Stride between different B matrices.
+            Tab(0),  // Scaling factor for matrix C.
+            res,     // Pointer to matrix C, where the result is stored.
+            ld_result, // Leading dimension of matrix C.
+            stridec,   // Stride between different C matrices.
+            batch_size, // Specifies the number of matrix multiply operations
+                        // to perform.
+            depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg << "Unexpected MKL exception caught during gemm_batch() "
+                     "call:\nreason: "
+                  << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg
+            << "Unexpected SYCL exception caught during gemm_batch() call:\n"
+            << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurred
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return gemm_batch_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    gemm_batch(sycl::queue exec_q,
+               dpctl::tensor::usm_ndarray matrixA,
+               dpctl::tensor::usm_ndarray matrixB,
+               dpctl::tensor::usm_ndarray resultC,
+               const std::int64_t batch_size,
+               size_t stridea,
+               size_t strideb,
+               size_t stridec,
+               const std::vector<sycl::event> &depends = {})
+{
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(matrixA, resultC)) {
+        throw py::value_error("Input array 1 and output array are overlapping "
+                              "segments of memory");
+    }
+    if (overlap(matrixB, resultC)) {
+        throw py::value_error("Input array 2 and output array are overlapping "
+                              "segments of memory");
+    }
+
+    const int matrixA_nd = matrixA.get_ndim();
+    const int matrixB_nd = matrixB.get_ndim();
+    const py::ssize_t *a_shape = matrixA.get_shape_raw();
+    const py::ssize_t *b_shape = matrixB.get_shape_raw();
+
+    if (a_shape[matrixA_nd - 1] != b_shape[matrixB_nd - 2]) {
+        throw py::value_error("The number of columns in A must be equal to "
+                              "the number of rows in B.");
+    }
+
+    const std::int64_t m = a_shape[matrixA_nd - 2];
+    const std::int64_t n = b_shape[matrixB_nd - 1];
+    const std::int64_t k = a_shape[matrixA_nd - 1];
+
+    // the input arrays are made C-contiguous beforehand,
+    // so neither A nor B ever needs to be transposed here
+    oneapi::mkl::transpose transA = oneapi::mkl::transpose::N;
+    oneapi::mkl::transpose transB = oneapi::mkl::transpose::N;
+
+    int matrixA_typenum = matrixA.get_typenum();
+    int matrixB_typenum = matrixB.get_typenum();
+    int resultC_typenum = resultC.get_typenum();
+
+    if (matrixA_typenum != matrixB_typenum) {
+        throw py::value_error("matrixA and matrixB must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int matrixAB_type_id = array_types.typenum_to_lookup_id(matrixA_typenum);
+    int resultC_type_id = array_types.typenum_to_lookup_id(resultC_typenum);
+
+    gemm_batch_impl_fn_ptr_t gemm_batch_fn =
+        gemm_batch_dispatch_table[matrixAB_type_id][resultC_type_id];
+    if (gemm_batch_fn == nullptr) {
+        throw py::value_error(
+            "Types of input matrices and result matrix are mismatched.");
+    }
+
+    char *a_typeless_ptr = matrixA.get_data();
+    char *b_typeless_ptr = matrixB.get_data();
+    char *r_typeless_ptr = resultC.get_data();
+
+    // Note that lda = k, ldb = n, and ld_result = n
+    sycl::event gemm_batch_ev = gemm_batch_fn(
+        exec_q, m, n, k, batch_size, k, n, n, stridea, strideb, stridec, transA,
+        transB, a_typeless_ptr, b_typeless_ptr, r_typeless_ptr, depends);
+
+    sycl::event args_batch_ev = dpctl::utils::keep_args_alive(
+        exec_q, {matrixA, matrixB, resultC}, {gemm_batch_ev});
+
+    return std::make_pair(args_batch_ev, gemm_batch_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct GemmBatchContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GemmBatchTypePairSupportFactory<Tab,
+                                                             Tc>::is_defined) {
+            return gemm_batch_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_gemm_batch_dispatch_table(void)
+{ + dpctl_td_ns::DispatchTableBuilder + contig; + contig.populate_dispatch_table(gemm_batch_dispatch_table); +} +} // namespace blas +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp new file mode 100644 index 00000000000..49154df03c4 --- /dev/null +++ b/dpnp/backend/extensions/blas/types_matrix.hpp @@ -0,0 +1,109 @@ +//***************************************************************************** +// Copyright (c) 2023, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + +// dpctl namespace for operations with types +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace blas +{ +namespace types +{ +/** + * @brief A factory to define pairs of supported types for which + * MKL BLAS library provides support in oneapi::mkl::blas::gemm + * function. + * + * @tparam Tab Type of arrays containing input matrices A and B. + * @tparam Tc Type of array containing output matrix C. + */ +template +struct GemmTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + +/** + * @brief A factory to define pairs of supported types for which + * MKL BLAS library provides support in + * oneapi::mkl::blas::gemm_batch function. + * + * @tparam Tab Type of arrays containing input matrices A and B. + * @tparam Tc Type of array containing output matrix C. 
+ */ +template +struct GemmBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; +} // namespace types +} // namespace blas +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp index 58e3fc10634..3e3f010fa63 100644 --- a/dpnp/backend/include/dpnp_iface_fptr.hpp +++ b/dpnp/backend/include/dpnp_iface_fptr.hpp @@ -190,8 +190,6 @@ enum class DPNPFuncName : size_t DPNP_FN_LOG2, /**< Used in numpy.log2() impl */ DPNP_FN_LOG1P, /**< Used in numpy.log1p() impl */ DPNP_FN_MATMUL, /**< Used in numpy.matmul() impl */ - DPNP_FN_MATMUL_EXT, /**< Used in numpy.matmul() impl, requires extra - parameters */ DPNP_FN_MATRIX_RANK, /**< Used in numpy.linalg.matrix_rank() impl */ DPNP_FN_MATRIX_RANK_EXT, /**< Used in numpy.linalg.matrix_rank() impl, requires extra parameters */ diff --git a/dpnp/backend/kernels/dpnp_krnl_common.cpp b/dpnp/backend/kernels/dpnp_krnl_common.cpp index d575f8bdb96..d8b1038ea74 100644 --- a/dpnp/backend/kernels/dpnp_krnl_common.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_common.cpp @@ -946,26 +946,6 @@ void (*dpnp_matmul_default_c)(void *, const shape_elem_type *) = dpnp_matmul_c<_DataType>; -template -DPCTLSyclEventRef (*dpnp_matmul_ext_c)(DPCTLSyclQueueRef, - void *, - const size_t, - const size_t, - const shape_elem_type *, - const shape_elem_type *, - const void *, - const size_t, - const size_t, - const shape_elem_type *, - const shape_elem_type *, - const void *, - const size_t, - const size_t, - const shape_elem_type *, - const shape_elem_type *, - const DPCTLEventVectorRef) = - dpnp_matmul_c<_DataType>; - void func_map_init_linalg(func_map_t &fmap) { fmap[DPNPFuncName::DPNP_FN_ASTYPE][eft_BLN][eft_BLN] = { @@ -1190,14 +1170,5 @@ void func_map_init_linalg(func_map_t &fmap) fmap[DPNPFuncName::DPNP_FN_MATMUL][eft_DBL][eft_DBL] = { eft_DBL, (void *)dpnp_matmul_default_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_INT][eft_INT] = { - eft_INT, (void *)dpnp_matmul_ext_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_LNG][eft_LNG] = { - eft_LNG, (void *)dpnp_matmul_ext_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_FLT][eft_FLT] = { - eft_FLT, (void *)dpnp_matmul_ext_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_DBL][eft_DBL] = { - eft_DBL, (void *)dpnp_matmul_ext_c}; - return; } diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd index cd3694bc004..40a350d53f5 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pxd +++ b/dpnp/dpnp_algo/dpnp_algo.pxd @@ -86,8 +86,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName": # need this na DPNP_FN_INV_EXT DPNP_FN_KRON DPNP_FN_KRON_EXT - DPNP_FN_MATMUL - DPNP_FN_MATMUL_EXT DPNP_FN_MATRIX_RANK DPNP_FN_MATRIX_RANK_EXT DPNP_FN_MAXIMUM @@ -300,8 +298,6 @@ cpdef dpnp_descriptor dpnp_isclose(dpnp_descriptor input1, dpnp_descriptor input Linear algebra """ cpdef dpnp_descriptor dpnp_dot(dpnp_descriptor in_array1, dpnp_descriptor in_array2) -cpdef dpnp_descriptor dpnp_matmul(dpnp_descriptor in_array1, dpnp_descriptor in_array2, dpnp_descriptor out=*) - """ Array creation routines 
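The removed DPNP_FN_MATMUL function-pointer path above is superseded by the new pybind11 extension. Roughly, the low-level call sequence from Python now looks like this (an illustrative sketch, not part of the patch; it assumes a float32-capable device and uses the `_gemm` binding defined in blas_py.cpp):

    import dpnp
    import dpnp.backend.extensions.blas._blas_impl as bi

    x1 = dpnp.ones((4, 3), dtype="float32")
    x2 = dpnp.ones((3, 5), dtype="float32")
    res = dpnp.empty((4, 5), dtype="float32", sycl_queue=x1.sycl_queue)

    # `_gemm` returns a pair of events: a keep-arguments-alive host task
    # and the event of the gemm computation itself
    ht_ev, gemm_ev = bi._gemm(
        x1.sycl_queue,
        dpnp.get_usm_ndarray(x1),
        dpnp.get_usm_ndarray(x2),
        dpnp.get_usm_ndarray(res),
        [],
    )
    ht_ev.wait()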
diff --git a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi index 6614399f182..0691a543bab 100644 --- a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi +++ b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi @@ -39,7 +39,6 @@ __all__ += [ "dpnp_dot", "dpnp_inner", "dpnp_kron", - "dpnp_matmul", ] @@ -56,14 +55,6 @@ ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_dot_t)(c_dpctl.DPCTLSyclQueueR void * , const size_t, const size_t, const shape_elem_type *, const shape_elem_type * , const c_dpctl.DPCTLEventVectorRef) except + -ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_matmul_t)(c_dpctl.DPCTLSyclQueueRef, - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - const c_dpctl.DPCTLEventVectorRef) cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, utils.dpnp_descriptor in_array2, @@ -288,92 +279,3 @@ cpdef utils.dpnp_descriptor dpnp_kron(dpnp_descriptor in_array1, dpnp_descriptor c_dpctl.DPCTLEvent_Delete(event_ref) return result - - -cpdef utils.dpnp_descriptor dpnp_matmul(utils.dpnp_descriptor in_array1, utils.dpnp_descriptor in_array2, utils.dpnp_descriptor out=None): - - cdef shape_type_c shape_result - - cdef shape_type_c shape1 = in_array1.shape - cdef shape_type_c shape2 = in_array2.shape - - cdef size_t size_m = 0 - cdef size_t size_n = 0 - cdef size_t size_k = 0 - - # Calling this function on an empty container causes undefined behavior. - if not shape1.empty(): - size_m = shape1.front() - if not shape2.empty(): - size_n = shape2.back() - if not shape1.empty(): - size_k = shape1.back() - - cdef size_t ndim_max = max(in_array1.ndim, in_array2.ndim) - - if in_array1.ndim < ndim_max or ndim_max == 1: - """ - shape1(2,), shape2(2,4) - test: pytest tests/test_matmul.py::test_matmul[shape_pair4-types0] -v -s - or - shape1(2,), shape2(2,) - test: pytest tests/test_matmul.py::test_matmul[shape_pair8-types0] -v -s - """ - size_m = 1 - - if in_array2.ndim < ndim_max or ndim_max == 1: - """ - shape1(5,2), shape2(2,) - test: pytest tests/test_matmul.py::test_matmul[shape_pair6-types0] -v -s - or - shape1(3,), shape2(3,) - test: pytest tests/test_matmul.py::test_matmul[shape_pair8-types0] -v -s - """ - size_n = 1 - - shape_result = shape1[:-1] + shape2[-1:] - - # convert string type names (array.dtype) to C enum DPNPFuncType - cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(in_array1.dtype) - cdef DPNPFuncType param2_type = dpnp_dtype_to_DPNPFuncType(in_array2.dtype) - - # get the FPTR data structure - cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_MATMUL_EXT, param1_type, param2_type) - - # create result array with type given by FPTR data - result_sycl_device, result_usm_type, result_sycl_queue = utils.get_common_usm_allocation(in_array1, in_array2) - cdef utils.dpnp_descriptor result = utils.create_output_descriptor(shape_result, - kernel_data.return_type, - out, - device=result_sycl_device, - usm_type=result_usm_type, - sycl_queue=result_sycl_queue) - if result.size == 0: - return result - - cdef c_dpctl.SyclQueue q = result_sycl_queue - cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref() - - cdef fptr_2in_1out_matmul_t func = kernel_data.ptr - # call FPTR function - cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref, - result.get_data(), - result.size, - result.ndim, - NULL, # result_shape 
- NULL, # result_strides - in_array1.get_data(), - in_array1.size, - in_array1.ndim, - shape1.data(), - NULL, # in_array1_strides - in_array2.get_data(), - in_array2.size, - in_array2.ndim, - shape2.data(), - NULL, # in_array2_strides - NULL) # dep_event_vec_ref - with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref) - c_dpctl.DPCTLEvent_Delete(event_ref) - - return result diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py index 30b6134da17..d60863dd104 100644 --- a/dpnp/dpnp_iface_linearalgebra.py +++ b/dpnp/dpnp_iface_linearalgebra.py @@ -1,5 +1,3 @@ -# cython: language_level=3 -# distutils: language = c++ # -*- coding: utf-8 -*- # ***************************************************************************** # Copyright (c) 2016-2023, Intel Corporation @@ -46,6 +44,7 @@ import dpnp from dpnp.dpnp_algo import * from dpnp.dpnp_utils import * +from dpnp.dpnp_utils.dpnp_utils_linearalgebra import dpnp_matmul __all__ = [ "dot", @@ -246,7 +245,17 @@ def kron(x1, x2): return call_origin(numpy.kron, x1, x2) -def matmul(x1, x2, out=None, **kwargs): +def matmul( + x1, + x2, + /, + out=None, + *, + casting="same_kind", + order="K", + dtype=None, + subok=True, +): """ Matrix product of two arrays. @@ -254,9 +263,9 @@ def matmul(x1, x2, out=None, **kwargs): Limitations ----------- - Input arrays are supported as :obj:`dpnp.ndarray`. - Otherwise the function will be executed sequentially on CPU. - Parameter `out` is supported as :obj:`dpnp.ndarray` and as default value ``None``. + Input arrays and parameter `out` are supported as either :class:`dpnp.ndarray` + or :class:`dpctl.tensor.usm_ndarray`. + Keyword argument `subok` is currently unsupported. Input array data types are limited by supported DPNP :ref:`Data types`. See Also @@ -269,63 +278,65 @@ def matmul(x1, x2, out=None, **kwargs): Examples -------- + For 2-D arrays it is the matrix product: + >>> import dpnp as np - >>> a = np.ones([9, 5, 7, 4]) - >>> c = np.ones([9, 5, 4, 3]) - >>> np.matmul(a, c).shape - (9, 5, 7, 3) >>> a = np.array([[1, 0], [0, 1]]) >>> b = np.array([[4, 1], [2, 2]]) >>> np.matmul(a, b) array([[4, 1], [2, 2]]) - """ + For 2-D mixed with 1-D, the result is the usual. 
- x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False) - x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_nondefault_queue=False) - if x1_desc and x2_desc and not kwargs: - if x1_desc.ndim != 2 or x2_desc.ndim != 2: - pass - elif not x1_desc.ndim: - pass - elif not x2_desc.ndim: - pass - elif not x1_desc.size: - pass - elif not x2_desc.size: - pass - else: - if 0: - """ - Cost model checks - """ - - array1_size = x1_desc.size - array2_size = x2_desc.size - cost_size = 4096 # 2D array shape(64, 64) - - if (x1_desc.dtype == dpnp.float64) or ( - x1_desc.dtype == dpnp.float32 - ): - """ - Floating point types are handled via original math library better than SYCL math library - """ - cost_size = 262144 # 2D array shape(512, 512) - - if (array1_size > cost_size) and (array2_size > cost_size): - return dpnp_matmul(x1_desc, x2_desc, out) - else: - out_desc = ( - dpnp.get_dpnp_descriptor( - out, copy_when_nondefault_queue=False - ) - if out is not None - else None - ) - return dpnp_matmul(x1_desc, x2_desc, out_desc).get_pyobj() + >>> a = np.array([[1, 0], [0, 1]]) + >>> b = np.array([1, 2]) + >>> np.matmul(a, b) + array([1, 2]) + >>> np.matmul(b, a) + array([1, 2]) + + Broadcasting is conventional for stacks of arrays - return call_origin(numpy.matmul, x1, x2, out=out, **kwargs) + >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4)) + >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2)) + >>> np.matmul(a,b).shape + (2, 2, 2) + >>> np.matmul(a, b)[0, 1, 1] + array(98) + >>> np.sum(a[0, 1, :] * b[0 , :, 1]) + array(98) + + Vector, vector returns the scalar inner product, but neither argument is complex-conjugated: + + >>> x1 = np.array([2j, 3j]) + >>> x2 = np.array([2j, 3j]) + >>> np.matmul(x1, x2) + array(-13+0j) + + The ``@`` operator can be used as a shorthand for ``matmul`` on + :class:`dpnp.ndarray`. + + >>> x1 @ x2 + array(-13+0j) + + """ + + dpnp.check_supported_arrays_type(x1) + dpnp.check_supported_arrays_type(x2) + if subok is False: + raise NotImplementedError( + "subok keyword argument is only supported by its default value." + ) + else: + return dpnp_matmul( + x1, + x2, + out=out, + casting=casting, + order=order, + dtype=dtype, + ) def outer(x1, x2, out=None): diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9efb2aa04f1..dd2fa8f8c2d 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1380,7 +1380,9 @@ def result_type(*arrays_and_dtypes): """ usm_arrays_and_dtypes = [ - X.dtype if isinstance(X, (dpnp_array, dpt.usm_ndarray)) else X + dpnp.get_usm_ndarray(X) + if isinstance(X, (dpnp_array, dpt.usm_ndarray)) + else X for X in arrays_and_dtypes ] return dpt.result_type(*usm_arrays_and_dtypes) diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py new file mode 100644 index 00000000000..d0add55eee3 --- /dev/null +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -0,0 +1,363 @@ +# ***************************************************************************** +# Copyright (c) 2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor._tensor_impl as ti
+import numpy
+
+import dpnp
+import dpnp.backend.extensions.blas._blas_impl as bi
+from dpnp.dpnp_utils import get_usm_allocations
+
+__all__ = ["dpnp_matmul"]
+
+
+def _gemm_res_dtype(*arrays, dtype, casting, sycl_queue):
+    """
+    Determine the output array data type and the intermediate data type.
+
+    If `dtype` is ``None``, the output array data type is determined based on
+    the Promotion Type Rule and device capabilities. Otherwise, `dtype` is
+    used as the output array dtype if the input arrays can be cast to it
+    according to the given casting rule. If casting cannot be done, a
+    ``TypeError`` is raised.
+    The intermediate data type is the data type used for performing the
+    matmul calculations. If the output array dtype is a floating-point data
+    type, it is also used as the intermediate data type. If the output array
+    dtype is an integral data type, the default floating-point data type of
+    the device where the input arrays are allocated is used as the
+    intermediate data type.
+
+    Parameters
+    ----------
+    arrays : {dpnp.ndarray, usm_ndarray}
+        Input arrays.
+    dtype : dtype
+        If not ``None``, data type of the output array.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+        Controls what kind of data casting may occur.
+    sycl_queue : {SyclQueue}
+        A SYCL queue to use for determining the default floating-point data type.
+
+    Returns
+    -------
+    gemm_dtype, res_dtype :
+        `gemm_dtype` is the data type used in performing the matmul calculations.
+        The input arrays of the matmul function are cast to `gemm_dtype` and
+        then the calculations are performed.
+        `res_dtype` is the output data type. When the result is obtained, it is
+        cast to `res_dtype`.
+
+    """
+
+    res_dtype = dpnp.result_type(*arrays)
+    default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue)
+
+    if dtype is not None:
+        if dpnp.can_cast(res_dtype, dtype, casting=casting):
+            res_dtype = dtype
+        else:
+            raise TypeError(
+                f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
+            )
+
+    gemm_dtype = (
+        res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype
+    )
+
+    return gemm_dtype, res_dtype
+
+
+def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
+    # If an input array is F-contiguous, change its order to C-contiguous:
+    # mkl::gemm_batch requires each 2-D slice to be contiguous, but the
+    # 2-D slices of an F-contiguous N-D array are not contiguous in memory.
+    ht_tasks_list = []
+    x1 = _get_gemm_contig_array(x1, dev_tasks_list, ht_tasks_list)
+    x2 = _get_gemm_contig_array(x2, dev_tasks_list, ht_tasks_list)
+
+    x1_strides = x1.strides
+    x2_strides = x2.strides
+    res_strides = res.strides
+
+    # When the shape along a particular dimension is 1, the stride along
+    # that dimension is not a meaningful number and is undefined. Here, we
+    # standardize the strides before continuing, setting the stride to 0
+    # if the shape along that axis is <= 1.
+    if x1_is_2D:
+        x1_strides = tuple(
+            str_i if sh_i > 1 else 0
+            for sh_i, str_i in zip(x1.shape, x1_strides)
+        )
+    if x2_is_2D:
+        x2_strides = tuple(
+            str_i if sh_i > 1 else 0
+            for sh_i, str_i in zip(x2.shape, x2_strides)
+        )
+
+    batch_size = res.shape[:-2][0]
+    stridea = x1_strides[0]
+    strideb = x2_strides[0]
+    stridec = res_strides[-3]
+
+    if x1.ndim > 3:
+        iter = ti._contract_iter2(
+            res.shape[:-2], x1_strides[:-2], x2_strides[:-2]
+        )
+
+        if len(iter[0]) != 1:
+            raise ValueError("Input arrays cannot be used in gemm_batch")
+        batch_size = iter[0][0]
+        stridea = iter[1][0]
+        strideb = iter[3][0]
+
+    ht_blas_ev, _ = bi._gemm_batch(
+        exec_q,
+        dpnp.get_usm_ndarray(x1),
+        dpnp.get_usm_ndarray(x2),
+        dpnp.get_usm_ndarray(res),
+        batch_size,
+        stridea,
+        strideb,
+        stridec,
+        dev_tasks_list,
+    )
+
+    return ht_blas_ev, ht_tasks_list, res
+
+
+def _get_gemm_contig_array(x, dep_events, host_events, dtype=None):
+    """
+    Create a copy of the input array if needed.
+
+    This function has two use cases. In the first, more general use case,
+    if the input array is neither c-contiguous nor f-contiguous, we ensure
+    it becomes c-contiguous. Additionally, if the input array has an
+    integral dtype, we convert it to an appropriate floating-point data
+    type specified by `dtype`. The second use case is for N-dimensional
+    arrays with N > 2, where we need to ensure c-contiguity. This is
+    crucial because the `gemm_batch` implementation in dpnp only works for
+    C-contiguous arrays. This use case matters when the input array is
+    f-contiguous with a floating-point dtype, since such an array is not
+    modified by the first use case.
+
+    """
+
+    if dtype is None:
+        copy = not x.flags.c_contiguous
+    else:
+        copy = (
+            not (x.flags.c_contiguous or x.flags.f_contiguous)
+            or x.dtype != dtype
+        )
+
+    if copy:
+        x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=dpnp.get_usm_ndarray(x),
+            dst=x_copy.get_array(),
+            sycl_queue=x.sycl_queue,
+        )
+        dep_events.append(copy_ev)
+        host_events.append(ht_copy_ev)
+        return x_copy
+    return x
+
+
+def dpnp_matmul(
+    x1,
+    x2,
+    /,
+    out=None,
+    *,
+    casting="same_kind",
+    order="K",
+    dtype=None,
+):
+    """
+    dpnp_matmul(x1, x2, out=None, casting="same_kind", order="K", dtype=None)
+
+    Return the matrix product of two arrays.
+
+    The main calculation is done by calling an extension function
+    for the BLAS library of oneMKL. Depending on the dimensions of the `x1`
+    and `x2` arrays, it will be either ``gemm`` (for one- and two-dimensional
+    arrays) or ``gemm_batch`` (for others).
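+
+    For instance, a rough sketch of the dispatch (shapes only; assumes the
+    default real floating-point type is supported on the device):
+
+    >>> import dpnp as np
+    >>> np.matmul(np.ones((2, 4)), np.ones((4, 3))).shape  # routed to gemm
+    (2, 3)
+    >>> np.matmul(np.ones((5, 2, 4)), np.ones((5, 4, 3))).shape  # gemm_batch
+    (5, 2, 3)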
+ + """ + + x1_ndim = x1.ndim + x2_ndim = x2.ndim + + if x1_ndim == 0: + raise ValueError( + "input array 0 does not have enough dimensions (has 0, but requires at least 1)" + ) + if x2_ndim == 0: + raise ValueError( + "input array 1 does not have enough dimensions (has 0, but requires at least 1)" + ) + + res_usm_type, exec_q = get_usm_allocations([x1, x2]) + + squeeze_flag = x1_ndim == 1 or x2_ndim == 1 + if x1_ndim == 1: + x1 = x1[dpnp.newaxis, :] + x1_ndim = x1.ndim + + if x2_ndim == 1: + x2 = x2[:, dpnp.newaxis] + x2_ndim = x2.ndim + + x1_shape = x1.shape + x2_shape = x2.shape + if x1_shape[-1] != x2_shape[-2]: + raise ValueError( + "Input arrays have a mismatch in their core dimensions. " + "The core dimensions should follow this signature: (n?,k),(k,m?)->(n?,m?) " + f"(size {x1_shape[-1]} is different from {x2_shape[-2]})" + ) + + # Determine the appropriate data types + gemm_dtype, res_dtype = _gemm_res_dtype( + x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q + ) + + x1_is_2D = x1_ndim == 2 or numpy.prod(x1_shape[:-2]) == 1 # inherently 2D + x2_is_2D = x2_ndim == 2 or numpy.prod(x2_shape[:-2]) == 1 + + # find the result shape + if x1_is_2D and x2_is_2D: + x1 = x1.reshape(x1.shape[-2], x1.shape[-1]) + x2 = x2.reshape(x2.shape[-2], x2.shape[-1]) + res_shape = (x1.shape[-2], x2.shape[-1]) + else: + # makes the dimension of input the same by adding new axis + if x1_ndim != x2_ndim: + diff = abs(x1_ndim - x2_ndim) + if x1_ndim < x2_ndim: + x1 = x1.reshape((1,) * diff + x1.shape) + x1_ndim = x1.ndim + x1_shape = x1.shape + else: + x2 = x2.reshape((1,) * diff + x2.shape) + x2_ndim = x2.ndim + x2_shape = x2.shape + + # examining the option to align inputs + # when their shapes differ but they are 1-D in some dimensions. + tmp_shape = list(x1_shape[:-2]) + for i in range(x1_ndim - 2): + if x1_shape[i] != x2_shape[i]: + if x1_shape[i] == 1: + tmp_shape[i] = x2_shape[i] + # If the `x1` array is inherently 2D, there's no need to + # duplicate the data for the 1-D dimension; + # GEMM handles it automatically. + if not x1_is_2D: + x1 = dpnp.repeat(x1, x2_shape[i], axis=i) + elif x2_shape[i] == 1: + tmp_shape[i] = x1_shape[i] + if not x2_is_2D: + x2 = dpnp.repeat(x2, x1_shape[i], axis=i) + else: + raise ValueError( + "arrays could not be broadcast together with remapped shapes." 
+                    )
+        x1_shape = x1.shape
+        x2_shape = x2.shape
+        res_shape = tuple(tmp_shape) + (x1_shape[-2], x2_shape[-1])
+
+    # calculate results
+    result = dpnp.empty(
+        res_shape,
+        dtype=gemm_dtype,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+    if result.size == 0:
+        pass
+    elif x1.size == 0 or x2.size == 0:
+        result.fill(0)
+    else:
+        # input arrays should have the proper data type
+        # and be C_CONTIGUOUS or F_CONTIGUOUS
+        dep_events_list = []
+        host_tasks_list = []
+        x1 = _get_gemm_contig_array(
+            x1, dep_events_list, host_tasks_list, gemm_dtype
+        )
+        x2 = _get_gemm_contig_array(
+            x2, dep_events_list, host_tasks_list, gemm_dtype
+        )
+
+        if x1_is_2D and x2_is_2D:
+            ht_blas_ev, _ = bi._gemm(
+                exec_q,
+                dpnp.get_usm_ndarray(x1),
+                dpnp.get_usm_ndarray(x2),
+                dpnp.get_usm_ndarray(result),
+                dep_events_list,
+            )
+        else:
+            (
+                ht_blas_ev,
+                ht_copy_ev,
+                result,
+            ) = _gemm_batch_matmul(
+                exec_q,
+                x1,
+                x2,
+                result,
+                x1_is_2D,
+                x2_is_2D,
+                dep_events_list,
+            )
+            host_tasks_list += ht_copy_ev
+
+        host_tasks_list.append(ht_blas_ev)
+        dpctl.SyclEvent.wait_for(host_tasks_list)
+
+    if squeeze_flag:
+        result = dpnp.squeeze(result)
+
+    if x1_is_2D and x2_is_2D:
+        # add new axes only if one of the input arrays
+        # was inherently 2-D
+        new_size = max(x1_ndim, x2_ndim)
+        for _ in range(new_size - 2):
+            result = result[dpnp.newaxis, :]
+
+    if gemm_dtype != res_dtype:
+        result = dpnp.astype(result, res_dtype, copy=False)
+    if out is None:
+        # If `order` was not passed as the default,
+        # we need to update the result to match the requested `order`.
+        if order not in ["k", "K"]:
+            return dpnp.array(result, copy=False, order=order)
+        else:
+            return result
+    else:
+        return dpnp.get_result_array(result, out, casting=casting)
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index 7ed82953541..2d15283a5bc 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -1,6 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2023, Intel Corporation
 # All rights reserved.
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index b7a56c4c7db..e07a57beebe 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -1,6 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2016-2023, Intel Corporation
 # All rights reserved.
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index bd44b974253..c6131541b6e 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -1,6 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2023, Intel Corporation
 # All rights reserved.
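Taken together, the Python-level changes above expose the new keywords end to end; a hedged usage sketch (integer inputs are computed through the device default floating-point type, and results depend on device fp64 support):

    import dpnp as np

    a = np.arange(6).reshape(2, 3)
    b = np.arange(12).reshape(3, 4)

    # `dtype` and `casting` are resolved before the BLAS call
    c = np.matmul(a, b, dtype=np.float32)

    # stacked inputs are routed to gemm_batch
    d = np.matmul(np.ones((5, 2, 3)), np.ones((5, 3, 4)))

    # a non-default `order` re-lays out the result at the end
    e = np.matmul(a, b, order="F")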
diff --git a/tests/test_manipulation.py b/tests/test_manipulation.py
index 576229b1d56..bb5533b0e62 100644
--- a/tests/test_manipulation.py
+++ b/tests/test_manipulation.py
@@ -8,6 +8,7 @@
     get_all_dtypes,
     get_complex_dtypes,
     get_float_dtypes,
+    has_support_aspect64,
 )
 
 testdata = []
@@ -71,11 +72,17 @@ def test_repeat(arr):
     assert_array_equal(expected, result)
 
 
+# TODO: Temporarily skipping the test, until Internal CI is updated with
+# recent changes in dpctl regarding the dpt.result_type function
+@pytest.mark.skip("Temporarily skipping the test")
 def test_result_type():
-    X = [dpnp.ones((2), dtype=dpnp.int64), dpnp.int32, "float16"]
-    X_np = [numpy.ones((2), dtype=numpy.int64), numpy.int32, "float16"]
+    X = [dpnp.ones((2), dtype=dpnp.int64), dpnp.int32, "float32"]
+    X_np = [numpy.ones((2), dtype=numpy.int64), numpy.int32, "float32"]
 
-    assert dpnp.result_type(*X) == numpy.result_type(*X_np)
+    if has_support_aspect64():
+        assert dpnp.result_type(*X) == numpy.result_type(*X_np)
+    else:
+        assert dpnp.result_type(*X) == dpnp.default_float_type(X[0].device)
 
 
 def test_result_type_only_dtypes():
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index c521da5e0cb..6c95093a01b 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -1,5 +1,6 @@
 from itertools import permutations
 
+import dpctl
 import dpctl.tensor as dpt
 import numpy
 import pytest
@@ -2389,3 +2390,367 @@ def test_inplace_remainder(dtype):
     dp_a %= 4
 
     assert_allclose(dp_a, np_a)
+
+
+@pytest.mark.parametrize(
+    "dtype", get_all_dtypes(no_bool=True, no_none=True, no_complex=True)
+)
+def test_inplace_floor_divide(dtype):
+    size = 21
+    np_a = numpy.arange(size, dtype=dtype)
+    dp_a = dpnp.arange(size, dtype=dtype)
+
+    np_a //= 4
+    dp_a //= 4
+
+    assert_allclose(dp_a, np_a)
+
+
+class TestMatmul:
+    @pytest.mark.parametrize(
+        "order_pair", [("C", "C"), ("C", "F"), ("F", "C"), ("F", "F")]
+    )
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((4,), (4,)),
+            ((4,), (4, 2)),
+            ((2, 4), (4,)),
+            ((2, 4), (4, 3)),
+            ((1, 2, 3), (1, 3, 5)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((1, 2, 3), (4, 3, 5)),
+            ((2, 3), (4, 3, 5)),
+            ((4, 2, 3), (1, 3, 5)),
+            ((4, 2, 3), (3, 5)),
+            ((1, 1, 4, 3), (1, 1, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+            ((6, 7, 4, 3), (1, 1, 3, 5)),
+            ((6, 7, 4, 3), (1, 3, 5)),
+            ((6, 7, 4, 3), (3, 5)),
+            ((6, 7, 4, 3), (1, 7, 3, 5)),
+            ((6, 7, 4, 3), (7, 3, 5)),
+            ((6, 7, 4, 3), (6, 1, 3, 5)),
+            ((1, 1, 4, 3), (6, 7, 3, 5)),
+            ((1, 4, 3), (6, 7, 3, 5)),
+            ((4, 3), (6, 7, 3, 5)),
+            ((6, 1, 4, 3), (6, 7, 3, 5)),
+            ((1, 7, 4, 3), (6, 7, 3, 5)),
+            ((7, 4, 3), (6, 7, 3, 5)),
+            ((1, 5, 3, 2), (6, 5, 2, 4)),
+            ((5, 3, 2), (6, 5, 2, 4)),
+            ((1, 3, 3), (10, 1, 3, 1)),
+        ],
+    )
+    def test_matmul(self, order_pair, shape_pair):
+        order1, order2 = order_pair
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+        a1 = numpy.array(a1, order=order1)
+        a2 = numpy.array(a2, order=order2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "order_pair", [("C", "C"), ("C", "F"), ("F", "C"), ("F", "F")]
+    )
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 0), (0, 3)),
+            ((0, 4), (4, 3)),
+            ((2, 4), (4, 0)),
+            ((1, 2, 3), (0, 3, 5)),
+            ((0, 2, 3), (1, 3, 5)),
+            ((2, 3), (0, 3, 5)),
+            ((0, 2, 3), (3, 5)),
+            ((0, 0, 4, 3), (1, 1, 3, 5)),
+            ((6, 0, 4, 3), (1, 3, 5)),
+            ((0, 7, 4, 3), (3, 5)),
+            ((0, 7, 4, 3), (1, 7, 3, 5)),
+            ((0, 7, 4, 3), (7, 3, 5)),
+            ((6, 0, 4, 3), (6, 1, 3, 5)),
+            ((1, 1, 4, 3), (0, 0, 3, 5)),
+            ((1, 4, 3), (6, 0, 3, 5)),
+            ((4, 3), (0, 0, 3, 5)),
+            ((6, 1, 4, 3), (6, 0, 3, 5)),
+            ((1, 7, 4, 3), (0, 7, 3, 5)),
+            ((7, 4, 3), (0, 7, 3, 5)),
+        ],
+    )
+    def test_matmul_empty(self, order_pair, shape_pair):
+        order1, order2 = order_pair
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+        a1 = numpy.array(a1, order=order1)
+        a2 = numpy.array(a2, order=order2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_bool(self, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.resize(
+            numpy.arange(2, dtype=numpy.bool_), numpy.prod(shape1)
+        ).reshape(shape1)
+        a2 = numpy.resize(
+            numpy.arange(2, dtype=numpy.bool_), numpy.prod(shape2)
+        ).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_dtype(self, dtype, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2, dtype=dtype)
+        expected = numpy.matmul(a1, a2, dtype=dtype)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype1", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "dtype2", get_all_dtypes(no_bool=True, no_none=True)
+    )
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2), dtype=dtype1).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        if dpnp.can_cast(dpnp.result_type(b1, b2), dtype2, casting="same_kind"):
+            result = dpnp.matmul(b1, b2, dtype=dtype2)
+            expected = numpy.matmul(a1, a2, dtype=dtype2)
+            assert_dtype_allclose(result, expected)
+        else:
+            with pytest.raises(TypeError):
+                dpnp.matmul(b1, b2, dtype=dtype2)
+
+    # TODO: Temporarily skipping the test, until Internal CI is updated with
+    # recent changes in dpctl regarding the dpt.result_type function
+    @pytest.mark.skip("Temporarily skipping the test")
+    @pytest.mark.parametrize("dtype1", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype2", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_dtype_matrix_inout(self, dtype1, dtype2, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2), dtype=dtype2).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("order", ["C", "F", "K", "A"])
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_order(self, order, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2, order=order)
+        expected = numpy.matmul(a1, a2, order=order)
+        assert result.flags.c_contiguous == expected.flags.c_contiguous
+        assert result.flags.f_contiguous == expected.flags.f_contiguous
+        assert_dtype_allclose(result, expected)
+
+    def test_matmul_strided(self):
+        for dim in [1, 2, 3, 4]:
+            A = numpy.random.rand(*([20] * dim))
+            B = dpnp.asarray(A)
+            # positive strides
+            slices = tuple(slice(None, None, 2) for _ in range(dim))
+            a = A[slices]
+            b = B[slices]
+
+            result = dpnp.matmul(b, b)
+            expected = numpy.matmul(a, a)
+            assert_dtype_allclose(result, expected)
+
+            # negative strides
+            slices = tuple(slice(None, None, -2) for _ in range(dim))
+            a = A[slices]
+            b = B[slices]
+
+            result = dpnp.matmul(b, b)
+            expected = numpy.matmul(a, a)
+            assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_matmul_out(self, dtype):
+        a1 = numpy.arange(5 * 4, dtype=dtype).reshape(5, 4)
+        a2 = numpy.arange(7 * 4, dtype=dtype).reshape(4, 7)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        dpnp_out = dpnp.empty((5, 7), dtype=dtype)
+        result = dpnp.matmul(b1, b2, out=dpnp_out)
+        expected = numpy.matmul(a1, a2)
+        assert result is dpnp_out
+        assert_dtype_allclose(result, expected)
+
+
+class TestMatmulInvalidCases:
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((3, 2), ()),
+            ((), (3, 2)),
+            ((), ()),
+        ],
+    )
+    def test_zero_dim(self, shape_pair):
+        for xp in (numpy, dpnp):
+            shape1, shape2 = shape_pair
+            x1 = xp.arange(numpy.prod(shape1), dtype=xp.float32).reshape(shape1)
+            x2 = xp.arange(numpy.prod(shape2), dtype=xp.float32).reshape(shape2)
+            with pytest.raises(ValueError):
+                xp.matmul(x1, x2)
+
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((5, 3, 1), (3, 1, 4)),
+            ((3, 2, 3), (3, 2, 4)),
+            ((3, 2), (1,)),
+            ((1, 2), (3, 1)),
+            ((4, 3, 2), (6, 5, 2, 4)),
+            ((6, 5, 3, 2), (3, 2, 4)),
+        ],
+    )
+    def test_invalid_shape(self, shape_pair):
+        for xp in (numpy, dpnp):
+            shape1, shape2 = shape_pair
+            x1 = xp.arange(numpy.prod(shape1), dtype=xp.float32).reshape(shape1)
+            x2 = xp.arange(numpy.prod(shape2), dtype=xp.float32).reshape(shape2)
+            with pytest.raises(ValueError):
+                xp.matmul(x1, x2)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True)[:-2])
+    def test_invalid_dtype(self, dtype):
+        dpnp_dtype = get_all_dtypes(no_none=True)[-1]
+        a1 = dpnp.arange(5 * 4, dtype=dpnp_dtype).reshape(5, 4)
+        a2 = dpnp.arange(7 * 4, dtype=dpnp_dtype).reshape(4, 7)
+        dp_out = dpnp.empty((5, 7), dtype=dtype)
+
+        with pytest.raises(TypeError):
+            dpnp.matmul(a1, a2, out=dp_out)
+
+    def test_exe_q(self):
+        x1 = dpnp.ones((5, 4), sycl_queue=dpctl.SyclQueue())
+        x2 = dpnp.ones((4, 7), sycl_queue=dpctl.SyclQueue())
+
+        with pytest.raises(ValueError):
+            dpnp.matmul(x1, x2)
+
+    # TODO: Temporarily skipping the test, until Internal CI is updated with
+    # recent changes in dpctl regarding the dpt.result_type function
+    @pytest.mark.skip("Temporarily skipping the test")
+    def test_matmul_casting(self):
+        a1 = dpnp.arange(2 * 4, dtype=dpnp.float32).reshape(2, 4)
+        a2 = dpnp.arange(4 * 3).reshape(4, 3)
+
+        res = dpnp.empty((2, 3), dtype=dpnp.int64)
+        with pytest.raises(TypeError):
+            dpnp.matmul(a1, a2, out=res, casting="safe")
+
+    def test_matmul_subok(self):
+        a1 = dpnp.arange(2 * 4).reshape(2, 4)
+        a2 = dpnp.arange(4 * 3).reshape(4, 3)
+
+        with pytest.raises(NotImplementedError):
+            dpnp.matmul(a1, a2, subok=False)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index de762ecef93..fb94f1cd4cb 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -666,6 +666,41 @@ def test_2in_1out_diff_queue_but_equal_context(func, device):
         getattr(dpnp, func)(x1, x2)
 
 
+@pytest.mark.parametrize(
+    "device",
+    valid_devices,
+    ids=[device.filter_string for device in valid_devices],
+)
+@pytest.mark.parametrize(
+    "shape_pair",
+    [
+        ((2, 4), (4, 3)),
+        ((4, 2, 3), (4, 3, 5)),
+        ((6, 7, 4, 3), (6, 7, 3, 5)),
+    ],
+    ids=[
+        "((2, 4), (4, 3))",
+        "((4, 2, 3), (4, 3, 5))",
+        "((6, 7, 4, 3), (6, 7, 3, 5))",
+    ],
+)
+def test_matmul(device, shape_pair):
+    shape1, shape2 = shape_pair
+    a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+    a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+    b1 = dpnp.asarray(a1, device=device)
+    b2 = dpnp.asarray(a2, device=device)
+
+    result = dpnp.matmul(b1, b2)
+    expected = numpy.matmul(a1, a2)
+    assert_allclose(expected, result)
+
+    result_queue = result.sycl_queue
+    assert_sycl_queue_equal(result_queue, b1.sycl_queue)
+    assert_sycl_queue_equal(result_queue, b2.sycl_queue)
+
+
 @pytest.mark.parametrize(
     "func, kwargs",
     [
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 6b46dcceb1e..0b094920c01 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -352,6 +352,39 @@ def test_coerced_usm_types_bitwise_op(op, usm_type_x, usm_type_y):
     assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
+@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize(
+    "shape_pair",
+    [
+        ((2, 4), (4, 3)),
+        ((2, 0), (0, 3)),
+        ((2, 4), (4, 0)),
+        ((4, 2, 3), (4, 3, 5)),
+        ((6, 7, 4, 3), (6, 7, 3, 5)),
+    ],
+    ids=[
+        "((2, 4), (4, 3))",
+        "((2, 0), (0, 3))",
+        "((2, 4), (4, 0))",
+        "((4, 2, 3), (4, 3, 5))",
+        "((6, 7, 4, 3), (6, 7, 3, 5))",
+    ],
+)
+def test_matmul(usm_type_x, usm_type_y, shape_pair):
+    shape1, shape2 = shape_pair
+    x = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+    y = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+    x = dp.array(x, usm_type=usm_type_x)
+    y = dp.array(y, usm_type=usm_type_y)
+    z = dp.matmul(x, y)
+
+    assert x.usm_type == usm_type_x
+    assert y.usm_type == usm_type_y
+    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+
+
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
 @pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types)
 def test_meshgrid(usm_type_x, usm_type_y):
diff --git a/tests/third_party/cupy/math_tests/test_matmul.py b/tests/third_party/cupy/math_tests/test_matmul.py
index d0f3555373a..d21ec7a2d68 100644
--- a/tests/third_party/cupy/math_tests/test_matmul.py
+++ b/tests/third_party/cupy/math_tests/test_matmul.py
@@ -25,40 +25,38 @@
             ((0,), (0,)),
             # matmul test
             ((5, 3, 2), (5, 2, 4)),
-            # ((0, 3, 2), (0, 2, 4)),
-            # ((5, 3, 2), (2, 4)),
-            # ((0, 3, 2), (2, 4)),
-            # ((3, 2), (5, 2, 4)),
-            # ((3, 2), (0, 2, 4)),
-            # ((5, 3, 2), (1, 2, 4)),
-            # ((0, 3, 2), (1, 2, 4)),
-            # ((1, 3, 2), (5, 2, 4)),
-            # ((1, 3, 2), (0, 2, 4)),
-            # ((5, 3, 2), (2,)),
-            # ((5, 3, 0), (0,)),
-            # ((2,), (5, 2, 4)),
-            # ((0,), (5, 0, 4)),
-            # ((2, 2, 3, 2), (2, 2, 2, 4)),
-            # ((5, 0, 3, 2), (5, 0, 2, 4)),
-            # ((6, 5, 3, 2), (2, 4)),
-            # ((5, 0, 3, 2), (2, 4)),
-            # ((3, 2), (6, 5, 2, 4)),
-            # ((3, 2), (5, 0, 2, 4)),
-            # ((1, 5, 3, 2), (6, 1, 2, 4)),
-            # ((1, 0, 3, 2), (6, 1, 2, 4)),
-            # ((6, 1, 3, 2), (1, 5, 2, 4)),
-            # ((6, 1, 3, 2), (1, 0, 2, 4)),
-            # ((6, 5, 3, 2), (2,)),
-            # ((6, 5, 3, 0), (0,)),
-            # ((2,), (6, 5, 2, 4)),
-            # ((0,), (6, 5, 0, 4)),
+            ((0, 3, 2), (0, 2, 4)),
+            ((5, 3, 2), (2, 4)),
+            ((0, 3, 2), (2, 4)),
+            ((3, 2), (5, 2, 4)),
+            ((3, 2), (0, 2, 4)),
+            ((5, 3, 2), (1, 2, 4)),
+            ((0, 3, 2), (1, 2, 4)),
+            ((1, 3, 2), (5, 2, 4)),
+            ((1, 3, 2), (0, 2, 4)),
+            ((5, 3, 2), (2,)),
+            ((5, 3, 0), (0,)),
+            ((2,), (5, 2, 4)),
+            ((0,), (5, 0, 4)),
+            ((2, 2, 3, 2), (2, 2, 2, 4)),
+            ((5, 0, 3, 2), (5, 0, 2, 4)),
+            ((6, 5, 3, 2), (2, 4)),
+            ((5, 0, 3, 2), (2, 4)),
+            ((3, 2), (6, 5, 2, 4)),
+            ((3, 2), (5, 0, 2, 4)),
+            ((1, 5, 3, 2), (6, 1, 2, 4)),
+            ((1, 0, 3, 2), (6, 1, 2, 4)),
+            ((6, 1, 3, 2), (1, 5, 2, 4)),
+            ((6, 1, 3, 2), (1, 0, 2, 4)),
+            ((6, 5, 3, 2), (2,)),
+            ((6, 5, 3, 0), (0,)),
+            ((2,), (6, 5, 2, 4)),
+            ((0,), (6, 5, 0, 4)),
             ((1, 3, 3), (10, 1, 3, 1)),
         ],
     }
 )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestMatmul(unittest.TestCase):
     @testing.for_all_dtypes(name="dtype1")
     @testing.numpy_cupy_allclose(rtol=1e-3, atol=1e-3)  # required for uint8
@@ -94,8 +92,6 @@ def test_cupy_matmul(self, xp, dtype1):
     }
 )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestMatmulLarge(unittest.TestCase):
     # Avoid overflow
     skip_dtypes = {
@@ -151,8 +147,6 @@ def test_cupy_matmul(self, xp, dtype1):
     }
 )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestMatmulInvalidShape(unittest.TestCase):
     def test_invalid_shape(self):
         for xp in (numpy, dpnp):
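As a closing illustration of what the re-enabled test shapes exercise end to end, here is a small example, assuming a dpnp build that includes the new BLAS extension and a SYCL device to run on (this snippet is illustrative and not part of the patch):

import numpy
import dpnp

# Batched matmul with broadcasting across the batch dimensions.
a = dpnp.arange(6 * 1 * 4 * 3, dtype=dpnp.float32).reshape(6, 1, 4, 3)
b = dpnp.arange(7 * 3 * 5, dtype=dpnp.float32).reshape(7, 3, 5)

c = dpnp.matmul(a, b)
assert c.shape == (6, 7, 4, 5)  # batch dims (6, 1) and (7,) broadcast to (6, 7)

# The device result agrees with NumPy computed on the host.
ref = numpy.matmul(dpnp.asnumpy(a), dpnp.asnumpy(b))
assert numpy.allclose(dpnp.asnumpy(c), ref)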