diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt
index ebaf1d7b0ef..dadfb9d476e 100644
--- a/dpnp/CMakeLists.txt
+++ b/dpnp/CMakeLists.txt
@@ -56,6 +56,7 @@ endfunction()
 build_dpnp_cython_ext_with_backend(dparray ${CMAKE_CURRENT_SOURCE_DIR}/dparray.pyx dpnp)
 add_subdirectory(backend)
+add_subdirectory(backend/extensions/blas)
 add_subdirectory(backend/extensions/lapack)
 add_subdirectory(backend/extensions/vm)
 add_subdirectory(backend/extensions/sycl_ext)
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
new file mode 100644
index 00000000000..d19f60c9792
--- /dev/null
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -0,0 +1,83 @@
+# *****************************************************************************
+# Copyright (c) 2016-2023, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+set(python_module_name _blas_impl)
+set(_module_src
+    ${CMAKE_CURRENT_SOURCE_DIR}/blas_py.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gemm.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gemm_batch.cpp
+)
+
+pybind11_add_module(${python_module_name} MODULE ${_module_src})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+
+if (WIN32)
+    if (${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting the option
+        # after the -link option, causing the linker to ignore it.
+        set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel")
+    endif()
+endif()
+
+set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
+
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
+
+if (WIN32)
+    target_compile_options(${python_module_name} PRIVATE
+        /clang:-fno-approx-func
+        /clang:-fno-finite-math-only
+    )
+else()
+    target_compile_options(${python_module_name} PRIVATE
+        -fno-approx-func
+        -fno-finite-math-only
+    )
+endif()
+
+target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
+if (UNIX)
+    # this option is supported on Linux only
+    target_link_options(${python_module_name} PUBLIC -fsycl-link-huge-device-code)
+endif()
+
+if (DPNP_GENERATE_COVERAGE)
+    target_link_options(${python_module_name} PRIVATE -fprofile-instr-generate -fcoverage-mapping)
+endif()
+
+if (MKL_VERSION_2024)
+    target_link_libraries(${python_module_name} PUBLIC MKL::MKL_SYCL::BLAS)
+else()
+    target_link_libraries(${python_module_name} PUBLIC MKL::MKL_DPCPP)
+endif()
+
+install(TARGETS ${python_module_name}
+    DESTINATION "dpnp/backend/extensions/blas"
+)
diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
new file mode 100644
index 00000000000..524f16fcc7d
--- /dev/null
+++ b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -0,0 +1,66 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+// This file defines functions of the dpnp.backend._blas_impl extension
+//
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "gemm.hpp"
+
+namespace blas_ext = dpnp::backend::ext::blas;
+namespace py = pybind11;
+
+// populate dispatch tables
+void init_dispatch_tables(void)
+{
+    blas_ext::init_gemm_batch_dispatch_table();
+    blas_ext::init_gemm_dispatch_table();
+}
+
+PYBIND11_MODULE(_blas_impl, m)
+{
+    init_dispatch_tables();
+
+    {
+        m.def("_gemm", &blas_ext::gemm,
+              "Call `gemm` from the oneMKL BLAS library to return "
+              "the matrix-matrix product with 2-D matrices.",
+              py::arg("sycl_queue"), py::arg("matrixA"), py::arg("matrixB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
+    {
+        m.def("_gemm_batch", &blas_ext::gemm_batch,
+              "Call `gemm_batch` from the oneMKL BLAS library to return "
+              "the matrix-matrix product for a batch of 2-D matrices.",
+              py::arg("sycl_queue"), py::arg("matrixA"), py::arg("matrixB"),
+              py::arg("result"), py::arg("batch_size"), py::arg("stridea"),
+              py::arg("strideb"), py::arg("stridec"),
+              py::arg("depends") = py::list());
+    }
+}
diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp
new file mode 100644
index 00000000000..5526ecd3c1b
--- /dev/null
+++ b/dpnp/backend/extensions/blas/gemm.cpp
@@ -0,0 +1,263 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "gemm.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*gemm_impl_fn_ptr_t)(sycl::queue,
+                                          oneapi::mkl::transpose,
+                                          oneapi::mkl::transpose,
+                                          const std::int64_t,
+                                          const std::int64_t,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          const std::vector<sycl::event> &);
+
+static gemm_impl_fn_ptr_t gemm_dispatch_table[dpctl_td_ns::num_types]
+                                             [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event gemm_impl(sycl::queue exec_q,
+                             oneapi::mkl::transpose transA,
+                             oneapi::mkl::transpose transB,
+                             const std::int64_t m,
+                             const std::int64_t n,
+                             const std::int64_t k,
+                             char *matrixA,
+                             const std::int64_t lda,
+                             char *matrixB,
+                             const std::int64_t ldb,
+                             char *resultC,
+                             const std::int64_t ldc,
+                             const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(matrixA);
+    Tab *b = reinterpret_cast<Tab *>(matrixB);
+    Tc *res = reinterpret_cast<Tc *>(resultC);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event gemm_event;
+    try {
+        gemm_event = mkl_blas::row_major::gemm(
+            exec_q,
+            transA, // Defines the transpose operation for matrix A:
+                    // 'N' indicates no transpose, 'T' for transpose,
+                    // or 'C' for a conjugate transpose.
+            transB, // Same as transA, but for matrix B.
+            m,      // Number of rows in matrices A and C.
+            n,      // Number of columns in matrices B and C.
+            k,      // Number of columns in matrix A and rows in matrix B.
+            Tab(1), // Scaling factor for the product of matrices A and B.
+            a,      // Pointer to matrix A.
+            lda,    // Leading dimension of matrix A, which is the
+                    // stride between successive rows (for row-major
+                    // layout).
+            b,      // Pointer to matrix B.
+            ldb,    // Leading dimension of matrix B, similar to lda.
+            Tab(0), // Scaling factor for matrix C.
+            res,    // Pointer to matrix C, where the result is stored.
+            ldc,    // Leading dimension of matrix C.
+            depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during gemm() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during gemm() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurred
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return gemm_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    gemm(sycl::queue exec_q,
+         dpctl::tensor::usm_ndarray matrixA,
+         dpctl::tensor::usm_ndarray matrixB,
+         dpctl::tensor::usm_ndarray resultC,
+         const std::vector<sycl::event> &depends)
+{
+    const int matrixA_nd = matrixA.get_ndim();
+    const int matrixB_nd = matrixB.get_ndim();
+    const int resultC_nd = resultC.get_ndim();
+
+    if ((matrixA_nd != 2) || (matrixB_nd != 2) || (resultC_nd != 2)) {
+        throw py::value_error("The input matrices must be of 2 dimensions.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(matrixA, resultC)) {
+        throw py::value_error("Input array 1 and output array are overlapping "
+                              "segments of memory");
+    }
+    if (overlap(matrixB, resultC)) {
+        throw py::value_error("Input array 2 and output array are overlapping "
+                              "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    bool is_matrixA_f_contig = matrixA.is_f_contiguous();
+    bool is_matrixB_f_contig = matrixB.is_f_contiguous();
+    bool is_matrixA_c_contig = matrixA.is_c_contiguous();
+    bool is_matrixB_c_contig = matrixB.is_c_contiguous();
+
+    if (!is_matrixA_f_contig and !is_matrixA_c_contig) {
+        throw py::value_error(
+            "Input array 1 is neither c-contiguous nor f-contiguous.");
+    }
+    if (!is_matrixB_f_contig and !is_matrixB_c_contig) {
+        throw py::value_error(
+            "Input array 2 is neither c-contiguous nor f-contiguous.");
+    }
+
+    const py::ssize_t *a_shape = matrixA.get_shape_raw();
+    const py::ssize_t *b_shape = matrixB.get_shape_raw();
+    const py::ssize_t *res_shape = resultC.get_shape_raw();
+
+    if (a_shape[1] != b_shape[0]) {
+        throw py::value_error("The number of columns in A must be equal to "
+                              "the number of rows in B.");
+    }
+
+    oneapi::mkl::transpose transA = is_matrixA_f_contig
+                                        ? oneapi::mkl::transpose::T
+                                        : oneapi::mkl::transpose::N;
+    oneapi::mkl::transpose transB = is_matrixB_f_contig
+                                        ? oneapi::mkl::transpose::T
+                                        : oneapi::mkl::transpose::N;
+
+    const std::int64_t m = a_shape[0];
+    const std::int64_t n = b_shape[1];
+    const std::int64_t k = a_shape[1];
+
+    const std::int64_t lda =
+        (transA == oneapi::mkl::transpose::N) ? a_shape[1] : a_shape[0];
+    const std::int64_t ldb =
+        (transB == oneapi::mkl::transpose::N) ? b_shape[1] : b_shape[0];
+    const std::int64_t ldc = res_shape[1];
+
+    int matrixA_typenum = matrixA.get_typenum();
+    int matrixB_typenum = matrixB.get_typenum();
+    int resultC_typenum = resultC.get_typenum();
+
+    if (matrixA_typenum != matrixB_typenum) {
+        throw py::value_error("matrixA and matrixB must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int matrixAB_type_id = array_types.typenum_to_lookup_id(matrixA_typenum);
+    int resultC_type_id = array_types.typenum_to_lookup_id(resultC_typenum);
+
+    gemm_impl_fn_ptr_t gemm_fn =
+        gemm_dispatch_table[matrixAB_type_id][resultC_type_id];
+    if (gemm_fn == nullptr) {
+        throw py::value_error(
+            "Types of input matrices and result matrix are mismatched.");
+    }
+
+    char *a_typeless_ptr = matrixA.get_data();
+    char *b_typeless_ptr = matrixB.get_data();
+    char *r_typeless_ptr = resultC.get_data();
+
+    sycl::event gemm_ev =
+        gemm_fn(exec_q, transA, transB, m, n, k, a_typeless_ptr, lda,
+                b_typeless_ptr, ldb, r_typeless_ptr, ldc, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {matrixA, matrixB, resultC}, {gemm_ev});
+
+    return std::make_pair(args_ev, gemm_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct GemmContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GemmTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return gemm_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_gemm_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<gemm_impl_fn_ptr_t, GemmContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(gemm_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp
new file mode 100644
index 00000000000..25f78b5b850
--- /dev/null
+++ b/dpnp/backend/extensions/blas/gemm.hpp
@@ -0,0 +1,64 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+extern std::pair<sycl::event, sycl::event>
+    gemm(sycl::queue exec_q,
+         dpctl::tensor::usm_ndarray matrixA,
+         dpctl::tensor::usm_ndarray matrixB,
+         dpctl::tensor::usm_ndarray resultC,
+         const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    gemm_batch(sycl::queue exec_q,
+               dpctl::tensor::usm_ndarray matrixA,
+               dpctl::tensor::usm_ndarray matrixB,
+               dpctl::tensor::usm_ndarray resultC,
+               const std::int64_t batch_size,
+               size_t stridea,
+               size_t strideb,
+               size_t stridec,
+               const std::vector<sycl::event> &depends);
+
+extern void init_gemm_dispatch_table(void);
+extern void init_gemm_batch_dispatch_table(void);
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/gemm_batch.cpp b/dpnp/backend/extensions/blas/gemm_batch.cpp
new file mode 100644
index 00000000000..32f592f6b8a
--- /dev/null
+++ b/dpnp/backend/extensions/blas/gemm_batch.cpp
@@ -0,0 +1,253 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "gemm.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*gemm_batch_impl_fn_ptr_t)(
+    sycl::queue,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    const std::int64_t,
+    size_t,
+    size_t,
+    size_t,
+    oneapi::mkl::transpose,
+    oneapi::mkl::transpose,
+    char *,
+    char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+static gemm_batch_impl_fn_ptr_t
+    gemm_batch_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event gemm_batch_impl(sycl::queue exec_q,
+                                   const std::int64_t m,
+                                   const std::int64_t n,
+                                   const std::int64_t k,
+                                   const std::int64_t batch_size,
+                                   const std::int64_t lda,
+                                   const std::int64_t ldb,
+                                   const std::int64_t ld_result,
+                                   size_t stridea,
+                                   size_t strideb,
+                                   size_t stridec,
+                                   oneapi::mkl::transpose transA,
+                                   oneapi::mkl::transpose transB,
+                                   char *matrixA,
+                                   char *matrixB,
+                                   char *resultC,
+                                   const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(matrixA);
+    Tab *b = reinterpret_cast<Tab *>(matrixB);
+    Tc *res = reinterpret_cast<Tc *>(resultC);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event gemm_batch_event;
+    try {
+        gemm_batch_event = mkl_blas::row_major::gemm_batch(
+            exec_q,
+            transA, // Defines the transpose operation for matrix A:
+                    // 'N' indicates no transpose, 'T' for transpose,
+                    // or 'C' for a conjugate transpose.
+            transB, // Same as transA, but for matrix B.
+            m,      // Number of rows in matrices A and C.
+            n,      // Number of columns in matrices B and C.
+            k,      // Number of columns in matrix A and rows in matrix B.
+            Tab(1), // Scaling factor for the product of matrices A and B.
+            a,      // Pointer to matrix A.
+            lda,    // Leading dimension of matrix A, which is the
+                    // stride between successive rows (for row-major
+                    // layout).
+            stridea, // Stride between different A matrices.
+            b,       // Pointer to matrix B.
+            ldb,     // Leading dimension of matrix B, similar to lda.
+            strideb, // Stride between different B matrices.
+            Tab(0),  // Scaling factor for matrix C.
+            res,     // Pointer to matrix C, where the result is stored.
+            ld_result, // Leading dimension of matrix C.
+            stridec,   // Stride between different C matrices.
+            batch_size, // Specifies the number of matrix multiply operations
+                        // to perform.
+            depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg << "Unexpected MKL exception caught during gemm_batch() "
+                     "call:\nreason: "
+                  << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg
+            << "Unexpected SYCL exception caught during gemm_batch() call:\n"
+            << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurred
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return gemm_batch_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    gemm_batch(sycl::queue exec_q,
+               dpctl::tensor::usm_ndarray matrixA,
+               dpctl::tensor::usm_ndarray matrixB,
+               dpctl::tensor::usm_ndarray resultC,
+               const std::int64_t batch_size,
+               size_t stridea,
+               size_t strideb,
+               size_t stridec,
+               const std::vector<sycl::event> &depends = {})
+{
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(matrixA, resultC)) {
+        throw py::value_error("Input array 1 and output array are overlapping "
+                              "segments of memory");
+    }
+    if (overlap(matrixB, resultC)) {
+        throw py::value_error("Input array 2 and output array are overlapping "
+                              "segments of memory");
+    }
+
+    const int matrixA_nd = matrixA.get_ndim();
+    const int matrixB_nd = matrixB.get_ndim();
+    const py::ssize_t *a_shape = matrixA.get_shape_raw();
+    const py::ssize_t *b_shape = matrixB.get_shape_raw();
+
+    if (a_shape[matrixA_nd - 1] != b_shape[matrixB_nd - 2]) {
+        throw py::value_error("The number of columns in A must be equal to "
+                              "the number of rows in B.");
+    }
+
+    const std::int64_t m = a_shape[matrixA_nd - 2];
+    const std::int64_t n = b_shape[matrixB_nd - 1];
+    const std::int64_t k = a_shape[matrixA_nd - 1];
+
+    // the input arrays are made C-contiguous beforehand,
+    // so neither A nor B ever needs to be transposed here
+    oneapi::mkl::transpose transA = oneapi::mkl::transpose::N;
+    oneapi::mkl::transpose transB = oneapi::mkl::transpose::N;
+
+    int matrixA_typenum = matrixA.get_typenum();
+    int matrixB_typenum = matrixB.get_typenum();
+    int resultC_typenum = resultC.get_typenum();
+
+    if (matrixA_typenum != matrixB_typenum) {
+        throw py::value_error("matrixA and matrixB must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int matrixAB_type_id = array_types.typenum_to_lookup_id(matrixA_typenum);
+    int resultC_type_id = array_types.typenum_to_lookup_id(resultC_typenum);
+
+    gemm_batch_impl_fn_ptr_t gemm_batch_fn =
+        gemm_batch_dispatch_table[matrixAB_type_id][resultC_type_id];
+    if (gemm_batch_fn == nullptr) {
+        throw py::value_error(
+            "Types of input matrices and result matrix are mismatched.");
+    }
+
+    char *a_typeless_ptr = matrixA.get_data();
+    char *b_typeless_ptr = matrixB.get_data();
+    char *r_typeless_ptr = resultC.get_data();
+
+    // Note that lda = k, ldb = n, and ld_result = n
+    sycl::event gemm_batch_ev = gemm_batch_fn(
+        exec_q, m, n, k, batch_size, k, n, n, stridea, strideb, stridec, transA,
+        transB, a_typeless_ptr, b_typeless_ptr, r_typeless_ptr, depends);
+
+    sycl::event args_batch_ev = dpctl::utils::keep_args_alive(
+        exec_q, {matrixA, matrixB, resultC}, {gemm_batch_ev});
+
+    return std::make_pair(args_batch_ev, gemm_batch_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct GemmBatchContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GemmBatchTypePairSupportFactory<Tab,
+                                                             Tc>::is_defined) {
+            return gemm_batch_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_gemm_batch_dispatch_table(void)
+{ + dpctl_td_ns::DispatchTableBuilder + contig; + contig.populate_dispatch_table(gemm_batch_dispatch_table); +} +} // namespace blas +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp new file mode 100644 index 00000000000..49154df03c4 --- /dev/null +++ b/dpnp/backend/extensions/blas/types_matrix.hpp @@ -0,0 +1,109 @@ +//***************************************************************************** +// Copyright (c) 2023, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + +// dpctl namespace for operations with types +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace blas +{ +namespace types +{ +/** + * @brief A factory to define pairs of supported types for which + * MKL BLAS library provides support in oneapi::mkl::blas::gemm + * function. + * + * @tparam Tab Type of arrays containing input matrices A and B. + * @tparam Tc Type of array containing output matrix C. + */ +template +struct GemmTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + +/** + * @brief A factory to define pairs of supported types for which + * MKL BLAS library provides support in + * oneapi::mkl::blas::gemm_batch function. + * + * @tparam Tab Type of arrays containing input matrices A and B. + * @tparam Tc Type of array containing output matrix C. 
+ */ +template +struct GemmBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; +} // namespace types +} // namespace blas +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp index 58e3fc10634..3e3f010fa63 100644 --- a/dpnp/backend/include/dpnp_iface_fptr.hpp +++ b/dpnp/backend/include/dpnp_iface_fptr.hpp @@ -190,8 +190,6 @@ enum class DPNPFuncName : size_t DPNP_FN_LOG2, /**< Used in numpy.log2() impl */ DPNP_FN_LOG1P, /**< Used in numpy.log1p() impl */ DPNP_FN_MATMUL, /**< Used in numpy.matmul() impl */ - DPNP_FN_MATMUL_EXT, /**< Used in numpy.matmul() impl, requires extra - parameters */ DPNP_FN_MATRIX_RANK, /**< Used in numpy.linalg.matrix_rank() impl */ DPNP_FN_MATRIX_RANK_EXT, /**< Used in numpy.linalg.matrix_rank() impl, requires extra parameters */ diff --git a/dpnp/backend/kernels/dpnp_krnl_common.cpp b/dpnp/backend/kernels/dpnp_krnl_common.cpp index d575f8bdb96..d8b1038ea74 100644 --- a/dpnp/backend/kernels/dpnp_krnl_common.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_common.cpp @@ -946,26 +946,6 @@ void (*dpnp_matmul_default_c)(void *, const shape_elem_type *) = dpnp_matmul_c<_DataType>; -template -DPCTLSyclEventRef (*dpnp_matmul_ext_c)(DPCTLSyclQueueRef, - void *, - const size_t, - const size_t, - const shape_elem_type *, - const shape_elem_type *, - const void *, - const size_t, - const size_t, - const shape_elem_type *, - const shape_elem_type *, - const void *, - const size_t, - const size_t, - const shape_elem_type *, - const shape_elem_type *, - const DPCTLEventVectorRef) = - dpnp_matmul_c<_DataType>; - void func_map_init_linalg(func_map_t &fmap) { fmap[DPNPFuncName::DPNP_FN_ASTYPE][eft_BLN][eft_BLN] = { @@ -1190,14 +1170,5 @@ void func_map_init_linalg(func_map_t &fmap) fmap[DPNPFuncName::DPNP_FN_MATMUL][eft_DBL][eft_DBL] = { eft_DBL, (void *)dpnp_matmul_default_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_INT][eft_INT] = { - eft_INT, (void *)dpnp_matmul_ext_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_LNG][eft_LNG] = { - eft_LNG, (void *)dpnp_matmul_ext_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_FLT][eft_FLT] = { - eft_FLT, (void *)dpnp_matmul_ext_c}; - fmap[DPNPFuncName::DPNP_FN_MATMUL_EXT][eft_DBL][eft_DBL] = { - eft_DBL, (void *)dpnp_matmul_ext_c}; - return; } diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd index cd3694bc004..40a350d53f5 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pxd +++ b/dpnp/dpnp_algo/dpnp_algo.pxd @@ -86,8 +86,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName": # need this na DPNP_FN_INV_EXT DPNP_FN_KRON DPNP_FN_KRON_EXT - DPNP_FN_MATMUL - DPNP_FN_MATMUL_EXT DPNP_FN_MATRIX_RANK DPNP_FN_MATRIX_RANK_EXT DPNP_FN_MAXIMUM @@ -300,8 +298,6 @@ cpdef dpnp_descriptor dpnp_isclose(dpnp_descriptor input1, dpnp_descriptor input Linear algebra """ cpdef dpnp_descriptor dpnp_dot(dpnp_descriptor in_array1, dpnp_descriptor in_array2) -cpdef dpnp_descriptor dpnp_matmul(dpnp_descriptor in_array1, dpnp_descriptor in_array2, dpnp_descriptor out=*) - """ Array creation routines 
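The removed DPNP_FN_MATMUL function-pointer path above is superseded by the new pybind11 extension. Roughly, the low-level call sequence from Python now looks like this (an illustrative sketch, not part of the patch; it assumes a float32-capable device and uses the `_gemm` binding defined in blas_py.cpp):

    import dpnp
    import dpnp.backend.extensions.blas._blas_impl as bi

    x1 = dpnp.ones((4, 3), dtype="float32")
    x2 = dpnp.ones((3, 5), dtype="float32")
    res = dpnp.empty((4, 5), dtype="float32", sycl_queue=x1.sycl_queue)

    # `_gemm` returns a pair of events: a keep-arguments-alive host task
    # and the event of the gemm computation itself
    ht_ev, gemm_ev = bi._gemm(
        x1.sycl_queue,
        dpnp.get_usm_ndarray(x1),
        dpnp.get_usm_ndarray(x2),
        dpnp.get_usm_ndarray(res),
        [],
    )
    ht_ev.wait()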
diff --git a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi index 6614399f182..0691a543bab 100644 --- a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi +++ b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi @@ -39,7 +39,6 @@ __all__ += [ "dpnp_dot", "dpnp_inner", "dpnp_kron", - "dpnp_matmul", ] @@ -56,14 +55,6 @@ ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_dot_t)(c_dpctl.DPCTLSyclQueueR void * , const size_t, const size_t, const shape_elem_type *, const shape_elem_type * , const c_dpctl.DPCTLEventVectorRef) except + -ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_matmul_t)(c_dpctl.DPCTLSyclQueueRef, - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - const c_dpctl.DPCTLEventVectorRef) cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, utils.dpnp_descriptor in_array2, @@ -288,92 +279,3 @@ cpdef utils.dpnp_descriptor dpnp_kron(dpnp_descriptor in_array1, dpnp_descriptor c_dpctl.DPCTLEvent_Delete(event_ref) return result - - -cpdef utils.dpnp_descriptor dpnp_matmul(utils.dpnp_descriptor in_array1, utils.dpnp_descriptor in_array2, utils.dpnp_descriptor out=None): - - cdef shape_type_c shape_result - - cdef shape_type_c shape1 = in_array1.shape - cdef shape_type_c shape2 = in_array2.shape - - cdef size_t size_m = 0 - cdef size_t size_n = 0 - cdef size_t size_k = 0 - - # Calling this function on an empty container causes undefined behavior. - if not shape1.empty(): - size_m = shape1.front() - if not shape2.empty(): - size_n = shape2.back() - if not shape1.empty(): - size_k = shape1.back() - - cdef size_t ndim_max = max(in_array1.ndim, in_array2.ndim) - - if in_array1.ndim < ndim_max or ndim_max == 1: - """ - shape1(2,), shape2(2,4) - test: pytest tests/test_matmul.py::test_matmul[shape_pair4-types0] -v -s - or - shape1(2,), shape2(2,) - test: pytest tests/test_matmul.py::test_matmul[shape_pair8-types0] -v -s - """ - size_m = 1 - - if in_array2.ndim < ndim_max or ndim_max == 1: - """ - shape1(5,2), shape2(2,) - test: pytest tests/test_matmul.py::test_matmul[shape_pair6-types0] -v -s - or - shape1(3,), shape2(3,) - test: pytest tests/test_matmul.py::test_matmul[shape_pair8-types0] -v -s - """ - size_n = 1 - - shape_result = shape1[:-1] + shape2[-1:] - - # convert string type names (array.dtype) to C enum DPNPFuncType - cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(in_array1.dtype) - cdef DPNPFuncType param2_type = dpnp_dtype_to_DPNPFuncType(in_array2.dtype) - - # get the FPTR data structure - cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_MATMUL_EXT, param1_type, param2_type) - - # create result array with type given by FPTR data - result_sycl_device, result_usm_type, result_sycl_queue = utils.get_common_usm_allocation(in_array1, in_array2) - cdef utils.dpnp_descriptor result = utils.create_output_descriptor(shape_result, - kernel_data.return_type, - out, - device=result_sycl_device, - usm_type=result_usm_type, - sycl_queue=result_sycl_queue) - if result.size == 0: - return result - - cdef c_dpctl.SyclQueue q = result_sycl_queue - cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref() - - cdef fptr_2in_1out_matmul_t func = kernel_data.ptr - # call FPTR function - cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref, - result.get_data(), - result.size, - result.ndim, - NULL, # result_shape 
- NULL, # result_strides - in_array1.get_data(), - in_array1.size, - in_array1.ndim, - shape1.data(), - NULL, # in_array1_strides - in_array2.get_data(), - in_array2.size, - in_array2.ndim, - shape2.data(), - NULL, # in_array2_strides - NULL) # dep_event_vec_ref - with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref) - c_dpctl.DPCTLEvent_Delete(event_ref) - - return result diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py index 30b6134da17..d60863dd104 100644 --- a/dpnp/dpnp_iface_linearalgebra.py +++ b/dpnp/dpnp_iface_linearalgebra.py @@ -1,5 +1,3 @@ -# cython: language_level=3 -# distutils: language = c++ # -*- coding: utf-8 -*- # ***************************************************************************** # Copyright (c) 2016-2023, Intel Corporation @@ -46,6 +44,7 @@ import dpnp from dpnp.dpnp_algo import * from dpnp.dpnp_utils import * +from dpnp.dpnp_utils.dpnp_utils_linearalgebra import dpnp_matmul __all__ = [ "dot", @@ -246,7 +245,17 @@ def kron(x1, x2): return call_origin(numpy.kron, x1, x2) -def matmul(x1, x2, out=None, **kwargs): +def matmul( + x1, + x2, + /, + out=None, + *, + casting="same_kind", + order="K", + dtype=None, + subok=True, +): """ Matrix product of two arrays. @@ -254,9 +263,9 @@ def matmul(x1, x2, out=None, **kwargs): Limitations ----------- - Input arrays are supported as :obj:`dpnp.ndarray`. - Otherwise the function will be executed sequentially on CPU. - Parameter `out` is supported as :obj:`dpnp.ndarray` and as default value ``None``. + Input arrays and parameter `out` are supported as either :class:`dpnp.ndarray` + or :class:`dpctl.tensor.usm_ndarray`. + Keyword argument `subok` is currently unsupported. Input array data types are limited by supported DPNP :ref:`Data types`. See Also @@ -269,63 +278,65 @@ def matmul(x1, x2, out=None, **kwargs): Examples -------- + For 2-D arrays it is the matrix product: + >>> import dpnp as np - >>> a = np.ones([9, 5, 7, 4]) - >>> c = np.ones([9, 5, 4, 3]) - >>> np.matmul(a, c).shape - (9, 5, 7, 3) >>> a = np.array([[1, 0], [0, 1]]) >>> b = np.array([[4, 1], [2, 2]]) >>> np.matmul(a, b) array([[4, 1], [2, 2]]) - """ + For 2-D mixed with 1-D, the result is the usual. 
- x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False) - x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_nondefault_queue=False) - if x1_desc and x2_desc and not kwargs: - if x1_desc.ndim != 2 or x2_desc.ndim != 2: - pass - elif not x1_desc.ndim: - pass - elif not x2_desc.ndim: - pass - elif not x1_desc.size: - pass - elif not x2_desc.size: - pass - else: - if 0: - """ - Cost model checks - """ - - array1_size = x1_desc.size - array2_size = x2_desc.size - cost_size = 4096 # 2D array shape(64, 64) - - if (x1_desc.dtype == dpnp.float64) or ( - x1_desc.dtype == dpnp.float32 - ): - """ - Floating point types are handled via original math library better than SYCL math library - """ - cost_size = 262144 # 2D array shape(512, 512) - - if (array1_size > cost_size) and (array2_size > cost_size): - return dpnp_matmul(x1_desc, x2_desc, out) - else: - out_desc = ( - dpnp.get_dpnp_descriptor( - out, copy_when_nondefault_queue=False - ) - if out is not None - else None - ) - return dpnp_matmul(x1_desc, x2_desc, out_desc).get_pyobj() + >>> a = np.array([[1, 0], [0, 1]]) + >>> b = np.array([1, 2]) + >>> np.matmul(a, b) + array([1, 2]) + >>> np.matmul(b, a) + array([1, 2]) + + Broadcasting is conventional for stacks of arrays - return call_origin(numpy.matmul, x1, x2, out=out, **kwargs) + >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4)) + >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2)) + >>> np.matmul(a,b).shape + (2, 2, 2) + >>> np.matmul(a, b)[0, 1, 1] + array(98) + >>> np.sum(a[0, 1, :] * b[0 , :, 1]) + array(98) + + Vector, vector returns the scalar inner product, but neither argument is complex-conjugated: + + >>> x1 = np.array([2j, 3j]) + >>> x2 = np.array([2j, 3j]) + >>> np.matmul(x1, x2) + array(-13+0j) + + The ``@`` operator can be used as a shorthand for ``matmul`` on + :class:`dpnp.ndarray`. + + >>> x1 @ x2 + array(-13+0j) + + """ + + dpnp.check_supported_arrays_type(x1) + dpnp.check_supported_arrays_type(x2) + if subok is False: + raise NotImplementedError( + "subok keyword argument is only supported by its default value." + ) + else: + return dpnp_matmul( + x1, + x2, + out=out, + casting=casting, + order=order, + dtype=dtype, + ) def outer(x1, x2, out=None): diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9efb2aa04f1..dd2fa8f8c2d 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1380,7 +1380,9 @@ def result_type(*arrays_and_dtypes): """ usm_arrays_and_dtypes = [ - X.dtype if isinstance(X, (dpnp_array, dpt.usm_ndarray)) else X + dpnp.get_usm_ndarray(X) + if isinstance(X, (dpnp_array, dpt.usm_ndarray)) + else X for X in arrays_and_dtypes ] return dpt.result_type(*usm_arrays_and_dtypes) diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py new file mode 100644 index 00000000000..d0add55eee3 --- /dev/null +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -0,0 +1,363 @@ +# ***************************************************************************** +# Copyright (c) 2024, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor._tensor_impl as ti
+import numpy
+
+import dpnp
+import dpnp.backend.extensions.blas._blas_impl as bi
+from dpnp.dpnp_utils import get_usm_allocations
+
+__all__ = ["dpnp_matmul"]
+
+
+def _gemm_res_dtype(*arrays, dtype, casting, sycl_queue):
+    """
+    Determine the output array data type and the intermediate data type.
+
+    If `dtype` is ``None``, the output array data type is determined based on
+    the Promotion Type Rule and device capabilities. Otherwise, `dtype` is
+    used as the output array dtype if the input arrays can be cast to it
+    according to the given casting rule. If casting cannot be done, a
+    ``TypeError`` is raised.
+    The intermediate data type is the data type used for performing the
+    matmul calculations. If the output array dtype is a floating-point data
+    type, it is also used as the intermediate data type. If the output array
+    dtype is an integral data type, the default floating-point data type of
+    the device where the input arrays are allocated is used as the
+    intermediate data type.
+
+    Parameters
+    ----------
+    arrays : {dpnp.ndarray, usm_ndarray}
+        Input arrays.
+    dtype : dtype
+        If not ``None``, data type of the output array.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+        Controls what kind of data casting may occur.
+    sycl_queue : {SyclQueue}
+        A SYCL queue to use for determining the default floating-point data type.
+
+    Returns
+    -------
+    gemm_dtype, res_dtype :
+        `gemm_dtype` is the data type used in performing the matmul calculations.
+        The input arrays of the matmul function are cast to `gemm_dtype` and
+        then the calculations are performed.
+        `res_dtype` is the output data type. When the result is obtained, it is
+        cast to `res_dtype`.
+
+    """
+
+    res_dtype = dpnp.result_type(*arrays)
+    default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue)
+
+    if dtype is not None:
+        if dpnp.can_cast(res_dtype, dtype, casting=casting):
+            res_dtype = dtype
+        else:
+            raise TypeError(
+                f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
+            )
+
+    gemm_dtype = (
+        res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype
+    )
+
+    return gemm_dtype, res_dtype
+
+
+def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
+    # If an input array is F-contiguous, change its order to C-contiguous:
+    # mkl::gemm_batch requires each 2-D slice to be contiguous, but the
+    # 2-D slices of an F-contiguous N-D array are not contiguous in memory.
+    ht_tasks_list = []
+    x1 = _get_gemm_contig_array(x1, dev_tasks_list, ht_tasks_list)
+    x2 = _get_gemm_contig_array(x2, dev_tasks_list, ht_tasks_list)
+
+    x1_strides = x1.strides
+    x2_strides = x2.strides
+    res_strides = res.strides
+
+    # When the shape along a particular dimension is 1, the stride along
+    # that dimension is not a meaningful number and is undefined. Here, we
+    # standardize the strides before continuing, setting the stride to 0
+    # if the shape along that axis is <= 1.
+    if x1_is_2D:
+        x1_strides = tuple(
+            str_i if sh_i > 1 else 0
+            for sh_i, str_i in zip(x1.shape, x1_strides)
+        )
+    if x2_is_2D:
+        x2_strides = tuple(
+            str_i if sh_i > 1 else 0
+            for sh_i, str_i in zip(x2.shape, x2_strides)
+        )
+
+    batch_size = res.shape[:-2][0]
+    stridea = x1_strides[0]
+    strideb = x2_strides[0]
+    stridec = res_strides[-3]
+
+    if x1.ndim > 3:
+        iter = ti._contract_iter2(
+            res.shape[:-2], x1_strides[:-2], x2_strides[:-2]
+        )
+
+        if len(iter[0]) != 1:
+            raise ValueError("Input arrays cannot be used in gemm_batch")
+        batch_size = iter[0][0]
+        stridea = iter[1][0]
+        strideb = iter[3][0]
+
+    ht_blas_ev, _ = bi._gemm_batch(
+        exec_q,
+        dpnp.get_usm_ndarray(x1),
+        dpnp.get_usm_ndarray(x2),
+        dpnp.get_usm_ndarray(res),
+        batch_size,
+        stridea,
+        strideb,
+        stridec,
+        dev_tasks_list,
+    )
+
+    return ht_blas_ev, ht_tasks_list, res
+
+
+def _get_gemm_contig_array(x, dep_events, host_events, dtype=None):
+    """
+    Create a copy of the input array if needed.
+
+    This function has two use cases. In the first, more general use case,
+    if the input array is neither c-contiguous nor f-contiguous, we ensure
+    it becomes c-contiguous. Additionally, if the input array has an
+    integral dtype, we convert it to an appropriate floating-point data
+    type specified by `dtype`. The second use case is for N-dimensional
+    arrays with N > 2, where we need to ensure c-contiguity. This is
+    crucial because the `gemm_batch` implementation in dpnp only works for
+    C-contiguous arrays. This use case matters when the input array is
+    f-contiguous with a floating-point dtype, since such an array is not
+    modified by the first use case.
+
+    """
+
+    if dtype is None:
+        copy = not x.flags.c_contiguous
+    else:
+        copy = (
+            not (x.flags.c_contiguous or x.flags.f_contiguous)
+            or x.dtype != dtype
+        )
+
+    if copy:
+        x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=dpnp.get_usm_ndarray(x),
+            dst=x_copy.get_array(),
+            sycl_queue=x.sycl_queue,
+        )
+        dep_events.append(copy_ev)
+        host_events.append(ht_copy_ev)
+        return x_copy
+    return x
+
+
+def dpnp_matmul(
+    x1,
+    x2,
+    /,
+    out=None,
+    *,
+    casting="same_kind",
+    order="K",
+    dtype=None,
+):
+    """
+    dpnp_matmul(x1, x2, out=None, casting="same_kind", order="K", dtype=None)
+
+    Return the matrix product of two arrays.
+
+    The main calculation is done by calling an extension function
+    for the BLAS library of oneMKL. Depending on the dimensions of the `x1`
+    and `x2` arrays, it will be either ``gemm`` (for one- and two-dimensional
+    arrays) or ``gemm_batch`` (for others).
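+
+    For instance, a rough sketch of the dispatch (shapes only; assumes the
+    default real floating-point type is supported on the device):
+
+    >>> import dpnp as np
+    >>> np.matmul(np.ones((2, 4)), np.ones((4, 3))).shape  # routed to gemm
+    (2, 3)
+    >>> np.matmul(np.ones((5, 2, 4)), np.ones((5, 4, 3))).shape  # gemm_batch
+    (5, 2, 3)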
+ + """ + + x1_ndim = x1.ndim + x2_ndim = x2.ndim + + if x1_ndim == 0: + raise ValueError( + "input array 0 does not have enough dimensions (has 0, but requires at least 1)" + ) + if x2_ndim == 0: + raise ValueError( + "input array 1 does not have enough dimensions (has 0, but requires at least 1)" + ) + + res_usm_type, exec_q = get_usm_allocations([x1, x2]) + + squeeze_flag = x1_ndim == 1 or x2_ndim == 1 + if x1_ndim == 1: + x1 = x1[dpnp.newaxis, :] + x1_ndim = x1.ndim + + if x2_ndim == 1: + x2 = x2[:, dpnp.newaxis] + x2_ndim = x2.ndim + + x1_shape = x1.shape + x2_shape = x2.shape + if x1_shape[-1] != x2_shape[-2]: + raise ValueError( + "Input arrays have a mismatch in their core dimensions. " + "The core dimensions should follow this signature: (n?,k),(k,m?)->(n?,m?) " + f"(size {x1_shape[-1]} is different from {x2_shape[-2]})" + ) + + # Determine the appropriate data types + gemm_dtype, res_dtype = _gemm_res_dtype( + x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q + ) + + x1_is_2D = x1_ndim == 2 or numpy.prod(x1_shape[:-2]) == 1 # inherently 2D + x2_is_2D = x2_ndim == 2 or numpy.prod(x2_shape[:-2]) == 1 + + # find the result shape + if x1_is_2D and x2_is_2D: + x1 = x1.reshape(x1.shape[-2], x1.shape[-1]) + x2 = x2.reshape(x2.shape[-2], x2.shape[-1]) + res_shape = (x1.shape[-2], x2.shape[-1]) + else: + # makes the dimension of input the same by adding new axis + if x1_ndim != x2_ndim: + diff = abs(x1_ndim - x2_ndim) + if x1_ndim < x2_ndim: + x1 = x1.reshape((1,) * diff + x1.shape) + x1_ndim = x1.ndim + x1_shape = x1.shape + else: + x2 = x2.reshape((1,) * diff + x2.shape) + x2_ndim = x2.ndim + x2_shape = x2.shape + + # examining the option to align inputs + # when their shapes differ but they are 1-D in some dimensions. + tmp_shape = list(x1_shape[:-2]) + for i in range(x1_ndim - 2): + if x1_shape[i] != x2_shape[i]: + if x1_shape[i] == 1: + tmp_shape[i] = x2_shape[i] + # If the `x1` array is inherently 2D, there's no need to + # duplicate the data for the 1-D dimension; + # GEMM handles it automatically. + if not x1_is_2D: + x1 = dpnp.repeat(x1, x2_shape[i], axis=i) + elif x2_shape[i] == 1: + tmp_shape[i] = x1_shape[i] + if not x2_is_2D: + x2 = dpnp.repeat(x2, x1_shape[i], axis=i) + else: + raise ValueError( + "arrays could not be broadcast together with remapped shapes." 
+                    )
+        x1_shape = x1.shape
+        x2_shape = x2.shape
+        res_shape = tuple(tmp_shape) + (x1_shape[-2], x2_shape[-1])
+
+    # calculate results
+    result = dpnp.empty(
+        res_shape,
+        dtype=gemm_dtype,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+    if result.size == 0:
+        pass
+    elif x1.size == 0 or x2.size == 0:
+        result.fill(0)
+    else:
+        # input arrays should have the proper data type
+        # and be C_CONTIGUOUS or F_CONTIGUOUS
+        dep_events_list = []
+        host_tasks_list = []
+        x1 = _get_gemm_contig_array(
+            x1, dep_events_list, host_tasks_list, gemm_dtype
+        )
+        x2 = _get_gemm_contig_array(
+            x2, dep_events_list, host_tasks_list, gemm_dtype
+        )
+
+        if x1_is_2D and x2_is_2D:
+            ht_blas_ev, _ = bi._gemm(
+                exec_q,
+                dpnp.get_usm_ndarray(x1),
+                dpnp.get_usm_ndarray(x2),
+                dpnp.get_usm_ndarray(result),
+                dep_events_list,
+            )
+        else:
+            (
+                ht_blas_ev,
+                ht_copy_ev,
+                result,
+            ) = _gemm_batch_matmul(
+                exec_q,
+                x1,
+                x2,
+                result,
+                x1_is_2D,
+                x2_is_2D,
+                dep_events_list,
+            )
+            host_tasks_list += ht_copy_ev
+
+        host_tasks_list.append(ht_blas_ev)
+        dpctl.SyclEvent.wait_for(host_tasks_list)
+
+    if squeeze_flag:
+        result = dpnp.squeeze(result)
+
+    if x1_is_2D and x2_is_2D:
+        # add new axes only if one of the input arrays
+        # was inherently 2-D
+        new_size = max(x1_ndim, x2_ndim)
+        for _ in range(new_size - 2):
+            result = result[dpnp.newaxis, :]
+
+    if gemm_dtype != res_dtype:
+        result = dpnp.astype(result, res_dtype, copy=False)
+    if out is None:
+        # If `order` was not passed as the default,
+        # we need to update the result to match the requested `order`.
+        if order not in ["k", "K"]:
+            return dpnp.array(result, copy=False, order=order)
+        else:
+            return result
+    else:
+        return dpnp.get_result_array(result, out, casting=casting)
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index 7ed82953541..2d15283a5bc 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -1,6 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2023, Intel Corporation
 # All rights reserved.
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index b7a56c4c7db..e07a57beebe 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -1,6 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2016-2023, Intel Corporation
 # All rights reserved.
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index bd44b974253..c6131541b6e 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -1,6 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2023, Intel Corporation
 # All rights reserved.
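Taken together, the Python-level changes above expose the new keywords end to end; a hedged usage sketch (integer inputs are computed through the device default floating-point type, and results depend on device fp64 support):

    import dpnp as np

    a = np.arange(6).reshape(2, 3)
    b = np.arange(12).reshape(3, 4)

    # `dtype` and `casting` are resolved before the BLAS call
    c = np.matmul(a, b, dtype=np.float32)

    # stacked inputs are routed to gemm_batch
    d = np.matmul(np.ones((5, 2, 3)), np.ones((5, 3, 4)))

    # a non-default `order` re-lays out the result at the end
    e = np.matmul(a, b, order="F")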
diff --git a/tests/test_manipulation.py b/tests/test_manipulation.py
index 576229b1d56..bb5533b0e62 100644
--- a/tests/test_manipulation.py
+++ b/tests/test_manipulation.py
@@ -8,6 +8,7 @@
     get_all_dtypes,
     get_complex_dtypes,
     get_float_dtypes,
+    has_support_aspect64,
 )
 
 testdata = []
@@ -71,11 +72,17 @@ def test_repeat(arr):
     assert_array_equal(expected, result)
 
 
+# TODO: Temporarily skipping the test, until Internal CI is updated with
+# recent changes in dpctl regarding the dpt.result_type function
+@pytest.mark.skip("Temporarily skipping the test")
 def test_result_type():
-    X = [dpnp.ones((2), dtype=dpnp.int64), dpnp.int32, "float16"]
-    X_np = [numpy.ones((2), dtype=numpy.int64), numpy.int32, "float16"]
+    X = [dpnp.ones((2), dtype=dpnp.int64), dpnp.int32, "float32"]
+    X_np = [numpy.ones((2), dtype=numpy.int64), numpy.int32, "float32"]
 
-    assert dpnp.result_type(*X) == numpy.result_type(*X_np)
+    if has_support_aspect64():
+        assert dpnp.result_type(*X) == numpy.result_type(*X_np)
+    else:
+        assert dpnp.result_type(*X) == dpnp.default_float_type(X[0].device)
 
 
 def test_result_type_only_dtypes():
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index c521da5e0cb..6c95093a01b 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -1,5 +1,6 @@
 from itertools import permutations
 
+import dpctl
 import dpctl.tensor as dpt
 import numpy
 import pytest
@@ -2389,3 +2390,367 @@ def test_inplace_remainder(dtype):
     dp_a %= 4
 
     assert_allclose(dp_a, np_a)
+
+
+@pytest.mark.parametrize(
+    "dtype", get_all_dtypes(no_bool=True, no_none=True, no_complex=True)
+)
+def test_inplace_floor_divide(dtype):
+    size = 21
+    np_a = numpy.arange(size, dtype=dtype)
+    dp_a = dpnp.arange(size, dtype=dtype)
+
+    np_a //= 4
+    dp_a //= 4
+
+    assert_allclose(dp_a, np_a)
+
+
+class TestMatmul:
+    @pytest.mark.parametrize(
+        "order_pair", [("C", "C"), ("C", "F"), ("F", "C"), ("F", "F")]
+    )
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((4,), (4,)),
+            ((4,), (4, 2)),
+            ((2, 4), (4,)),
+            ((2, 4), (4, 3)),
+            ((1, 2, 3), (1, 3, 5)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((1, 2, 3), (4, 3, 5)),
+            ((2, 3), (4, 3, 5)),
+            ((4, 2, 3), (1, 3, 5)),
+            ((4, 2, 3), (3, 5)),
+            ((1, 1, 4, 3), (1, 1, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+            ((6, 7, 4, 3), (1, 1, 3, 5)),
+            ((6, 7, 4, 3), (1, 3, 5)),
+            ((6, 7, 4, 3), (3, 5)),
+            ((6, 7, 4, 3), (1, 7, 3, 5)),
+            ((6, 7, 4, 3), (7, 3, 5)),
+            ((6, 7, 4, 3), (6, 1, 3, 5)),
+            ((1, 1, 4, 3), (6, 7, 3, 5)),
+            ((1, 4, 3), (6, 7, 3, 5)),
+            ((4, 3), (6, 7, 3, 5)),
+            ((6, 1, 4, 3), (6, 7, 3, 5)),
+            ((1, 7, 4, 3), (6, 7, 3, 5)),
+            ((7, 4, 3), (6, 7, 3, 5)),
+            ((1, 5, 3, 2), (6, 5, 2, 4)),
+            ((5, 3, 2), (6, 5, 2, 4)),
+            ((1, 3, 3), (10, 1, 3, 1)),
+        ],
+    )
+    def test_matmul(self, order_pair, shape_pair):
+        order1, order2 = order_pair
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+        a1 = numpy.array(a1, order=order1)
+        a2 = numpy.array(a2, order=order2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "order_pair", [("C", "C"), ("C", "F"), ("F", "C"), ("F", "F")]
+    )
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 0), (0, 3)),
+            ((0, 4), (4, 3)),
+            ((2, 4), (4, 0)),
+            ((1, 2, 3), (0, 3, 5)),
+            ((0, 2, 3), (1, 3, 5)),
+            ((2, 3), (0, 3, 5)),
+            ((0, 2, 3), (3, 5)),
+            ((0, 0, 4, 3), (1, 1, 3, 5)),
+            ((6, 0, 4, 3), (1, 3, 5)),
+            ((0, 7, 4, 3), (3, 5)),
+            ((0, 7, 4, 3), (1, 7, 3, 5)),
+            ((0, 7, 4, 3), (7, 3, 5)),
+            ((6, 0, 4, 3), (6, 1, 3, 5)),
+            ((1, 1, 4, 3), (0, 0, 3, 5)),
+            ((1, 4, 3), (6, 0, 3, 5)),
+            ((4, 3), (0, 0, 3, 5)),
+            ((6, 1, 4, 3), (6, 0, 3, 5)),
+            ((1, 7, 4, 3), (0, 7, 3, 5)),
+            ((7, 4, 3), (0, 7, 3, 5)),
+        ],
+    )
+    def test_matmul_empty(self, order_pair, shape_pair):
+        order1, order2 = order_pair
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+        a1 = numpy.array(a1, order=order1)
+        a2 = numpy.array(a2, order=order2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_bool(self, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.resize(
+            numpy.arange(2, dtype=numpy.bool_), numpy.prod(shape1)
+        ).reshape(shape1)
+        a2 = numpy.resize(
+            numpy.arange(2, dtype=numpy.bool_), numpy.prod(shape2)
+        ).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_dtype(self, dtype, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2, dtype=dtype)
+        expected = numpy.matmul(a1, a2, dtype=dtype)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype1", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "dtype2", get_all_dtypes(no_bool=True, no_none=True)
+    )
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2), dtype=dtype1).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        if dpnp.can_cast(dpnp.result_type(b1, b2), dtype2, casting="same_kind"):
+            result = dpnp.matmul(b1, b2, dtype=dtype2)
+            expected = numpy.matmul(a1, a2, dtype=dtype2)
+            assert_dtype_allclose(result, expected)
+        else:
+            with pytest.raises(TypeError):
+                dpnp.matmul(b1, b2, dtype=dtype2)
+
+    # TODO: Temporarily skipping the test, until Internal CI is updated with
+    # recent changes in dpctl regarding the dpt.result_type function
+    @pytest.mark.skip("Temporarily skipping the test")
+    @pytest.mark.parametrize("dtype1", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype2", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_dtype_matrix_inout(self, dtype1, dtype2, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2), dtype=dtype2).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2)
+        expected = numpy.matmul(a1, a2)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("order", ["C", "F", "K", "A"])
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((2, 4), (4, 3)),
+            ((4, 2, 3), (4, 3, 5)),
+            ((6, 7, 4, 3), (6, 7, 3, 5)),
+        ],
+        ids=[
+            "((2, 4), (4, 3))",
+            "((4, 2, 3), (4, 3, 5))",
+            "((6, 7, 4, 3), (6, 7, 3, 5))",
+        ],
+    )
+    def test_matmul_order(self, order, shape_pair):
+        shape1, shape2 = shape_pair
+        a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+        a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        result = dpnp.matmul(b1, b2, order=order)
+        expected = numpy.matmul(a1, a2, order=order)
+        assert result.flags.c_contiguous == expected.flags.c_contiguous
+        assert result.flags.f_contiguous == expected.flags.f_contiguous
+        assert_dtype_allclose(result, expected)
+
+    def test_matmul_strided(self):
+        for dim in [1, 2, 3, 4]:
+            A = numpy.random.rand(*([20] * dim))
+            B = dpnp.asarray(A)
+            # positive strides
+            slices = tuple(slice(None, None, 2) for _ in range(dim))
+            a = A[slices]
+            b = B[slices]
+
+            result = dpnp.matmul(b, b)
+            expected = numpy.matmul(a, a)
+            assert_dtype_allclose(result, expected)
+
+            # negative strides
+            slices = tuple(slice(None, None, -2) for _ in range(dim))
+            a = A[slices]
+            b = B[slices]
+
+            result = dpnp.matmul(b, b)
+            expected = numpy.matmul(a, a)
+            assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_matmul_out(self, dtype):
+        a1 = numpy.arange(5 * 4, dtype=dtype).reshape(5, 4)
+        a2 = numpy.arange(7 * 4, dtype=dtype).reshape(4, 7)
+
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        dpnp_out = dpnp.empty((5, 7), dtype=dtype)
+        result = dpnp.matmul(b1, b2, out=dpnp_out)
+        expected = numpy.matmul(a1, a2)
+        assert result is dpnp_out
+        assert_dtype_allclose(result, expected)
+
+
+class TestMatmulInvalidCases:
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((3, 2), ()),
+            ((), (3, 2)),
+            ((), ()),
+        ],
+    )
+    def test_zero_dim(self, shape_pair):
+        for xp in (numpy, dpnp):
+            shape1, shape2 = shape_pair
+            x1 = xp.arange(numpy.prod(shape1), dtype=xp.float32).reshape(shape1)
+            x2 = xp.arange(numpy.prod(shape2), dtype=xp.float32).reshape(shape2)
+            with pytest.raises(ValueError):
+                xp.matmul(x1, x2)
+
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((5, 3, 1), (3, 1, 4)),
+            ((3, 2, 3), (3, 2, 4)),
+            ((3, 2), (1,)),
+            ((1, 2), (3, 1)),
+            ((4, 3, 2), (6, 5, 2, 4)),
+            ((6, 5, 3, 2), (3, 2, 4)),
+        ],
+    )
+    def test_invalid_shape(self, shape_pair):
+        for xp in (numpy, dpnp):
+            shape1, shape2 = shape_pair
+            x1 = xp.arange(numpy.prod(shape1), dtype=xp.float32).reshape(shape1)
+            x2 = xp.arange(numpy.prod(shape2), dtype=xp.float32).reshape(shape2)
+            with pytest.raises(ValueError):
+                xp.matmul(x1, x2)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True)[:-2])
+    def test_invalid_dtype(self, dtype):
+        dpnp_dtype = get_all_dtypes(no_none=True)[-1]
+        a1 = dpnp.arange(5 * 4, dtype=dpnp_dtype).reshape(5, 4)
+        a2 = dpnp.arange(7 * 4, dtype=dpnp_dtype).reshape(4, 7)
+        dp_out = dpnp.empty((5, 7), dtype=dtype)
+
+        with pytest.raises(TypeError):
+            dpnp.matmul(a1, a2, out=dp_out)
+
+    def test_exe_q(self):
+        x1 = dpnp.ones((5, 4), sycl_queue=dpctl.SyclQueue())
+        x2 = dpnp.ones((4, 7), sycl_queue=dpctl.SyclQueue())
+
+        with pytest.raises(ValueError):
+            dpnp.matmul(x1, x2)
+
+    # TODO: Temporarily skipping the test, until Internal CI is updated with
+    # recent changes in dpctl regarding the dpt.result_type function
+    @pytest.mark.skip("Temporarily skipping the test")
+    def test_matmul_casting(self):
+        a1 = dpnp.arange(2 * 4, dtype=dpnp.float32).reshape(2, 4)
+        a2 = dpnp.arange(4 * 3).reshape(4, 3)
+
+        res = dpnp.empty((2, 3), dtype=dpnp.int64)
+        with pytest.raises(TypeError):
+            dpnp.matmul(a1, a2, out=res, casting="safe")
+
+    def test_matmul_subok(self):
+        a1 = dpnp.arange(2 * 4).reshape(2, 4)
+        a2 = dpnp.arange(4 * 3).reshape(4, 3)
+
+        with pytest.raises(NotImplementedError):
+            dpnp.matmul(a1, a2, subok=False)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index de762ecef93..fb94f1cd4cb 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -666,6 +666,41 @@ def test_2in_1out_diff_queue_but_equal_context(func, device):
         getattr(dpnp, func)(x1, x2)
 
 
+@pytest.mark.parametrize(
+    "device",
+    valid_devices,
+    ids=[device.filter_string for device in valid_devices],
+)
+@pytest.mark.parametrize(
+    "shape_pair",
+    [
+        ((2, 4), (4, 3)),
+        ((4, 2, 3), (4, 3, 5)),
+        ((6, 7, 4, 3), (6, 7, 3, 5)),
+    ],
+    ids=[
+        "((2, 4), (4, 3))",
+        "((4, 2, 3), (4, 3, 5))",
+        "((6, 7, 4, 3), (6, 7, 3, 5))",
+    ],
+)
+def test_matmul(device, shape_pair):
+    shape1, shape2 = shape_pair
+    a1 = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+    a2 = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+    b1 = dpnp.asarray(a1, device=device)
+    b2 = dpnp.asarray(a2, device=device)
+
+    result = dpnp.matmul(b1, b2)
+    expected = numpy.matmul(a1, a2)
+    assert_allclose(expected, result)
+
+    result_queue = result.sycl_queue
+    assert_sycl_queue_equal(result_queue, b1.sycl_queue)
+    assert_sycl_queue_equal(result_queue, b2.sycl_queue)
+
+
 @pytest.mark.parametrize(
     "func, kwargs",
     [
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 6b46dcceb1e..0b094920c01 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -352,6 +352,39 @@ def test_coerced_usm_types_bitwise_op(op, usm_type_x, usm_type_y):
     assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
+@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize(
+    "shape_pair",
+    [
+        ((2, 4), (4, 3)),
+        ((2, 0), (0, 3)),
+        ((2, 4), (4, 0)),
+        ((4, 2, 3), (4, 3, 5)),
+        ((6, 7, 4, 3), (6, 7, 3, 5)),
+    ],
+    ids=[
+        "((2, 4), (4, 3))",
+        "((2, 0), (0, 3))",
+        "((2, 4), (4, 0))",
+        "((4, 2, 3), (4, 3, 5))",
+        "((6, 7, 4, 3), (6, 7, 3, 5))",
+    ],
+)
+def test_matmul(usm_type_x, usm_type_y, shape_pair):
+    shape1, shape2 = shape_pair
+    x = numpy.arange(numpy.prod(shape1)).reshape(shape1)
+    y = numpy.arange(numpy.prod(shape2)).reshape(shape2)
+
+    x = dp.array(x, usm_type=usm_type_x)
+    y = dp.array(y, usm_type=usm_type_y)
+    z = dp.matmul(x, y)
+
+    assert x.usm_type == usm_type_x
+    assert y.usm_type == usm_type_y
+    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+
+
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
 @pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types)
 def test_meshgrid(usm_type_x, usm_type_y):
diff --git a/tests/third_party/cupy/math_tests/test_matmul.py b/tests/third_party/cupy/math_tests/test_matmul.py
index d0f3555373a..d21ec7a2d68 100644
--- a/tests/third_party/cupy/math_tests/test_matmul.py
+++ b/tests/third_party/cupy/math_tests/test_matmul.py
@@ -25,40 +25,38 @@
             ((0,), (0,)),
             # matmul test
             ((5, 3, 2), (5, 2, 4)),
-            # ((0, 3, 2), (0, 2, 4)),
-            # ((5, 3, 2), (2, 4)),
-            # ((0, 3, 2), (2, 4)),
-            # ((3, 2), (5, 2, 4)),
-            # ((3, 2), (0, 2, 4)),
-            # ((5, 3, 2), (1, 2, 4)),
-            # ((0, 3, 2), (1, 2, 4)),
-            # ((1, 3, 2), (5, 2, 4)),
-            # ((1, 3, 2), (0, 2, 4)),
-            # ((5, 3, 2), (2,)),
-            # ((5, 3, 0), (0,)),
-            # ((2,), (5, 2, 4)),
-            # ((0,), (5, 0, 4)),
-            # ((2, 2, 3, 2), (2, 2, 2, 4)),
-            # ((5, 0, 3, 2), (5, 0, 2, 4)),
-            # ((6, 5, 3, 2), (2, 4)),
-            # ((5, 0, 3, 2), (2, 4)),
-            # ((3, 2), (6, 5, 2, 4)),
-            # ((3, 2), (5, 0, 2, 4)),
-            # ((1, 5, 3, 2), (6, 1, 2, 4)),
-            # ((1, 0, 3, 2), (6, 1, 2, 4)),
-            # ((6, 1, 3, 2), (1, 5, 2, 4)),
-            # ((6, 1, 3, 2), (1, 0, 2, 4)),
-            # ((6, 5, 3, 2), (2,)),
-            # ((6, 5, 3, 0), (0,)),
-            # ((2,), (6, 5, 2, 4)),
-            # ((0,), (6, 5, 0, 4)),
+            ((0, 3, 2), (0, 2, 4)),
+            ((5, 3, 2), (2, 4)),
+            ((0, 3, 2), (2, 4)),
+            ((3, 2), (5, 2, 4)),
+            ((3, 2), (0, 2, 4)),
+            ((5, 3, 2), (1, 2, 4)),
+            ((0, 3, 2), (1, 2, 4)),
+            ((1, 3, 2), (5, 2, 4)),
+            ((1, 3, 2), (0, 2, 4)),
+            ((5, 3, 2), (2,)),
+            ((5, 3, 0), (0,)),
+            ((2,), (5, 2, 4)),
+            ((0,), (5, 0, 4)),
+            ((2, 2, 3, 2), (2, 2, 2, 4)),
+            ((5, 0, 3, 2), (5, 0, 2, 4)),
+            ((6, 5, 3, 2), (2, 4)),
+            ((5, 0, 3, 2), (2, 4)),
+            ((3, 2), (6, 5, 2, 4)),
+            ((3, 2), (5, 0, 2, 4)),
+            ((1, 5, 3, 2), (6, 1, 2, 4)),
+            ((1, 0, 3, 2), (6, 1, 2, 4)),
+            ((6, 1, 3, 2), (1, 5, 2, 4)),
+            ((6, 1, 3, 2), (1, 0, 2, 4)),
+            ((6, 5, 3, 2), (2,)),
+            ((6, 5, 3, 0), (0,)),
+            ((2,), (6, 5, 2, 4)),
+            ((0,), (6, 5, 0, 4)),
             ((1, 3, 3), (10, 1, 3, 1)),
         ],
     }
 )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestMatmul(unittest.TestCase):
     @testing.for_all_dtypes(name="dtype1")
     @testing.numpy_cupy_allclose(rtol=1e-3, atol=1e-3)  # required for uint8
@@ -94,8 +92,6 @@ def test_cupy_matmul(self, xp, dtype1):
     }
 )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestMatmulLarge(unittest.TestCase):
     # Avoid overflow
     skip_dtypes = {
@@ -151,8 +147,6 @@ def test_cupy_matmul(self, xp, dtype1):
     }
 )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestMatmulInvalidShape(unittest.TestCase):
     def test_invalid_shape(self):
         for xp in (numpy, dpnp):
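As a closing illustration of what the re-enabled test shapes exercise end to end, here is a small example, assuming a dpnp build that includes the new BLAS extension and a SYCL device to run on (this snippet is illustrative and not part of the patch):

import numpy
import dpnp

# Batched matmul with broadcasting across the batch dimensions.
a = dpnp.arange(6 * 1 * 4 * 3, dtype=dpnp.float32).reshape(6, 1, 4, 3)
b = dpnp.arange(7 * 3 * 5, dtype=dpnp.float32).reshape(7, 3, 5)

c = dpnp.matmul(a, b)
assert c.shape == (6, 7, 4, 5)  # batch dims (6, 1) and (7,) broadcast to (6, 7)

# The device result agrees with NumPy computed on the host.
ref = numpy.matmul(dpnp.asnumpy(a), dpnp.asnumpy(b))
assert numpy.allclose(dpnp.asnumpy(c), ref)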