resolve gh-1871 #1872

Merged: 5 commits, Jun 15, 2024
Changes from 2 commits
1 change: 0 additions & 1 deletion dpnp/dpnp_iface_linearalgebra.py
@@ -821,7 +821,6 @@ def matmul(

"""

dpnp.check_supported_arrays_type(x1, x2)
if subok is False:
raise NotImplementedError(
"subok keyword argument is only supported by its default value."
99 changes: 71 additions & 28 deletions dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -33,6 +33,7 @@
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.tensor._tensor_impl as ti
import numpy
from dpctl.utils import ExecutionPlacementError
from numpy.core.numeric import normalize_axis_tuple

import dpnp
@@ -218,7 +219,9 @@ def _compute_size(start, shape):
return ret


def _copy_array(x, dep_events, host_events, copy_flag=False, dtype=None):
def _copy_array(
x, dep_events, host_events, copy_flag=False, dtype=None, order="C"
):
"""
Creating a copy of input array if needed.

@@ -236,7 +239,7 @@ def _copy_array(x, dep_events, host_events, copy_flag=False, dtype=None):
copy = x.dtype != dtype if dtype is not None else False

if copy:
x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
x_copy = dpnp.empty_like(x, dtype=dtype, order=order)
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=dpnp.get_usm_ndarray(x),
dst=x_copy.get_array(),
@@ -248,7 +251,9 @@ def _copy_array(x, dep_events, host_events, copy_flag=False, dtype=None):
return x
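The new order argument lets _copy_array allocate the temporary copy directly in the memory layout the later BLAS call expects, so the cast and the layout change happen in a single copy. A minimal NumPy sketch of the idea (illustrative only, not the dpnp implementation):

import numpy as np

x = np.arange(6, dtype=np.int32).reshape(2, 3)          # C-ordered int32 input
x_copy = np.empty_like(x, dtype=np.float64, order="F")  # F-ordered float64 buffer
x_copy[...] = x                                         # single copy + cast
print(x_copy.flags.f_contiguous)  # True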


def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
def _create_result_array(
x1, x2, out, shape, dtype, usm_type, sycl_queue, order="C"
):
"""
Create the result array.

@@ -263,13 +268,12 @@ def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
x1_usm = dpnp.get_usm_ndarray(x1)
x2_usm = dpnp.get_usm_ndarray(x2)
out_usm = dpnp.get_usm_ndarray(out)
contig_flag = _define_contig_flag(out)
contig_flag, _, _ = _define_contig_flag(out)

if (
out.dtype == dtype
and out.shape == shape
and out.usm_type == usm_type
and out.sycl_queue == sycl_queue
and contig_flag
and not ti._array_overlap(x1_usm, out_usm)
and not ti._array_overlap(x2_usm, out_usm)
@@ -279,6 +283,7 @@ def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
return dpnp.empty(
shape,
dtype=dtype,
order=order,
usm_type=usm_type,
sycl_queue=sycl_queue,
)
@@ -295,14 +300,14 @@ def _define_contig_flag(x):
x_strides = x.strides
x_shape = x.shape
if x.ndim < 2:
return True
return True, True, True

x_strides = _standardize_strides_to_nonzero(x_strides, x_shape)
x_is_c_contiguous = x_strides[-1] == 1 and x_strides[-2] == x_shape[-1]
x_is_f_contiguous = x_strides[-2] == 1 and x_strides[-1] == x_shape[-2]
if x_is_c_contiguous or x_is_f_contiguous:
flag = True
return flag
return flag, x_is_c_contiguous, x_is_f_contiguous
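_define_contig_flag now also reports whether the last two dimensions are C- or F-contiguous, which later drives the memory order of the result. A rough NumPy sketch of the same stride test (illustrative only; strides are taken in elements, and the dpnp helper additionally normalizes zero strides):

import numpy as np

def last_two_dims_contiguity(x):
    if x.ndim < 2:
        return True, True, True
    strides = tuple(s // x.itemsize for s in x.strides)
    is_c = strides[-1] == 1 and strides[-2] == x.shape[-1]
    is_f = strides[-2] == 1 and strides[-1] == x.shape[-2]
    return is_c or is_f, is_c, is_f

print(last_two_dims_contiguity(np.ones((3, 4), order="F")))  # (True, False, True)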


def _define_dim_flags(x, pos):
Expand Down Expand Up @@ -746,17 +751,24 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, dev_tasks_list):
)
ht_tasks_list.append(ht_blas_ev)
dpctl.SyclEvent.wait_for(ht_tasks_list)

res_shape = res.shape
if not row_major:
res = dpnp.reshape(
res.ravel(), (batch_size, res_shape[2], res_shape[1])
).transpose(0, 2, 1)
_, res_is_c_contig, res_is_f_contig = _define_contig_flag(res)
if row_major:
if res_is_f_contig:
res = dpnp.reshape(
res.ravel("F"), (res_shape[1], res_shape[2], batch_size)
).transpose(2, 0, 1)
else:
if res_is_c_contig:
res = dpnp.reshape(
res.ravel(), (batch_size, res_shape[2], res_shape[1])
).transpose(0, 2, 1)

if res_shape != orig_shape:
res = res.reshape(orig_shape)

res = dpnp.ascontiguousarray(res)
return res
return dpnp.ascontiguousarray(res)
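The reshape/transpose pair above reinterprets the buffer written by gemm_batch when the layout used by the BLAS call does not match the contiguity of res. A small NumPy illustration of the non-row-major branch, with made-up data and a single batch entry:

import numpy as np

# A column-major gemm writes R = [[1, 2, 3], [4, 5, 6]] column by column:
buf = np.array([1, 4, 2, 5, 3, 6])
batch, m, n = 1, 2, 3
res = buf.reshape(batch, m, n)  # C-ordered view: elements appear scrambled
fixed = res.ravel().reshape(batch, n, m).transpose(0, 2, 1)
print(fixed[0])  # [[1 2 3] [4 5 6]]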


def _gemm_matmul(exec_q, x1, x2, res, dev_tasks_list):
@@ -769,14 +781,16 @@ def _gemm_matmul(exec_q, x1, x2, res, dev_tasks_list):
)
ht_blas_ev.wait()

if not row_major:
# TODO: investigate the possibility of defining result
# array with "F" order for this case
res = dpnp.ascontiguousarray(
dpnp.reshape(res.ravel(), res.shape, order="F")
)
if row_major:
if res.flags.f_contiguous is True:
# read data in "F" order and write it in "C" order
res = dpnp.reshape(res.ravel(order="F"), res.shape, order="C")
else:
if res.flags.c_contiguous is True:
# read data in "C" order and write it in "F" order
res = dpnp.reshape(res.ravel(), res.shape, order="F")

return res
return dpnp.ascontiguousarray(res)
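The same idea applies in the single-matrix _gemm_matmul case: when the routine writes row-major data into an array whose flags say F-contiguous (or vice versa), reading the buffer back in one order and rewriting it in the other restores the intended values. An illustrative NumPy sketch with made-up data:

import numpy as np

# A row-major routine writes R = [[1, 2, 3], [4, 5, 6]] into a flat buffer,
# but that buffer is then viewed as a Fortran-ordered (2, 3) array.
buf = np.array([1, 2, 3, 4, 5, 6])
wrong = buf.reshape((2, 3), order="F")  # [[1, 3, 5], [2, 4, 6]]
# Read in "F" order, write in "C" order: the intended matrix reappears.
fixed = np.reshape(wrong.ravel(order="F"), wrong.shape, order="C")
print(fixed)  # [[1 2 3] [4 5 6]]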


def _greedy_path(input_sets, output_set, idx_dict, memory_limit):
@@ -1746,6 +1760,13 @@ def dpnp_dot(a, b, /, out=None, *, conjugate=False):
)

res_usm_type, exec_q = get_usm_allocations([a, b])
if (
out is not None
and dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None
):
raise ExecutionPlacementError(
"Input and output allocation queues are not compatible"
)
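The check added to dpnp_dot (and the analogous ones in dpnp_einsum and dpnp_matmul below) follows dpctl's compute-follows-data rule: get_execution_queue returns a common queue only when all queues are equivalent, otherwise None. A minimal usage sketch, assuming a SYCL device is available and that two independently created queues do not compare equal:

import dpctl
from dpctl.utils import ExecutionPlacementError, get_execution_queue

q1 = dpctl.SyclQueue()
q2 = dpctl.SyclQueue()
if get_execution_queue((q1, q2)) is None:
    raise ExecutionPlacementError(
        "Input and output allocation queues are not compatible"
    )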

# Determine the appropriate data types
dot_dtype, res_dtype = _compute_res_dtype(a, b, sycl_queue=exec_q)
@@ -1812,6 +1833,12 @@ def dpnp_einsum(
arrays.append(a)

res_usm_type, exec_q = get_usm_allocations(arrays)
if out is not None:
dpnp.check_supported_arrays_type(out)
if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
raise ExecutionPlacementError(
"Input and output allocation queues are not compatible"
)
result_dtype = dpnp.result_type(*arrays) if dtype is None else dtype
for id, a in enumerate(operands):
if dpnp.isscalar(a):
@@ -2056,10 +2083,17 @@ def dpnp_matmul(

"""

x1_ndim = x1.ndim
x2_ndim = x2.ndim
dpnp.check_supported_arrays_type(x1, x2)
res_usm_type, exec_q = get_usm_allocations([x1, x2])
if out is not None:
dpnp.check_supported_arrays_type(out)
if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
raise ExecutionPlacementError(
"Input and output allocation queues are not compatible"
)

x1_ndim = x1.ndim
x2_ndim = x2.ndim
if axes is not None:
axes = _validate_axes(x1, x2, axes)

@@ -2072,7 +2106,6 @@
x2 = dpnp.moveaxis(x2, axes_x2, (-2, -1)) if x2_ndim != 1 else x2
out_orig = out
if out is not None:
dpnp.check_supported_arrays_type(out)
# out that is passed to the backend should have the correct shape
if len(axes_res) == 2:
out = dpnp.moveaxis(out, axes_res, (-2, -1))
@@ -2161,8 +2194,18 @@
res = dpnp_dot(x1, x2, out=out)
res_shape = res.shape
else:
x1_contig_flag, _, x1_f = _define_contig_flag(x1)
x2_contig_flag, _, x2_f = _define_contig_flag(x2)
res_order = "F" if (x1_f and x2_f and call_flag == "gemm") else "C"
res = _create_result_array(
x1, x2, out, res_shape, compute_dtype, res_usm_type, exec_q
x1,
x2,
out,
res_shape,
compute_dtype,
res_usm_type,
exec_q,
res_order,
)
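res_order becomes "F" only when both operands are F-contiguous and the gemm path is taken, so the result can be produced by the column-major routine without extra transposing copies. The underlying layout fact, shown with NumPy: an F-ordered array and the C-ordered view of its transpose share the same memory.

import numpy as np

a_f = np.asfortranarray(np.arange(6).reshape(2, 3))
assert a_f.flags.f_contiguous
assert a_f.T.flags.c_contiguous      # the transpose is a C-ordered view
assert np.shares_memory(a_f, a_f.T)  # no data was copied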

# calculate result
@@ -2175,21 +2218,21 @@
# their base (last 2-dimensions) to be c-contiguous or f-contiguous
dep_events_list = []
host_tasks_list = []
contig_flag = _define_contig_flag(x1)
x1 = _copy_array(
x1,
dep_events_list,
host_tasks_list,
copy_flag=not contig_flag,
copy_flag=not x1_contig_flag,
dtype=compute_dtype,
order=res_order,
)
contig_flag = _define_contig_flag(x2)
x2 = _copy_array(
x2,
dep_events_list,
host_tasks_list,
copy_flag=not contig_flag,
copy_flag=not x2_contig_flag,
dtype=compute_dtype,
order=res_order,
)

if call_flag == "gemv":
6 changes: 6 additions & 0 deletions tests/test_linalg.py
@@ -619,6 +619,12 @@ def test_einsum_error(self):
with pytest.raises(TypeError):
inp.einsum("ii->i", a, copy=False)

a = inp.ones((5, 5))
out = inp.empty((5,), sycl_queue=dpctl.SyclQueue())
# inconsistent sycl_queue
with pytest.raises(ExecutionPlacementError):
inp.einsum("ii->i", a, out=out)

# unknown value for optimize keyword
with pytest.raises(TypeError):
inp.einsum("ii->i", a, optimize="average")
94 changes: 88 additions & 6 deletions tests/test_mathematical.py
@@ -2,6 +2,7 @@
import dpctl.tensor as dpt
import numpy
import pytest
from dpctl.utils import ExecutionPlacementError
from numpy.testing import (
assert_allclose,
assert_almost_equal,
@@ -2975,20 +2976,96 @@ def test_matmul_strided_vec_mat(self, shape, incx, incy, transpose):
assert result is out
assert_dtype_allclose(result, expected)

@pytest.mark.parametrize(
"order_pair",
[
("C", "C", "C"),
("C", "C", "F"),
("C", "F", "C"),
("C", "F", "F"),
("F", "C", "C"),
("F", "C", "F"),
("F", "F", "F"),
("F", "F", "C"),
],
)
@pytest.mark.parametrize(
"dtype", get_all_dtypes(no_none=True, no_bool=True)
)
def test_matmul_out(self, dtype):
a1 = numpy.arange(5 * 4, dtype=dtype).reshape(5, 4)
a2 = numpy.arange(7 * 4, dtype=dtype).reshape(4, 7)
def test_matmul_out1(self, order_pair, dtype):
order1, order2, out_order = order_pair
a1 = numpy.arange(20, dtype=dtype).reshape(5, 4)
a2 = numpy.arange(28, dtype=dtype).reshape(4, 7)

a1 = numpy.asarray(a1, order=order1)
a2 = numpy.asarray(a2, order=order2)

b1 = dpnp.asarray(a1)
b2 = dpnp.asarray(a2)

dpnp_out = dpnp.empty((5, 7), dtype=dtype)
dpnp_out = dpnp.empty((5, 7), dtype=dtype, order=out_order)
result = dpnp.matmul(b1, b2, out=dpnp_out)
assert result is dpnp_out

out = numpy.empty((5, 7), dtype=dtype, order=out_order)
expected = numpy.matmul(a1, a2, out=out)
assert result.flags.c_contiguous == expected.flags.c_contiguous
assert result.flags.f_contiguous == expected.flags.f_contiguous
assert_dtype_allclose(result, expected)

@pytest.mark.parametrize("trans", [True, False])
@pytest.mark.parametrize(
"dtype", get_all_dtypes(no_none=True, no_bool=True)
)
def test_matmul_out2(self, trans, dtype):
a1 = numpy.arange(24, dtype=dtype).reshape(2, 3, 4)
a2 = numpy.arange(40, dtype=dtype).reshape(2, 4, 5)
b1 = dpnp.asarray(a1)
b2 = dpnp.asarray(a2)

if trans:
dpnp_out = dpnp.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
out = numpy.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
else:
dpnp_out = dpnp.empty((2, 3, 5), dtype=dtype)
out = numpy.empty((2, 3, 5), dtype=dtype)

result = dpnp.matmul(b1, b2, out=dpnp_out)
expected = numpy.matmul(a1, a2)
assert result is dpnp_out

expected = numpy.matmul(a1, a2, out=out)
assert result.flags.c_contiguous == expected.flags.c_contiguous
assert result.flags.f_contiguous == expected.flags.f_contiguous
assert_dtype_allclose(result, expected)

@pytest.mark.parametrize("trans", [True, False])
@pytest.mark.parametrize(
"dtype", get_all_dtypes(no_none=True, no_bool=True)
)
def test_matmul_out3(self, trans, dtype):
a1 = numpy.arange(24, dtype=dtype).reshape(2, 4, 3)
a2 = numpy.arange(40, dtype=dtype).reshape(2, 5, 4)
b1 = dpnp.asarray(a1)
b2 = dpnp.asarray(a2)

a1 = numpy.asarray(a1).transpose(0, 2, 1)
a2 = numpy.asarray(a2).transpose(0, 2, 1)
b1 = b1.transpose(0, 2, 1)
b2 = b2.transpose(0, 2, 1)

if trans:
dpnp_out = dpnp.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
out = numpy.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
else:
dpnp_out = dpnp.empty((2, 3, 5), dtype=dtype)
out = numpy.empty((2, 3, 5), dtype=dtype)

result = dpnp.matmul(b1, b2, out=dpnp_out)
assert result is dpnp_out

expected = numpy.matmul(a1, a2, out=out)
assert result.flags.c_contiguous == expected.flags.c_contiguous
assert result.flags.f_contiguous == expected.flags.f_contiguous
assert_dtype_allclose(result, expected)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -3107,10 +3184,15 @@ def test_invalid_dtype(self, dtype):
def test_exe_q(self):
x1 = dpnp.ones((5, 4), sycl_queue=dpctl.SyclQueue())
x2 = dpnp.ones((4, 7), sycl_queue=dpctl.SyclQueue())

with pytest.raises(ValueError):
dpnp.matmul(x1, x2)

x1 = dpnp.ones((5, 4))
x2 = dpnp.ones((4, 7))
out = dpnp.empty((5, 7), sycl_queue=dpctl.SyclQueue())
with pytest.raises(ExecutionPlacementError):
dpnp.matmul(x1, x2, out=out)

def test_matmul_casting(self):
a1 = dpnp.arange(2 * 4, dtype=dpnp.float32).reshape(2, 4)
a2 = dpnp.arange(4 * 3).reshape(4, 3)