diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py
index 1af952388a6..f674c96040a 100644
--- a/dpnp/dpnp_iface_linearalgebra.py
+++ b/dpnp/dpnp_iface_linearalgebra.py
@@ -821,7 +821,6 @@ def matmul(
 
     """
 
-    dpnp.check_supported_arrays_type(x1, x2)
     if subok is False:
         raise NotImplementedError(
             "subok keyword argument is only supported by its default value."
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 0b9686771c3..c98acc2c81e 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -33,6 +33,7 @@
 import dpctl.tensor._tensor_elementwise_impl as tei
 import dpctl.tensor._tensor_impl as ti
 import numpy
+from dpctl.utils import ExecutionPlacementError
 from numpy.core.numeric import normalize_axis_tuple
 
 import dpnp
@@ -218,7 +219,9 @@ def _compute_size(start, shape):
     return ret
 
 
-def _copy_array(x, dep_events, host_events, copy_flag=False, dtype=None):
+def _copy_array(
+    x, dep_events, host_events, copy_flag=False, dtype=None, order="C"
+):
     """
     Creating a copy of input array if needed.
 
@@ -236,7 +239,7 @@ def _copy_array(x, dep_events, host_events, copy_flag=False, dtype=None):
         copy = x.dtype != dtype if dtype is not None else False
 
     if copy:
-        x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
+        x_copy = dpnp.empty_like(x, dtype=dtype, order=order)
         ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=dpnp.get_usm_ndarray(x),
             dst=x_copy.get_array(),
@@ -248,7 +251,9 @@ def _copy_array(x, dep_events, host_events, copy_flag=False, dtype=None):
     return x
 
 
-def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
+def _create_result_array(
+    x1, x2, out, shape, dtype, usm_type, sycl_queue, order="C"
+):
     """
     Create the result array.
 
@@ -263,13 +268,12 @@ def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
         x1_usm = dpnp.get_usm_ndarray(x1)
         x2_usm = dpnp.get_usm_ndarray(x2)
         out_usm = dpnp.get_usm_ndarray(out)
-        contig_flag = _define_contig_flag(out)
+        contig_flag, _, _ = _define_contig_flag(out)
 
         if (
             out.dtype == dtype
             and out.shape == shape
             and out.usm_type == usm_type
-            and out.sycl_queue == sycl_queue
             and contig_flag
             and not ti._array_overlap(x1_usm, out_usm)
             and not ti._array_overlap(x2_usm, out_usm)
@@ -279,6 +283,7 @@ def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
     return dpnp.empty(
         shape,
         dtype=dtype,
+        order=order,
         usm_type=usm_type,
         sycl_queue=sycl_queue,
     )
@@ -295,14 +300,14 @@ def _define_contig_flag(x):
     x_strides = x.strides
     x_shape = x.shape
     if x.ndim < 2:
-        return True
+        return True, True, True
 
     x_strides = _standardize_strides_to_nonzero(x_strides, x_shape)
     x_is_c_contiguous = x_strides[-1] == 1 and x_strides[-2] == x_shape[-1]
    x_is_f_contiguous = x_strides[-2] == 1 and x_strides[-1] == x_shape[-2]
     if x_is_c_contiguous or x_is_f_contiguous:
         flag = True
-    return flag
+    return flag, x_is_c_contiguous, x_is_f_contiguous
 
 
 def _define_dim_flags(x, pos):
@@ -746,17 +751,26 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, dev_tasks_list):
         )
         ht_tasks_list.append(ht_blas_ev)
     dpctl.SyclEvent.wait_for(ht_tasks_list)
+
     res_shape = res.shape
-    if not row_major:
-        res = dpnp.reshape(
-            res.ravel(), (batch_size, res_shape[2], res_shape[1])
-        ).transpose(0, 2, 1)
+    _, res_is_c_contig, res_is_f_contig = _define_contig_flag(res)
+    if row_major:
+        if res_is_f_contig:
+            res = dpnp.reshape(
+                dpnp.ravel(res, order="F"),
+                (res_shape[1], res_shape[2], batch_size),
+            ).transpose(2, 0, 1)
+    else:
+        if res_is_c_contig:
+            res = dpnp.reshape(
+                dpnp.ravel(res, order="C"),
+                (batch_size, res_shape[2], res_shape[1]),
+            ).transpose(0, 2, 1)
 
     if res_shape != orig_shape:
         res = res.reshape(orig_shape)
 
-    res = dpnp.ascontiguousarray(res)
-    return res
+    return dpnp.ascontiguousarray(res)
 
 
 def _gemm_matmul(exec_q, x1, x2, res, dev_tasks_list):
@@ -769,14 +783,16 @@ def _gemm_matmul(exec_q, x1, x2, res, dev_tasks_list):
     )
     ht_blas_ev.wait()
 
-    if not row_major:
-        # TODO: investigate the possibility of defining result
-        # array with "F" order for this case
-        res = dpnp.ascontiguousarray(
-            dpnp.reshape(res.ravel(), res.shape, order="F")
-        )
+    if row_major:
+        if res.flags.f_contiguous is True:
+            # read data in "F" order and write it in "C" order
+            res = dpnp.reshape(dpnp.ravel(res, order="F"), res.shape, order="C")
+    else:
+        if res.flags.c_contiguous is True:
+            # read data in "C" order and write it in "F" order
+            res = dpnp.reshape(dpnp.ravel(res, order="C"), res.shape, order="F")
 
-    return res
+    return dpnp.ascontiguousarray(res)
 
 
 def _greedy_path(input_sets, output_set, idx_dict, memory_limit):
@@ -1746,6 +1762,13 @@ def dpnp_dot(a, b, /, out=None, *, conjugate=False):
         )
 
     res_usm_type, exec_q = get_usm_allocations([a, b])
+    if (
+        out is not None
+        and dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None
+    ):
+        raise ExecutionPlacementError(
+            "Input and output allocation queues are not compatible"
+        )
 
     # Determine the appropriate data types
     dot_dtype, res_dtype = _compute_res_dtype(a, b, sycl_queue=exec_q)
@@ -1812,6 +1835,12 @@ def dpnp_einsum(
             arrays.append(a)
 
     res_usm_type, exec_q = get_usm_allocations(arrays)
+    if out is not None:
+        dpnp.check_supported_arrays_type(out)
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
     result_dtype = dpnp.result_type(*arrays) if dtype is None else dtype
     for id, a in enumerate(operands):
         if dpnp.isscalar(a):
@@ -2056,10 +2085,17 @@ def dpnp_matmul(
 
     """
 
-    x1_ndim = x1.ndim
-    x2_ndim = x2.ndim
+    dpnp.check_supported_arrays_type(x1, x2)
     res_usm_type, exec_q = get_usm_allocations([x1, x2])
+    if out is not None:
+        dpnp.check_supported_arrays_type(out)
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
 
+    x1_ndim = x1.ndim
+    x2_ndim = x2.ndim
     if axes is not None:
         axes = _validate_axes(x1, x2, axes)
 
@@ -2072,7 +2108,6 @@ def dpnp_matmul(
         x2 = dpnp.moveaxis(x2, axes_x2, (-2, -1)) if x2_ndim != 1 else x2
         out_orig = out
         if out is not None:
-            dpnp.check_supported_arrays_type(out)
             # out that is passed to the backend should have the correct shape
             if len(axes_res) == 2:
                 out = dpnp.moveaxis(out, axes_res, (-2, -1))
@@ -2161,8 +2196,18 @@ def dpnp_matmul(
             res = dpnp_dot(x1, x2, out=out)
             res_shape = res.shape
         else:
+            x1_contig_flag, _, x1_f = _define_contig_flag(x1)
+            x2_contig_flag, _, x2_f = _define_contig_flag(x2)
+            res_order = "F" if (x1_f and x2_f and call_flag == "gemm") else "C"
             res = _create_result_array(
-                x1, x2, out, res_shape, compute_dtype, res_usm_type, exec_q
+                x1,
+                x2,
+                out,
+                res_shape,
+                compute_dtype,
+                res_usm_type,
+                exec_q,
+                res_order,
             )
 
             # calculate result
@@ -2175,21 +2220,21 @@ def dpnp_matmul(
             # their base (last 2-dimensions) to be c-contiguous or f-contiguous
            dep_events_list = []
             host_tasks_list = []
-            contig_flag = _define_contig_flag(x1)
             x1 = _copy_array(
                 x1,
                 dep_events_list,
                 host_tasks_list,
-                copy_flag=not contig_flag,
+                copy_flag=not x1_contig_flag,
                 dtype=compute_dtype,
+                order=res_order,
             )
-            contig_flag = _define_contig_flag(x2)
             x2 = _copy_array(
                 x2,
                 dep_events_list,
                 host_tasks_list,
-                copy_flag=not contig_flag,
+                copy_flag=not x2_contig_flag,
                 dtype=compute_dtype,
+                order=res_order,
             )
 
             if call_flag == "gemv":
diff --git a/tests/test_linalg.py b/tests/test_linalg.py
index ec2a085d4d3..48a4891034c 100644
--- a/tests/test_linalg.py
+++ b/tests/test_linalg.py
@@ -613,12 +613,28 @@ def test_einsum_trivial_cases(self):
         expected = numpy.einsum("i,i,i", b_np, b_np, b_np, optimize="greedy")
         assert_dtype_allclose(result, expected)
 
+    def test_einsum_out(self):
+        a = inp.ones((5, 5))
+        a_np = a.asnumpy()
+        out = inp.empty((5,))
+        out_np = out.asnumpy()
+        result = inp.einsum("ii->i", a, out=out)
+        assert result is out
+        expected = numpy.einsum("ii->i", a_np, out=out_np)
+        assert_dtype_allclose(result, expected)
+
     def test_einsum_error(self):
         a = inp.ones((5, 5))
         # unknown keyword argument
         with pytest.raises(TypeError):
             inp.einsum("ii->i", a, copy=False)
 
+        a = inp.ones((5, 5))
+        out = inp.empty((5,), sycl_queue=dpctl.SyclQueue())
+        # inconsistent sycl_queue
+        with pytest.raises(ExecutionPlacementError):
+            inp.einsum("ii->i", a, out=out)
+
         # unknown value for optimize keyword
         with pytest.raises(TypeError):
             inp.einsum("ii->i", a, optimize="average")
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 4a1ee63c8fc..6cf52e91deb 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -2,6 +2,7 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
+from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_almost_equal,
@@ -2975,20 +2976,99 @@ def test_matmul_strided_vec_mat(self, shape, incx, incy, transpose):
         assert result is out
         assert_dtype_allclose(result, expected)
 
+    @pytest.mark.parametrize(
+        "order1, order2, out_order",
+        [
+            ("C", "C", "C"),
+            ("C", "C", "F"),
+            ("C", "F", "C"),
+            ("C", "F", "F"),
+            ("F", "C", "C"),
+            ("F", "C", "F"),
+            ("F", "F", "F"),
+            ("F", "F", "C"),
+        ],
+    )
     @pytest.mark.parametrize(
         "dtype", get_all_dtypes(no_none=True, no_bool=True)
     )
-    def test_matmul_out(self, dtype):
-        a1 = numpy.arange(5 * 4, dtype=dtype).reshape(5, 4)
-        a2 = numpy.arange(7 * 4, dtype=dtype).reshape(4, 7)
+    def test_matmul_out1(self, order1, order2, out_order, dtype):
+        # test gemm with out keyword
+        a1 = numpy.arange(20, dtype=dtype).reshape(5, 4, order=order1)
+        a2 = numpy.arange(28, dtype=dtype).reshape(4, 7, order=order2)
         b1 = dpnp.asarray(a1)
         b2 = dpnp.asarray(a2)
 
-        dpnp_out = dpnp.empty((5, 7), dtype=dtype)
+        dpnp_out = dpnp.empty((5, 7), dtype=dtype, order=out_order)
         result = dpnp.matmul(b1, b2, out=dpnp_out)
-        expected = numpy.matmul(a1, a2)
         assert result is dpnp_out
+
+        out = numpy.empty((5, 7), dtype=dtype, order=out_order)
+        expected = numpy.matmul(a1, a2, out=out)
+        assert result.flags.c_contiguous == expected.flags.c_contiguous
+        assert result.flags.f_contiguous == expected.flags.f_contiguous
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("trans", [True, False])
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_matmul_out2(self, trans, dtype):
+        # test gemm_batch with out keyword
+        # the base of input arrays is c-contiguous
+        # the base of output array is c-contiguous or f-contiguous
+        a1 = numpy.arange(24, dtype=dtype).reshape(2, 3, 4)
+        a2 = numpy.arange(40, dtype=dtype).reshape(2, 4, 5)
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        if trans:
+            dpnp_out = dpnp.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
+            out = numpy.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
+        else:
+            dpnp_out = dpnp.empty((2, 3, 5), dtype=dtype)
+            out = numpy.empty((2, 3, 5), dtype=dtype)
+
+        result = dpnp.matmul(b1, b2, out=dpnp_out)
+        assert result is dpnp_out
+
+        expected = numpy.matmul(a1, a2, out=out)
+        assert result.flags.c_contiguous == expected.flags.c_contiguous
+        assert result.flags.f_contiguous == expected.flags.f_contiguous
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("trans", [True, False])
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_matmul_out3(self, trans, dtype):
+        # test gemm_batch with out keyword
+        # the base of input arrays is f-contiguous
+        # the base of output array is c-contiguous or f-contiguous
+        a1 = numpy.arange(24, dtype=dtype).reshape(2, 4, 3)
+        a2 = numpy.arange(40, dtype=dtype).reshape(2, 5, 4)
+        b1 = dpnp.asarray(a1)
+        b2 = dpnp.asarray(a2)
+
+        a1 = numpy.asarray(a1).transpose(0, 2, 1)
+        a2 = numpy.asarray(a2).transpose(0, 2, 1)
+        b1 = b1.transpose(0, 2, 1)
+        b2 = b2.transpose(0, 2, 1)
+
+        if trans:
+            dpnp_out = dpnp.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
+            out = numpy.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1)
+        else:
+            dpnp_out = dpnp.empty((2, 3, 5), dtype=dtype)
+            out = numpy.empty((2, 3, 5), dtype=dtype)
+
+        result = dpnp.matmul(b1, b2, out=dpnp_out)
+        assert result is dpnp_out
+
+        expected = numpy.matmul(a1, a2, out=out)
+        assert result.flags.c_contiguous == expected.flags.c_contiguous
+        assert result.flags.f_contiguous == expected.flags.f_contiguous
         assert_dtype_allclose(result, expected)
 
     @pytest.mark.parametrize(
@@ -3000,6 +3080,9 @@ def test_matmul_out(self, dtype):
         ],
     )
     def test_matmul_out_0D(self, out_shape):
+        # for matmul of 0-D arrays with out keyword,
+        # NumPy repeats the data to match the shape
+        # of output array
         a = numpy.arange(3)
         b = dpnp.asarray(a)
 
@@ -3107,10 +3190,15 @@ def test_invalid_dtype(self, dtype):
     def test_exe_q(self):
         x1 = dpnp.ones((5, 4), sycl_queue=dpctl.SyclQueue())
         x2 = dpnp.ones((4, 7), sycl_queue=dpctl.SyclQueue())
-
         with pytest.raises(ValueError):
             dpnp.matmul(x1, x2)
 
+        x1 = dpnp.ones((5, 4))
+        x2 = dpnp.ones((4, 7))
+        out = dpnp.empty((5, 7), sycl_queue=dpctl.SyclQueue())
+        with pytest.raises(ExecutionPlacementError):
+            dpnp.matmul(x1, x2, out=out)
+
     def test_matmul_casting(self):
         a1 = dpnp.arange(2 * 4, dtype=dpnp.float32).reshape(2, 4)
         a2 = dpnp.arange(4 * 3).reshape(4, 3)
diff --git a/tests/test_product.py b/tests/test_product.py
index d9463a1546c..a15e82f6d90 100644
--- a/tests/test_product.py
+++ b/tests/test_product.py
@@ -1,6 +1,7 @@
 import dpctl
 import numpy
 import pytest
+from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises
 
 import dpnp
@@ -473,6 +474,12 @@ def test_dot_sycl_queue_error(self):
         with pytest.raises(ValueError):
             dpnp.dot(a, b)
 
+        a = dpnp.ones((5,))
+        b = dpnp.ones((5,))
+        out = dpnp.empty((), sycl_queue=dpctl.SyclQueue())
+        with pytest.raises(ExecutionPlacementError):
+            dpnp.dot(a, b, out=out)
+
     @pytest.mark.parametrize("ia", [1, dpnp.ones((), dtype=dpnp.float32)])
     def test_dot_out_error_scalar(self, ia):
         a = ia if dpnp.isscalar(ia) else ia.asnumpy()
@@ -487,6 +494,7 @@ def test_dot_out_error_scalar(self, ia):
 
         # output shape is incorrect
         dp_out = dpnp.empty((2,), dtype=dpnp.int32)
+        out = numpy.empty((2,), dtype=numpy.int32)
         assert_raises(ValueError, dpnp.dot, ia, ib, out=dp_out)
         assert_raises(ValueError, numpy.dot, a, b, out=out)