diff --git a/catch/hipTestMain/config/config_amd_linux_common.json b/catch/hipTestMain/config/config_amd_linux_common.json
index b81d7a3d2..2ba59aac8 100644
--- a/catch/hipTestMain/config/config_amd_linux_common.json
+++ b/catch/hipTestMain/config/config_amd_linux_common.json
@@ -110,6 +110,11 @@
     "Unit_ChannelDescriptor_Positive_Basic_2D - long2",
     "Unit_ChannelDescriptor_Positive_Basic_4D - ulong4",
     "Unit_ChannelDescriptor_Positive_Basic_4D - long4",
-    "Unit_hipExtModuleLaunchKernel_NonUniformWorkGroup"
+    "Unit_hipExtModuleLaunchKernel_NonUniformWorkGroup",
+    "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/18 ===",
+    "Unit_hipMemcpyAsync_Negative_Parameters",
+    "Unit_hipMemcpyDtoHAsync_Negative_Parameters",
+    "Unit_hipMemcpyHtoDAsync_Negative_Parameters",
+    "Unit_hipMemcpyDtoDAsync_Negative_Parameters"
   ]
 }
diff --git a/catch/hipTestMain/config/config_amd_windows_common.json b/catch/hipTestMain/config/config_amd_windows_common.json
index db470678d..80401608e 100644
--- a/catch/hipTestMain/config/config_amd_windows_common.json
+++ b/catch/hipTestMain/config/config_amd_windows_common.json
@@ -200,6 +200,11 @@
     "Unit_ChannelDescriptor_Positive_Basic_3D - long3",
     "Unit_ChannelDescriptor_Positive_Basic_4D - ulong4",
     "Unit_ChannelDescriptor_Positive_Basic_4D - long4",
-    "Unit_hipExtModuleLaunchKernel_NonUniformWorkGroup"
+    "Unit_hipExtModuleLaunchKernel_NonUniformWorkGroup",
+    "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/18 ===",
+    "Unit_hipMemcpyAsync_Negative_Parameters",
+    "Unit_hipMemcpyDtoHAsync_Negative_Parameters",
+    "Unit_hipMemcpyHtoDAsync_Negative_Parameters",
+    "Unit_hipMemcpyDtoDAsync_Negative_Parameters"
   ]
 }
diff --git a/catch/unit/memory/CMakeLists.txt b/catch/unit/memory/CMakeLists.txt
index f09f0579f..ff9eaa4f7 100644
--- a/catch/unit/memory/CMakeLists.txt
+++ b/catch/unit/memory/CMakeLists.txt
@@ -83,6 +83,8 @@ set(TEST_SRC
   hipMemcpy_old.cc
   hipMemcpy_derivatives.cc
   hipMemcpyAsync.cc
+  hipMemcpyAsync_old.cc
+  hipMemcpyAsync_derivatives.cc
   hipMemsetFunctional.cc
   hipMalloc.cc
   hipMallocPitch.cc
diff --git a/catch/unit/memory/hipMemcpyAsync.cc b/catch/unit/memory/hipMemcpyAsync.cc
index 4c4e08ec2..cd8835640 100644
--- a/catch/unit/memory/hipMemcpyAsync.cc
+++ b/catch/unit/memory/hipMemcpyAsync.cc
@@ -1,13 +1,15 @@
 /*
-Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -17,392 +19,145 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-/*
-This testcase verifies the following scenarios
-1. hipMemcpyAsync with kernel launch
-2. 
H2D-D2D-D2H-H2PinnMem and device context change scenarios -3. This test launches multiple threads which uses same stream to deploy kernel - and also launch hipMemcpyAsync() api. This test case is simulate the scenario - reported in SWDEV-181598. -*/ - #include -#include -#include -#include - -#define NUM_THREADS 16 - -static constexpr auto NUM_ELM{1024 * 1024}; - - - -static constexpr size_t N_ELMTS{32 * 1024}; -std::atomic Thread_count { 0 }; -static unsigned blocksPerCU{6}; // to hide latency -static unsigned threadsPerBlock{256}; - -template -void Thread_func(T *A_d, T *B_d, T* C_d, T* C_h, size_t Nbytes, - hipStream_t mystream) { - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, - threadsPerBlock, N_ELMTS); - hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), - dim3(threadsPerBlock), 0, - mystream, A_d, C_d, N_ELMTS); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); - // The following two MemcpyAsync calls are for sole - // purpose of loading stream with multiple async calls - HIP_CHECK(hipMemcpyAsync(B_d, A_d, Nbytes, - hipMemcpyDeviceToDevice, mystream)); - HIP_CHECK(hipMemcpyAsync(B_d, A_d, Nbytes, - hipMemcpyDeviceToDevice, mystream)); - Thread_count++; +#include +#include +#include +#include + +TEST_CASE("Unit_hipMemcpyAsync_Positive_Basic") { + using namespace std::placeholders; + const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created); + const StreamGuard stream_guard(stream_type); + const hipStream_t stream = stream_guard.stream(); + + MemcpyWithDirectionCommonTests(std::bind(hipMemcpyAsync, _1, _2, _3, _4, stream)); } -template -void Thread_func_MultiStream() { - int Data_mismatch = 0; - T *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; - T *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; - size_t Nbytes = N_ELMTS * sizeof(T); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, - threadsPerBlock, N_ELMTS); - HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N_ELMTS, false); - hipStream_t mystream; - HIP_CHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream)); - hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), - dim3(threadsPerBlock), 0, - mystream, A_d, C_d, N_ELMTS); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); - // The following hipMemcpyAsync() is called only to - // load stream with multiple Async calls - HIP_CHECK(hipMemcpyAsync(B_d, A_d, Nbytes, - hipMemcpyDeviceToDevice, mystream)); - Thread_count++; - - HIP_CHECK(hipStreamSynchronize(mystream)); - HIP_CHECK(hipStreamDestroy(mystream)); - // Verifying result of the kernel computation - for (size_t i = 0; i < N_ELMTS; i++) { - if (C_h[i] != A_h[i] * A_h[i]) { - Data_mismatch++; - } +TEST_CASE("Unit_hipMemcpyAsync_Positive_Synchronization_Behavior") { + using namespace std::placeholders; + HIP_CHECK(hipDeviceSynchronize()); + + SECTION("Host memory to device memory") { + // This behavior differs on NVIDIA and AMD, on AMD the hipMemcpy calls is synchronous with + // respect to the host +#if HT_AMD + HipTest::HIP_SKIP_TEST( + "EXSWCPHIPT-127 - MemcpyAsync from host to device memory behavior differs on AMD and " + "Nvidia"); + return; +#endif + MemcpyHtoDSyncBehavior(std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyHostToDevice, nullptr), + false); } - // Releasing resources - HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); - 
REQUIRE(Data_mismatch == 0); -} -/* -This testcase verifies hipMemcpyAsync API -Initializes device variables -Launches kernel and performs the sum of device variables -copies the result to host variable and validates the result. -*/ -TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_KernelLaunch", "", int, float, - double) { - size_t Nbytes = NUM_ELM * sizeof(TestType); - - TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; - TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; - HIP_CHECK(hipSetDevice(0)); - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, false); - - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - - hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0, - static_cast(A_d), - static_cast(B_d), C_d, NUM_ELM); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - HIP_CHECK(hipStreamDestroy(stream)); - - HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM); - - HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); -} -/* -This testcase verifies the following scenarios -1. H2H,H2PinMem and PinnedMem2Host -2. H2D-D2D-D2H in same GPU -3. Pinned Host Memory to device variables in same GPU -4. Device context change -5. H2D-D2D-D2H peer GPU -*/ -TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_H2H-H2D-D2H-H2PinMem", "", char, int, - float, double) { - TestType *A_d{nullptr}, *B_d{nullptr}; - TestType *A_h{nullptr}, *B_h{nullptr}; - TestType *A_Ph{nullptr}, *B_Ph{nullptr}; - HIP_CHECK(hipSetDevice(0)); - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - HipTest::initArrays(&A_d, &B_d, nullptr, - &A_h, &B_h, nullptr, - NUM_ELM*sizeof(TestType)); - HipTest::initArrays(nullptr, nullptr, nullptr, - &A_Ph, &B_Ph, nullptr, - NUM_ELM*sizeof(TestType), true); - - SECTION("H2H, H2PinMem and PinMem2H") { - HIP_CHECK(hipMemcpyAsync(B_h, A_h, NUM_ELM*sizeof(TestType), - hipMemcpyHostToHost, stream)); - HIP_CHECK(hipMemcpyAsync(A_Ph, B_h, NUM_ELM*sizeof(TestType), - hipMemcpyHostToHost, stream)); - HIP_CHECK(hipMemcpyAsync(B_Ph, A_Ph, NUM_ELM*sizeof(TestType), - hipMemcpyHostToHost, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - HipTest::checkTest(A_h, B_Ph, NUM_ELM); + SECTION("Device memory to pageable host memory") { + MemcpyDtoHPageableSyncBehavior( + std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyDeviceToHost, nullptr), true); } - SECTION("H2D-D2D-D2H-SameGPU") { - HIP_CHECK(hipMemcpyAsync(A_d, A_h, NUM_ELM*sizeof(TestType), - hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_d, A_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_h, B_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToHost, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - HipTest::checkTest(A_h, B_h, NUM_ELM); + SECTION("Device memory to pinned host memory") { + MemcpyDtoHPinnedSyncBehavior( + std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyDeviceToHost, nullptr), false); } - SECTION("pH2D-D2D-D2pH-SameGPU") { - HIP_CHECK(hipMemcpyAsync(A_d, A_Ph, NUM_ELM*sizeof(TestType), - hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_d, A_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_Ph, B_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToHost, stream)); - 
HIP_CHECK(hipStreamSynchronize(stream)); - HipTest::checkTest(A_Ph, B_Ph, NUM_ELM); + SECTION("Device memory to device memory") { + MemcpyDtoDSyncBehavior(std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyDeviceToDevice, nullptr), + false); } - SECTION("H2D-D2D-D2H-DeviceContextChange") { - int deviceCount = 0; - HIP_CHECK(hipGetDeviceCount(&deviceCount)); - if (deviceCount < 2) { - SUCCEED("deviceCount less then 2"); - } else { - int canAccessPeer = 0; - HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1)); - if (canAccessPeer) { - HIP_CHECK(hipSetDevice(1)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, NUM_ELM*sizeof(TestType), - hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_d, A_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_h, B_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToHost, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - HipTest::checkTest(A_h, B_h, NUM_ELM); - } else { - SUCCEED("P2P capability is not present"); - } - } + SECTION("Host memory to host memory") { + MemcpyHtoHSyncBehavior(std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyHostToHost, nullptr), + true); } - - SECTION("H2D-D2D-D2H-PeerGPU") { - int deviceCount = 0; - HIP_CHECK(hipGetDeviceCount(&deviceCount)); - if (deviceCount < 2) { - SUCCEED("deviceCount less then 2"); - } else { - int canAccessPeer = 0; - HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1)); - if (canAccessPeer) { - HIP_CHECK(hipSetDevice(1)); - TestType *C_d{nullptr}; - HipTest::initArrays(nullptr, nullptr, &C_d, - nullptr, nullptr, nullptr, - NUM_ELM*sizeof(TestType)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, NUM_ELM*sizeof(TestType), - hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(C_d, A_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(B_h, C_d, NUM_ELM*sizeof(TestType), - hipMemcpyDeviceToHost, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - HipTest::checkTest(A_h, B_h, NUM_ELM); - HIP_CHECK(hipFree(C_d)); - - } else { - SUCCEED("P2P capability is not present"); - } - } - } - - HIP_CHECK(hipStreamDestroy(stream)); - - HipTest::freeArrays(A_d, B_d, nullptr, A_h, B_h, nullptr, false); - HipTest::freeArrays(nullptr, nullptr, nullptr, A_Ph, - B_Ph, nullptr, true); } -// This test launches multiple threads which uses same stream to deploy kernel -// and also launch hipMemcpyAsync() api. 
This test case is simulate the scenario -// reported in SWDEV-181598 - -TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_hipMultiMemcpyMultiThread", "", - int, float, double) { - size_t Nbytes = N_ELMTS * sizeof(TestType); +TEST_CASE("Unit_hipMemcpyAsync_Negative_Parameters") { + using namespace std::placeholders; + constexpr auto InvalidStream = [] { + StreamGuard sg(Streams::created); + return sg.stream(); + }; - int Data_mismatch = 0; - hipStream_t mystream; - TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; - TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; + SECTION("Host to device") { + LinearAllocGuard device_alloc(LinearAllocs::hipMalloc, kPageSize); + LinearAllocGuard host_alloc(LinearAllocs::hipHostMalloc, kPageSize); - HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N_ELMTS, false); + MemcpyCommonNegativeTests(std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyHostToDevice, nullptr), + device_alloc.ptr(), host_alloc.ptr(), kPageSize); - HIP_CHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream)); + SECTION("Invalid MemcpyKind") { + HIP_CHECK_ERROR(hipMemcpyAsync(device_alloc.ptr(), host_alloc.ptr(), kPageSize, + static_cast(-1), nullptr), + hipErrorInvalidMemcpyDirection); + } - std::thread T[NUM_THREADS]; - for (int i = 0; i < NUM_THREADS; i++) { - T[i] = std::thread(Thread_func, A_d, B_d, C_d, - C_h, Nbytes, mystream); + SECTION("Invalid stream") { + HIP_CHECK_ERROR(hipMemcpyAsync(device_alloc.ptr(), host_alloc.ptr(), kPageSize, + hipMemcpyHostToDevice, InvalidStream()), + hipErrorContextIsDestroyed); + } } - // Wait until all the threads finish their execution - for (int i = 0; i < NUM_THREADS; i++) { - T[i].join(); - } + SECTION("Device to host") { + LinearAllocGuard device_alloc(LinearAllocs::hipMalloc, kPageSize); + LinearAllocGuard host_alloc(LinearAllocs::hipHostMalloc, kPageSize); - HIP_CHECK(hipStreamSynchronize(mystream)); - HIP_CHECK(hipStreamDestroy(mystream)); + MemcpyCommonNegativeTests(std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyDeviceToHost, nullptr), + host_alloc.ptr(), device_alloc.ptr(), kPageSize); - // Verifying the result of the kernel computation - for (size_t i = 0; i < N_ELMTS; i++) { - if (C_h[i] != A_h[i] * A_h[i]) { - Data_mismatch++; + SECTION("Invalid MemcpyKind") { + HIP_CHECK_ERROR(hipMemcpyAsync(host_alloc.ptr(), device_alloc.ptr(), kPageSize, + static_cast(-1), nullptr), + hipErrorInvalidMemcpyDirection); } - } - REQUIRE(Thread_count.load() == NUM_THREADS); - REQUIRE(Data_mismatch == 0); - HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); - Thread_count.exchange(0); -} - -TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_hipMultiMemcpyMultiThreadMultiStream", - "", int, float, double) { - std::thread T[NUM_THREADS]; - for (int i = 0; i < NUM_THREADS; i++) { - T[i] = std::thread(Thread_func_MultiStream); - } - // Wait until all the threads finish their execution - for (int i = 0; i < NUM_THREADS; i++) { - T[i].join(); + SECTION("Invalid stream") { + HIP_CHECK_ERROR(hipMemcpyAsync(host_alloc.ptr(), device_alloc.ptr(), kPageSize, + hipMemcpyDeviceToHost, InvalidStream()), + hipErrorContextIsDestroyed); + } } - REQUIRE(Thread_count.load() == NUM_THREADS); - Thread_count.exchange(0); -} + SECTION("Host to host") { + LinearAllocGuard src_alloc(LinearAllocs::hipHostMalloc, kPageSize); + LinearAllocGuard dst_alloc(LinearAllocs::hipHostMalloc, kPageSize); -/* -This testcase verifies hipMemcpy API with pinnedMemory and hostRegister -along with kernel launches 
-*/ - -TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_PinnedRegMemWithKernelLaunch", - "", int, float, double) { - int numDevices = 0; - HIP_CHECK(hipGetDeviceCount(&numDevices)); - if (numDevices < 2) { - SUCCEED("No of devices are less than 2"); - } else { - // 1 refers to pinned Memory - // 2 refers to register Memory - int MallocPinType = GENERATE(0, 1); - size_t Nbytes = NUM_ELM * sizeof(TestType); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, - threadsPerBlock, NUM_ELM); + MemcpyCommonNegativeTests(std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyHostToHost, nullptr), + dst_alloc.ptr(), src_alloc.ptr(), kPageSize); - TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; - TestType *X_d{nullptr}, *Y_d{nullptr}, *Z_d{nullptr}; - TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; - if (MallocPinType) { - HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, true); - } else { - A_h = reinterpret_cast(malloc(Nbytes)); - HIP_CHECK(hipHostRegister(A_h, Nbytes, hipHostRegisterDefault)); - B_h = reinterpret_cast(malloc(Nbytes)); - HIP_CHECK(hipHostRegister(B_h, Nbytes, hipHostRegisterDefault)); - C_h = reinterpret_cast(malloc(Nbytes)); - HIP_CHECK(hipHostRegister(C_h, Nbytes, hipHostRegisterDefault)); - HipTest::initArrays(&A_d, &B_d, &C_d, nullptr, nullptr, - nullptr, NUM_ELM, false); - HipTest::setDefaultData(NUM_ELM, A_h, B_h, C_h); + SECTION("Invalid MemcpyKind") { + HIP_CHECK_ERROR(hipMemcpyAsync(dst_alloc.ptr(), src_alloc.ptr(), kPageSize, + static_cast(-1), nullptr), + hipErrorInvalidMemcpyDirection); } - HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - - hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), - 0, 0, static_cast(A_d), - static_cast(B_d), C_d, NUM_ELM); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); - HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM); - unsigned int seed = time(0); - HIP_CHECK(hipSetDevice(HipTest::RAND_R(&seed) % (numDevices-1)+1)); - - int device; - HIP_CHECK(hipGetDevice(&device)); - INFO("hipMemcpy is set to happen between device 0 and device " - << device); - HipTest::initArrays(&X_d, &Y_d, &Z_d, nullptr, - nullptr, nullptr, NUM_ELM, false); - - hipStream_t gpu1Stream; - HIP_CHECK(hipStreamCreate(&gpu1Stream)); - - for (int j = 0; j < NUM_ELM; j++) { - A_h[j] = 0; - B_h[j] = 0; - C_h[j] = 0; + SECTION("Invalid stream") { + HIP_CHECK_ERROR(hipMemcpyAsync(dst_alloc.ptr(), src_alloc.ptr(), kPageSize, + hipMemcpyHostToHost, InvalidStream()), + hipErrorContextIsDestroyed); } + } - HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - HIP_CHECK(hipMemcpyAsync(X_d, A_h, Nbytes, hipMemcpyHostToDevice, gpu1Stream)); - HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost)); - HIP_CHECK(hipMemcpyAsync(Y_d, B_h, Nbytes, hipMemcpyHostToDevice, gpu1Stream)); + SECTION("Device to device") { + LinearAllocGuard src_alloc(LinearAllocs::hipMalloc, kPageSize); + LinearAllocGuard dst_alloc(LinearAllocs::hipMalloc, kPageSize); - hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), - 0, 0, static_cast(X_d), - static_cast(Y_d), Z_d, NUM_ELM); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpyAsync(C_h, Z_d, Nbytes, - hipMemcpyDeviceToHost, gpu1Stream)); - HIP_CHECK(hipStreamSynchronize(gpu1Stream)); + MemcpyCommonNegativeTests( + std::bind(hipMemcpyAsync, _1, _2, _3, hipMemcpyDeviceToDevice, nullptr), dst_alloc.ptr(), + src_alloc.ptr(), kPageSize); - 
HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM); + SECTION("Invalid MemcpyKind") { + HIP_CHECK_ERROR(hipMemcpyAsync(src_alloc.ptr(), dst_alloc.ptr(), kPageSize, + static_cast(-1), nullptr), + hipErrorInvalidMemcpyDirection); + } - if (MallocPinType) { - HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); - } else { - HIP_CHECK(hipHostUnregister(A_h)); - free(A_h); - HIP_CHECK(hipHostUnregister(B_h)); - free(B_h); - HIP_CHECK(hipHostUnregister(C_h)); - free(C_h); - HipTest::freeArrays(A_d, B_d, C_d, nullptr, - nullptr, nullptr, false); + SECTION("Invalid stream") { + HIP_CHECK_ERROR(hipMemcpyAsync(dst_alloc.ptr(), src_alloc.ptr(), kPageSize, + hipMemcpyDeviceToDevice, InvalidStream()), + hipErrorContextIsDestroyed); } - HipTest::freeArrays(X_d, Y_d, Z_d, nullptr, - nullptr, nullptr, false); - HIP_CHECK(hipStreamDestroy(gpu1Stream)); } } - diff --git a/catch/unit/memory/hipMemcpyAsync_derivatives.cc b/catch/unit/memory/hipMemcpyAsync_derivatives.cc new file mode 100644 index 000000000..251c630d5 --- /dev/null +++ b/catch/unit/memory/hipMemcpyAsync_derivatives.cc @@ -0,0 +1,175 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#include
+#include
+#include
+#include
+#include
+
+static hipStream_t InvalidStream() {
+  StreamGuard sg(Streams::created);
+  return sg.stream();
+}
+
+TEST_CASE("Unit_hipMemcpyDtoHAsync_Positive_Basic") {
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+
+  const auto f = [stream = stream_guard.stream()](void* dst, void* src, size_t count) {
+    return hipMemcpyDtoHAsync(dst, reinterpret_cast<hipDeviceptr_t>(src), count, stream);
+  };
+  MemcpyDeviceToHostShell(f, stream_guard.stream());
+}
+
+TEST_CASE("Unit_hipMemcpyDtoHAsync_Positive_Synchronization_Behavior") {
+  HIP_CHECK(hipDeviceSynchronize());
+
+  SECTION("Device memory to pageable host memory") {
+    MemcpyDtoHPageableSyncBehavior(
+        [](void* dst, void* src, size_t count) {
+          return hipMemcpyDtoHAsync(dst, reinterpret_cast<hipDeviceptr_t>(src), count, nullptr);
+        },
+        true);
+  }
+
+  SECTION("Device memory to pinned host memory") {
+    MemcpyDtoHPinnedSyncBehavior(
+        [](void* dst, void* src, size_t count) {
+          return hipMemcpyDtoHAsync(dst, reinterpret_cast<hipDeviceptr_t>(src), count, nullptr);
+        },
+        false);
+  }
+}
+
+TEST_CASE("Unit_hipMemcpyDtoHAsync_Negative_Parameters") {
+  using namespace std::placeholders;
+  LinearAllocGuard device_alloc(LinearAllocs::hipMalloc, kPageSize);
+  LinearAllocGuard host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
+
+  MemcpyCommonNegativeTests(
+      [](void* dst, void* src, size_t count) {
+        return hipMemcpyDtoHAsync(dst, reinterpret_cast<hipDeviceptr_t>(src), count, nullptr);
+      },
+      host_alloc.ptr(), device_alloc.ptr(), kPageSize);
+
+  SECTION("Invalid stream") {
+    HIP_CHECK_ERROR(
+        hipMemcpyDtoHAsync(host_alloc.ptr(), reinterpret_cast<hipDeviceptr_t>(device_alloc.ptr()),
+                           kPageSize, InvalidStream()),
+        hipErrorContextIsDestroyed);
+  }
+}
+
+TEST_CASE("Unit_hipMemcpyHtoDAsync_Positive_Basic") {
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+
+  const auto f = [stream = stream_guard.stream()](void* dst, void* src, size_t count) {
+    return hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst), src, count, stream);
+  };
+  MemcpyHostToDeviceShell(f, stream_guard.stream());
+}
+
+TEST_CASE("Unit_hipMemcpyHtoDAsync_Positive_Synchronization_Behavior") {
+  // This behavior differs between NVIDIA and AMD; on AMD the hipMemcpy call is synchronous
+  // with respect to the host
+#if HT_AMD
+  HipTest::HIP_SKIP_TEST(
+      "EXSWCPHIPT-127 - MemcpyAsync from host to device memory behavior differs on AMD and "
+      "Nvidia");
+  return;
+#endif
+  MemcpyHtoDSyncBehavior(
+      [](void* dst, void* src, size_t count) {
+        return hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst), src, count, nullptr);
+      },
+      false);
+}
+
+TEST_CASE("Unit_hipMemcpyHtoDAsync_Negative_Parameters") {
+  using namespace std::placeholders;
+  LinearAllocGuard device_alloc(LinearAllocs::hipMalloc, kPageSize);
+  LinearAllocGuard host_alloc(LinearAllocs::hipHostMalloc, kPageSize);
+
+  MemcpyCommonNegativeTests(
+      [](void* dst, void* src, size_t count) {
+        return hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst), src, count, nullptr);
+      },
+      device_alloc.ptr(), host_alloc.ptr(), kPageSize);
+
+  SECTION("Invalid stream") {
+    HIP_CHECK_ERROR(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(device_alloc.ptr()),
+                                       host_alloc.ptr(), kPageSize, InvalidStream()),
+                    hipErrorContextIsDestroyed);
+  }
+}
+
+TEST_CASE("Unit_hipMemcpyDtoDAsync_Positive_Basic") {
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell(
+          [stream = stream_guard.stream()](void* dst, void* src, size_t count) {
+            return hipMemcpyDtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst),
+                                      reinterpret_cast<hipDeviceptr_t>(src), count, stream);
+          });
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell(
+          [stream = stream_guard.stream()](void* dst, void* src, size_t count) {
+            return hipMemcpyDtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst),
+                                      reinterpret_cast<hipDeviceptr_t>(src), count, stream);
+          });
+    }
+  }
+}
+
+TEST_CASE("Unit_hipMemcpyDtoDAsync_Positive_Synchronization_Behavior") {
+  MemcpyDtoDSyncBehavior(
+      [](void* dst, void* src, size_t count) {
+        return hipMemcpyDtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst),
+                                  reinterpret_cast<hipDeviceptr_t>(src), count, nullptr);
+      },
+      false);
+}
+
+TEST_CASE("Unit_hipMemcpyDtoDAsync_Negative_Parameters") {
+  using namespace std::placeholders;
+  LinearAllocGuard src_alloc(LinearAllocs::hipMalloc, kPageSize);
+  LinearAllocGuard dst_alloc(LinearAllocs::hipMalloc, kPageSize);
+
+  MemcpyCommonNegativeTests(
+      [](void* dst, void* src, size_t count) {
+        return hipMemcpyDtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst),
+                                  reinterpret_cast<hipDeviceptr_t>(src), count, nullptr);
+      },
+      dst_alloc.ptr(), src_alloc.ptr(), kPageSize);
+
+  SECTION("Invalid stream") {
+    HIP_CHECK_ERROR(hipMemcpyDtoDAsync(reinterpret_cast<hipDeviceptr_t>(dst_alloc.ptr()),
+                                       reinterpret_cast<hipDeviceptr_t>(src_alloc.ptr()), kPageSize,
+                                       InvalidStream()),
+                    hipErrorContextIsDestroyed);
+  }
+}
\ No newline at end of file
diff --git a/catch/unit/memory/hipMemcpyAsync_old.cc b/catch/unit/memory/hipMemcpyAsync_old.cc
new file mode 100644
index 000000000..4c4e08ec2
--- /dev/null
+++ b/catch/unit/memory/hipMemcpyAsync_old.cc
@@ -0,0 +1,408 @@
+/*
+Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/ + +#include +#include +#include +#include + +#define NUM_THREADS 16 + +static constexpr auto NUM_ELM{1024 * 1024}; + + + +static constexpr size_t N_ELMTS{32 * 1024}; +std::atomic Thread_count { 0 }; +static unsigned blocksPerCU{6}; // to hide latency +static unsigned threadsPerBlock{256}; + +template +void Thread_func(T *A_d, T *B_d, T* C_d, T* C_h, size_t Nbytes, + hipStream_t mystream) { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, N_ELMTS); + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, + mystream, A_d, C_d, N_ELMTS); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); + // The following two MemcpyAsync calls are for sole + // purpose of loading stream with multiple async calls + HIP_CHECK(hipMemcpyAsync(B_d, A_d, Nbytes, + hipMemcpyDeviceToDevice, mystream)); + HIP_CHECK(hipMemcpyAsync(B_d, A_d, Nbytes, + hipMemcpyDeviceToDevice, mystream)); + Thread_count++; +} + +template +void Thread_func_MultiStream() { + int Data_mismatch = 0; + T *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; + T *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; + size_t Nbytes = N_ELMTS * sizeof(T); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, N_ELMTS); + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N_ELMTS, false); + hipStream_t mystream; + HIP_CHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream)); + hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), + dim3(threadsPerBlock), 0, + mystream, A_d, C_d, N_ELMTS); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); + // The following hipMemcpyAsync() is called only to + // load stream with multiple Async calls + HIP_CHECK(hipMemcpyAsync(B_d, A_d, Nbytes, + hipMemcpyDeviceToDevice, mystream)); + Thread_count++; + + HIP_CHECK(hipStreamSynchronize(mystream)); + HIP_CHECK(hipStreamDestroy(mystream)); + // Verifying result of the kernel computation + for (size_t i = 0; i < N_ELMTS; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + Data_mismatch++; + } + } + // Releasing resources + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + REQUIRE(Data_mismatch == 0); +} + +/* +This testcase verifies hipMemcpyAsync API +Initializes device variables +Launches kernel and performs the sum of device variables +copies the result to host variable and validates the result. 
+*/ +TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_KernelLaunch", "", int, float, + double) { + size_t Nbytes = NUM_ELM * sizeof(TestType); + + TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; + TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; + HIP_CHECK(hipSetDevice(0)); + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, false); + + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0, + static_cast(A_d), + static_cast(B_d), C_d, NUM_ELM); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipStreamDestroy(stream)); + + HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); +} +/* +This testcase verifies the following scenarios +1. H2H,H2PinMem and PinnedMem2Host +2. H2D-D2D-D2H in same GPU +3. Pinned Host Memory to device variables in same GPU +4. Device context change +5. H2D-D2D-D2H peer GPU +*/ +TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_H2H-H2D-D2H-H2PinMem", "", char, int, + float, double) { + TestType *A_d{nullptr}, *B_d{nullptr}; + TestType *A_h{nullptr}, *B_h{nullptr}; + TestType *A_Ph{nullptr}, *B_Ph{nullptr}; + HIP_CHECK(hipSetDevice(0)); + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HipTest::initArrays(&A_d, &B_d, nullptr, + &A_h, &B_h, nullptr, + NUM_ELM*sizeof(TestType)); + HipTest::initArrays(nullptr, nullptr, nullptr, + &A_Ph, &B_Ph, nullptr, + NUM_ELM*sizeof(TestType), true); + + SECTION("H2H, H2PinMem and PinMem2H") { + HIP_CHECK(hipMemcpyAsync(B_h, A_h, NUM_ELM*sizeof(TestType), + hipMemcpyHostToHost, stream)); + HIP_CHECK(hipMemcpyAsync(A_Ph, B_h, NUM_ELM*sizeof(TestType), + hipMemcpyHostToHost, stream)); + HIP_CHECK(hipMemcpyAsync(B_Ph, A_Ph, NUM_ELM*sizeof(TestType), + hipMemcpyHostToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + HipTest::checkTest(A_h, B_Ph, NUM_ELM); + } + + SECTION("H2D-D2D-D2H-SameGPU") { + HIP_CHECK(hipMemcpyAsync(A_d, A_h, NUM_ELM*sizeof(TestType), + hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_d, A_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_h, B_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + HipTest::checkTest(A_h, B_h, NUM_ELM); + } + + SECTION("pH2D-D2D-D2pH-SameGPU") { + HIP_CHECK(hipMemcpyAsync(A_d, A_Ph, NUM_ELM*sizeof(TestType), + hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_d, A_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_Ph, B_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + HipTest::checkTest(A_Ph, B_Ph, NUM_ELM); + } + SECTION("H2D-D2D-D2H-DeviceContextChange") { + int deviceCount = 0; + HIP_CHECK(hipGetDeviceCount(&deviceCount)); + if (deviceCount < 2) { + SUCCEED("deviceCount less then 2"); + } else { + int canAccessPeer = 0; + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1)); + if (canAccessPeer) { + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, NUM_ELM*sizeof(TestType), + hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_d, A_d, NUM_ELM*sizeof(TestType), + 
hipMemcpyDeviceToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_h, B_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + HipTest::checkTest(A_h, B_h, NUM_ELM); + + } else { + SUCCEED("P2P capability is not present"); + } + } + } + + SECTION("H2D-D2D-D2H-PeerGPU") { + int deviceCount = 0; + HIP_CHECK(hipGetDeviceCount(&deviceCount)); + if (deviceCount < 2) { + SUCCEED("deviceCount less then 2"); + } else { + int canAccessPeer = 0; + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1)); + if (canAccessPeer) { + HIP_CHECK(hipSetDevice(1)); + TestType *C_d{nullptr}; + HipTest::initArrays(nullptr, nullptr, &C_d, + nullptr, nullptr, nullptr, + NUM_ELM*sizeof(TestType)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, NUM_ELM*sizeof(TestType), + hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(C_d, A_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(B_h, C_d, NUM_ELM*sizeof(TestType), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + HipTest::checkTest(A_h, B_h, NUM_ELM); + HIP_CHECK(hipFree(C_d)); + + } else { + SUCCEED("P2P capability is not present"); + } + } + } + + HIP_CHECK(hipStreamDestroy(stream)); + + HipTest::freeArrays(A_d, B_d, nullptr, A_h, B_h, nullptr, false); + HipTest::freeArrays(nullptr, nullptr, nullptr, A_Ph, + B_Ph, nullptr, true); +} + +// This test launches multiple threads which uses same stream to deploy kernel +// and also launch hipMemcpyAsync() api. This test case is simulate the scenario +// reported in SWDEV-181598 + +TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_hipMultiMemcpyMultiThread", "", + int, float, double) { + size_t Nbytes = N_ELMTS * sizeof(TestType); + + int Data_mismatch = 0; + hipStream_t mystream; + TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; + TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N_ELMTS, false); + + HIP_CHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream)); + + std::thread T[NUM_THREADS]; + for (int i = 0; i < NUM_THREADS; i++) { + T[i] = std::thread(Thread_func, A_d, B_d, C_d, + C_h, Nbytes, mystream); + } + + // Wait until all the threads finish their execution + for (int i = 0; i < NUM_THREADS; i++) { + T[i].join(); + } + + HIP_CHECK(hipStreamSynchronize(mystream)); + HIP_CHECK(hipStreamDestroy(mystream)); + + // Verifying the result of the kernel computation + for (size_t i = 0; i < N_ELMTS; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + Data_mismatch++; + } + } + REQUIRE(Thread_count.load() == NUM_THREADS); + REQUIRE(Data_mismatch == 0); + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + Thread_count.exchange(0); +} + +TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_hipMultiMemcpyMultiThreadMultiStream", + "", int, float, double) { + std::thread T[NUM_THREADS]; + for (int i = 0; i < NUM_THREADS; i++) { + T[i] = std::thread(Thread_func_MultiStream); + } + + // Wait until all the threads finish their execution + for (int i = 0; i < NUM_THREADS; i++) { + T[i].join(); + } + + REQUIRE(Thread_count.load() == NUM_THREADS); + Thread_count.exchange(0); +} + +/* +This testcase verifies hipMemcpy API with pinnedMemory and hostRegister +along with kernel launches +*/ + +TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_PinnedRegMemWithKernelLaunch", + "", int, float, double) { + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + if (numDevices < 2) { 
+ SUCCEED("No of devices are less than 2"); + } else { + // 1 refers to pinned Memory + // 2 refers to register Memory + int MallocPinType = GENERATE(0, 1); + size_t Nbytes = NUM_ELM * sizeof(TestType); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, NUM_ELM); + + TestType *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; + TestType *X_d{nullptr}, *Y_d{nullptr}, *Z_d{nullptr}; + TestType *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}; + if (MallocPinType) { + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, NUM_ELM, true); + } else { + A_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(hipHostRegister(A_h, Nbytes, hipHostRegisterDefault)); + B_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(hipHostRegister(B_h, Nbytes, hipHostRegisterDefault)); + C_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(hipHostRegister(C_h, Nbytes, hipHostRegisterDefault)); + HipTest::initArrays(&A_d, &B_d, &C_d, nullptr, nullptr, + nullptr, NUM_ELM, false); + HipTest::setDefaultData(NUM_ELM, A_h, B_h, C_h); + } + HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), + 0, 0, static_cast(A_d), + static_cast(B_d), C_d, NUM_ELM); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM); + + unsigned int seed = time(0); + HIP_CHECK(hipSetDevice(HipTest::RAND_R(&seed) % (numDevices-1)+1)); + + int device; + HIP_CHECK(hipGetDevice(&device)); + INFO("hipMemcpy is set to happen between device 0 and device " + << device); + HipTest::initArrays(&X_d, &Y_d, &Z_d, nullptr, + nullptr, nullptr, NUM_ELM, false); + + hipStream_t gpu1Stream; + HIP_CHECK(hipStreamCreate(&gpu1Stream)); + + for (int j = 0; j < NUM_ELM; j++) { + A_h[j] = 0; + B_h[j] = 0; + C_h[j] = 0; + } + + HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpyAsync(X_d, A_h, Nbytes, hipMemcpyHostToDevice, gpu1Stream)); + HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpyAsync(Y_d, B_h, Nbytes, hipMemcpyHostToDevice, gpu1Stream)); + + hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), + 0, 0, static_cast(X_d), + static_cast(Y_d), Z_d, NUM_ELM); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpyAsync(C_h, Z_d, Nbytes, + hipMemcpyDeviceToHost, gpu1Stream)); + HIP_CHECK(hipStreamSynchronize(gpu1Stream)); + + HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM); + + if (MallocPinType) { + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); + } else { + HIP_CHECK(hipHostUnregister(A_h)); + free(A_h); + HIP_CHECK(hipHostUnregister(B_h)); + free(B_h); + HIP_CHECK(hipHostUnregister(C_h)); + free(C_h); + HipTest::freeArrays(A_d, B_d, C_d, nullptr, + nullptr, nullptr, false); + } + HipTest::freeArrays(X_d, Y_d, Z_d, nullptr, + nullptr, nullptr, false); + HIP_CHECK(hipStreamDestroy(gpu1Stream)); + } +} +