Skip to content

Commit

Permalink
Merge pull request #4118 from jngkim/sycl-allocator-merge-2207
Browse files Browse the repository at this point in the history
Add syclSolverInverter
  • Loading branch information
prckent authored Jul 19, 2022
2 parents 05e3460 + f9e3223 commit 8e5221c
Show file tree
Hide file tree
Showing 11 changed files with 424 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,7 @@ if(ENABLE_SYCL)
endif()
add_library(SYCL::host INTERFACE IMPORTED)
add_library(SYCL::device INTERFACE IMPORTED)
find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE})
find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE} NO_DEFAULT_PATH)
target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host)
target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device)
if(TARGET MKL::sycl)
Expand Down
107 changes: 105 additions & 2 deletions src/Platforms/SYCL/syclBLAS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,11 @@ sycl::event gemm(sycl::queue& handle,
const int ldc,
const std::vector<sycl::event>& events)
{
return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, events);
return oneapi::mkl::blas::gemm(handle, convertTransEnum(tA), convertTransEnum(tB), m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc, events);
}



template sycl::event gemm(sycl::queue& handle,
const char tA,
const char tB,
Expand Down Expand Up @@ -182,6 +182,109 @@ template sycl::event gemm(sycl::queue& handle,
const int ldc,
const std::vector<sycl::event>& events);

//transpose
template<typename T1, typename T2>
sycl::event transpose(sycl::queue& q,
const T1* restrict in,
int m,
int lda,
T2* restrict out,
int n,
int ldb,
const std::vector<sycl::event>& events)
{
constexpr size_t tile_size = 16;
const size_t m_max = ((m + tile_size - 1) / tile_size) * tile_size;
const size_t n_max = ((n + tile_size - 1) / tile_size) * tile_size;

return q.submit([&](sycl::handler& cgh) {
cgh.depends_on(events);
sycl::accessor<T2, 2, sycl::access::mode::write, sycl::access::target::local> tile(sycl::range<2>(tile_size,
tile_size + 1),
cgh);

cgh.parallel_for(sycl::nd_range<2>{{m_max, n_max}, {tile_size, tile_size}}, [=](sycl::nd_item<2> item) {
unsigned x = item.get_global_id(1);
unsigned y = item.get_global_id(0);
unsigned xth = item.get_local_id(1);
unsigned yth = item.get_local_id(0);

if (x < n && y < m)
tile[yth][xth] = in[(y)*lda + x];
item.barrier(sycl::access::fence_space::local_space);

x = item.get_group(0) * tile_size + xth;
y = item.get_group(1) * tile_size + yth;
if (x < m && y < n)
out[(y)*ldb + x] = tile[xth][yth];
});
});
}

template sycl::event transpose(sycl::queue& q,
const float* restrict in,
int m,
int lda,
double* restrict out,
int n,
int ldb,
const std::vector<sycl::event>& events);

template sycl::event transpose(sycl::queue& q,
const double* restrict in,
int m,
int lda,
double* restrict out,
int n,
int ldb,
const std::vector<sycl::event>& events);

template sycl::event transpose(sycl::queue& q,
const std::complex<float>* restrict in,
int m,
int lda,
std::complex<double>* restrict out,
int n,
int ldb,
const std::vector<sycl::event>& events);

template sycl::event transpose(sycl::queue& q,
const std::complex<double>* restrict in,
int m,
int lda,
std::complex<double>* restrict out,
int n,
int ldb,
const std::vector<sycl::event>& events);

//copy_n for mixed precision
template<typename T1, typename T2>
sycl::event copy_n(sycl::queue& aq,
const T1* restrict VA,
size_t array_size,
T2* restrict VC,
const std::vector<sycl::event>& events)
{
constexpr size_t tile_size = 64;
const size_t a_max = ((array_size + tile_size - 1) / tile_size) * tile_size;
return aq.parallel_for(sycl::range<1>{a_max}, events, [=](sycl::id<1> id) {
if (id < array_size)
VC[id] = static_cast<T2>(VA[id]);
});
}

template sycl::event copy_n(sycl::queue& aq,
const double* restrict VA,
size_t array_size,
float* restrict VC,
const std::vector<sycl::event>& events);

template sycl::event copy_n(sycl::queue& aq,
const std::complex<double>* restrict VA,
size_t array_size,
std::complex<float>* restrict VC,
const std::vector<sycl::event>& events);

} // namespace syclBLAS

} // namespace qmcplusplus
17 changes: 17 additions & 0 deletions src/Platforms/SYCL/syclBLAS.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,23 @@ sycl::event gemm(sycl::queue& handle,
const int ldc,
const std::vector<sycl::event>& events = {});

template<typename T1, typename T2>
sycl::event transpose(sycl::queue& q,
const T1* in,
int m,
int lda,
T2* out,
int n,
int ldb,
const std::vector<sycl::event>& events = {});

template<typename T1, typename T2>
sycl::event copy_n(sycl::queue& aq,
const T1* VA,
size_t array_size,
T2* VC,
const std::vector<sycl::event>& events = {});

} // namespace syclBLAS

} // namespace qmcplusplus
Expand Down
25 changes: 25 additions & 0 deletions src/Platforms/SYCL/syclSolver.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVER_H
#define QMCPLUSPLUS_SYCL_MKL_SOLVER_H

#include "oneapi/mkl/lapack.hpp"

namespace qmcplusplus
{
namespace syclSolver
{
using namespace oneapi::mkl::lapack;
}
} // namespace qmcplusplus

#endif
2 changes: 1 addition & 1 deletion src/Platforms/tests/SYCL/test_syclBLAS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ void test_gemv(const int M_b, const int N_b, const char trans)
// when trans == 'N', the actual calculation is B^T * A[M] = C[N]
//ompBLAS::gemv(handle, trans, N_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1);

syclBLAS::gemv(handle, trans, M_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1)
syclBLAS::gemv(handle, trans, N_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1)
.wait();

C.updateFrom();
Expand Down
14 changes: 8 additions & 6 deletions src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@
#include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp"
#include "DiracMatrix.h"
#include "PrefetchedRange.h"
#include "syclSolverInverter.hpp"

//#define SYCL_BLOCKING

namespace qmcplusplus
{

/** implements delayed update on Intel GPU using SYCL
* @tparam T base precision for most computation
* @tparam T_FP high precision for matrix inversion, T_FP >= T
Expand All @@ -50,7 +51,7 @@ class DelayedUpdateSYCL
/// current number of delays, increase one for each acceptance, reset to 0 after updating Ainv
int delay_count;

DiracMatrix<T_FP> host_inverter_;
syclSolverInverter<T_FP> sycl_inverter_;

// the range of prefetched_Ainv_rows
PrefetchedRange prefetched_range;
Expand Down Expand Up @@ -102,8 +103,9 @@ class DelayedUpdateSYCL
template<typename TREAL>
void invert_transpose(const Matrix<T>& logdetT, Matrix<T>& Ainv, std::complex<TREAL>& log_value)
{
host_inverter_.invert_transpose(logdetT, Ainv, log_value);
initializeInv(Ainv);
clearDelayCount();

sycl_inverter_.invert_transpose(logdetT, Ainv, Ainv_gpu, log_value, m_queue_);
}

/** initialize internal objects when Ainv is refreshed
Expand Down Expand Up @@ -215,11 +217,11 @@ class DelayedUpdateSYCL

#ifdef SYCL_BLOCKING
syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
cone, Ainv_gpu.data(), norb, {u_event, temp_v_event})
cone, Ainv_gpu.data(), norb, {u_event})
.wait();
#else
ainv_event_ = syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb,
temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event, temp_v_event});
temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event});
#endif

clearDelayCount();
Expand Down
139 changes: 139 additions & 0 deletions src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2019 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////

#ifndef QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H
#define QMCPLUSPLUS_SYCL_MKL_SOLVERINVERTOR_H

#include "OhmmsPETE/OhmmsVector.h"
#include "OhmmsPETE/OhmmsMatrix.h"
#include "SYCL/SYCLallocator.hpp"
#include "SYCL/syclBLAS.hpp"
#include "SYCL/syclSolver.hpp"
#include "QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp"

namespace qmcplusplus
{
/** implements matrix inversion via cuSolverDN
* @tparam T_FP high precision for matrix inversion, T_FP >= T
*/
template<typename T_FP>
class syclSolverInverter
{
/// scratch memory for cusolverDN
Matrix<T_FP, SYCLAllocator<T_FP>> Mat1_gpu;
/// pivot array + info
Vector<std::int64_t, SYCLAllocator<std::int64_t>> ipiv;
/// workspace
Vector<T_FP, SYCLAllocator<T_FP>> workspace;
std::int64_t getrf_ws = 0;
std::int64_t getri_ws = 0;

/** resize the internal storage
* @param norb number of electrons/orbitals
* @param delay, maximum delay 0<delay<=norb
*/
inline void resize(int norb, sycl::queue& m_queue)
{
if (Mat1_gpu.rows() != norb)
{
Mat1_gpu.resize(norb, norb);
ipiv.resize(norb);
getrf_ws = syclSolver::getrf_scratchpad_size<T_FP>(m_queue, norb, norb, norb);
getri_ws = syclSolver::getri_scratchpad_size<T_FP>(m_queue, norb, norb);
workspace.resize(std::max(getrf_ws, getri_ws));
}
}

public:
/** compute the inverse of the transpose of matrix A and its determinant value in log
* when T_FP and TMAT are the same
* @tparam TREAL real type
*/
template<typename TMAT, typename TREAL, typename = std::enable_if_t<std::is_same<TMAT, T_FP>::value>>
std::enable_if_t<std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
Matrix<TMAT>& Ainv,
Matrix<TMAT, SYCLAllocator<TMAT>>& Ainv_gpu,
std::complex<TREAL>& log_value,
sycl::queue& m_queue)
{
const int norb = logdetT.rows();
resize(norb, m_queue);

auto c_event = m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
auto t_event = syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb,
Ainv_gpu.cols(), {c_event});
try
{
syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
{t_event}).wait();
}
catch (cl::sycl::exception const& ex)
{
std::ostringstream err;
err << "\t\tCaught synchronous SYCL exception during getrf:\n"
<< ex.what() << " status: " << ex.code() << std::endl;
std::cerr << err.str();
throw std::runtime_error(err.str());
}

log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data());

c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);

m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {c_event}).wait();
}

/** compute the inverse of the transpose of matrix A and its determinant value in log
* when T_FP and TMAT are not the same
* @tparam TREAL real type
*/
template<typename TMAT, typename TREAL, typename = std::enable_if_t<!std::is_same<TMAT, T_FP>::value>>
std::enable_if_t<!std::is_same<TMAT, T_FP>::value> invert_transpose(const Matrix<TMAT>& logdetT,
Matrix<TMAT>& Ainv,
Matrix<TMAT, SYCLAllocator<TMAT>>& Ainv_gpu,
std::complex<TREAL>& log_value,
sycl::queue& m_queue)
{
const int norb = logdetT.rows();
resize(norb, m_queue);
//use Ainv_gpu for transpose
auto c_event = m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
//transpose
auto t_event = syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb,
Mat1_gpu.cols(), {c_event});

//getrf (LU) -> getri (inverse)
try
{
syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws,
{t_event}).wait();
}
catch (cl::sycl::exception const& ex)
{
std::ostringstream err;
err << "\t\tCaught synchronous SYCL exception during getrf:\n"
<< ex.what() << " status: " << ex.code() << std::endl;
std::cerr << err.str();
throw std::runtime_error(err.str());
}

log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data());

c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);

t_event = syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data(), {c_event});

m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {t_event}).wait();
}
};
} // namespace qmcplusplus

#endif // QMCPLUSPLUS_CUSOLVERINVERTOR_H
Loading

0 comments on commit 8e5221c

Please sign in to comment.