Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Lang] Support GPU solve with analyzePattern and factorize #6158

Merged
merged 9 commits into from
Sep 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions misc/test_coo_cusolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,15 @@ def print_x(x: ti.types.ndarray(), ncols: ti.i32):
print(
f"The solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}"
)

solver = ti.linalg.SparseSolver()
solver.analyze_pattern(A_ti)
solver.factorize(A_ti)
solver.solve_rf(A_ti, b, x_ti)

ti.sync()
print_x(x_ti, ncols)
ti.sync()
print(
f"The cusolver rf solution and numpy solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}"
)
10 changes: 10 additions & 0 deletions python/taichi/linalg/sparse_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ def solve_cu(self, sparse_matrix, b, x):
f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now."
)

def solve_rf(self, sparse_matrix, b, x):
if isinstance(sparse_matrix, SparseMatrix) and isinstance(
b, Ndarray) and isinstance(x, Ndarray):
self.solver.solve_rf(get_runtime().prog, sparse_matrix.matrix,
b.arr, x.arr)
else:
raise TaichiRuntimeError(
f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now."
)

def info(self):
"""Check if the linear systems are solved successfully.

Expand Down
4 changes: 4 additions & 0 deletions taichi/program/sparse_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
CUDADriver::get_instance().mem_free(d_values_sorted);
CUDADriver::get_instance().mem_free(d_permutation);
CUDADriver::get_instance().mem_free(dbuffer);
csr_row_ptr_ = csr_row_offset_ptr;
csr_col_ind_ = coo_col_ptr;
csr_val_ = coo_values_ptr;
nnz_ = nnz;
#endif
}

Expand Down
17 changes: 17 additions & 0 deletions taichi/program/sparse_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,25 @@ class CuSparseMatrix : public SparseMatrix {

const std::string to_string() const override;

void *get_row_ptr() const {
return csr_row_ptr_;
}
void *get_col_ind() const {
return csr_col_ind_;
}
void *get_val_ptr() const {
return csr_val_;
}
int get_nnz() const {
return nnz_;
}

private:
cusparseSpMatDescr_t matrix_;
void *csr_row_ptr_{nullptr};
void *csr_col_ind_{nullptr};
void *csr_val_{nullptr};
int nnz_{0};
};

std::unique_ptr<SparseMatrix> make_sparse_matrix(
Expand Down
110 changes: 100 additions & 10 deletions taichi/program/sparse_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,73 @@ CuSparseSolver::CuSparseSolver() {
}
#endif
}
// Reference:
// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp
void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();

CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_);
CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_);
CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_);
CUSPARSEDriver::get_instance().cpSetMatType(descr_,
CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_,
CUSPARSE_INDEX_BASE_ZERO);

// step 1: create opaque info structure
CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_);

// step 2: analyze chol(A) to know structure of L
CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis(
cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrA, d_csrColIndA, info_);

#else
TI_NOT_IMPLEMENTED
#endif
}

void CuSparseSolver::factorize(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();
void *d_csrValA = A->get_val_ptr();

size_t size_internal = 0;
size_t size_chol = 0; // size of working space for csrlu
// step 1: workspace for chol(A)
CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, &size_internal, &size_chol);

if (size_chol > 0)
CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol);

// step 2: compute A = L*L^T
CUSOLVERDriver::get_instance().csSpScsrcholFactor(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, gpu_buffer_);
// step 3: check if the matrix is singular
const double tol = 1.e-14;
int singularity = 0;
CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_,
tol, &singularity);
TI_ASSERT(singularity == -1);
#else
TI_NOT_IMPLEMENTED
#endif
}

void CuSparseSolver::solve_cu(Program *prog,
const SparseMatrix &sm,
Expand All @@ -100,16 +167,15 @@ void CuSparseSolver::solve_cu(Program *prog,
printf("Cusolver version: %d.%d.%d\n", major_version, minor_version,
patch_level);

const cusparseSpMatDescr_t *A =
(const cusparseSpMatDescr_t *)(sm.get_matrix());
size_t nrows = 0, ncols = 0, nnz = 0;
void *drow_offsets = NULL, *dcol_indices = NULL, *dvalues = NULL;
cusparseIndexType_t csrRowOffsetsType, csrColIndType;
cusparseIndexBase_t idxBase;
cudaDataType valueType;
CUSPARSEDriver::get_instance().cpCsrGet(
*A, &nrows, &ncols, &nnz, &drow_offsets, &dcol_indices, &dvalues,
&csrRowOffsetsType, &csrColIndType, &idxBase, &valueType);
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t nrows = A->num_rows();
size_t ncols = A->num_cols();
size_t nnz = A->get_nnz();
void *drow_offsets = A->get_row_ptr();
void *dcol_indices = A->get_col_ind();
void *dvalues = A->get_val_ptr();

size_t db = prog->get_ndarray_data_ptr_as_int(&b);
size_t dx = prog->get_ndarray_data_ptr_as_int(&x);
Expand Down Expand Up @@ -249,6 +315,30 @@ void CuSparseSolver::solve_cu(Program *prog,
#endif
}

void CuSparseSolver::solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) {
#if defined(TI_WITH_CUDA)
// Retrive the info of the sparse matrix
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t d_b = prog->get_ndarray_data_ptr_as_int(&b);
size_t d_x = prog->get_ndarray_data_ptr_as_int(&x);
CUSOLVERDriver::get_instance().csSpScsrcholSolve(
cusolver_handle_, rowsA, (void *)d_b, (void *)d_x, info_, gpu_buffer_);

// TODO: free allocated memory and handles
strongoier marked this conversation as resolved.
Show resolved Hide resolved
// CUDADriver::get_instance().mem_free(gpu_buffer_);
// CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
// CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
// CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA);
#else
TI_NOT_IMPLEMENTED
#endif
}

std::unique_ptr<SparseSolver> make_sparse_solver(DataType dt,
const std::string &solver_type,
const std::string &ordering) {
Expand Down
34 changes: 27 additions & 7 deletions taichi/program/sparse_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ class SparseSolver {
virtual void analyze_pattern(const SparseMatrix &sm) = 0;
virtual void factorize(const SparseMatrix &sm) = 0;
virtual Eigen::VectorXf solve(const Eigen::Ref<const Eigen::VectorXf> &b) = 0;
virtual void solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) = 0;
virtual void solve_cu(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Expand All @@ -36,31 +40,47 @@ class EigenSparseSolver : public SparseSolver {
void solve_cu(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override{};
Ndarray &x) override {
TI_NOT_IMPLEMENTED;
};
void solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override {
TI_NOT_IMPLEMENTED;
};

bool info() override;
};

class CuSparseSolver : public SparseSolver {
private:
csrcholInfo_t info_{nullptr};
cusolverSpHandle_t cusolver_handle_{nullptr};
cusparseHandle_t cusparse_handel_{nullptr};
cusparseMatDescr_t descr_{nullptr};
void *gpu_buffer_{nullptr};

public:
CuSparseSolver();
~CuSparseSolver() override = default;
bool compute(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
void analyze_pattern(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
void factorize(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
void analyze_pattern(const SparseMatrix &sm) override;

void factorize(const SparseMatrix &sm) override;
Eigen::VectorXf solve(const Eigen::Ref<const Eigen::VectorXf> &b) override {
TI_NOT_IMPLEMENTED;
};
void solve_cu(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override;
void solve_rf(Program *prog,
const SparseMatrix &sm,
const Ndarray &b,
Ndarray &x) override;
bool info() override {
TI_NOT_IMPLEMENTED;
};
Expand Down
1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,7 @@ void export_lang(py::module &m) {
.def("factorize", &SparseSolver::factorize)
.def("solve", &SparseSolver::solve)
.def("solve_cu", &SparseSolver::solve_cu)
.def("solve_rf", &SparseSolver::solve_rf)
.def("info", &SparseSolver::info);

m.def("make_sparse_solver", &make_sparse_solver);
Expand Down
6 changes: 6 additions & 0 deletions taichi/rhi/cuda/cuda_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -555,4 +555,10 @@ typedef enum libraryPropertyType_t {

struct cusolverSpContext;
typedef struct cusolverSpContext *cusolverSpHandle_t;

// copy from cusolverSp_LOWLEVEL_PREVIEW.h
struct csrcholInfoHost;
typedef struct csrcholInfoHost *csrcholInfoHost_t;
struct csrcholInfo;
typedef struct csrcholInfo *csrcholInfo_t;
#endif
8 changes: 8 additions & 0 deletions taichi/rhi/cuda/cusolver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,11 @@ PER_CUSOLVER_FUNCTION(csSpXcsrsymrcmHost, cusolverSpXcsrsymrcmHost, cusolverSpHa
PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSizeHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,size_t *);
PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *);
PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *);

// cusolver preview API
PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*);
PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t );
PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,size_t *,size_t *);
PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *);
PER_CUSOLVER_FUNCTION(csSpScsrcholZeroPivot, cusolverSpScsrcholZeroPivot, cusolverSpHandle_t, csrcholInfo_t ,float ,void *);
PER_CUSOLVER_FUNCTION(csSpScsrcholSolve, cusolverSpScsrcholSolve, cusolverSpHandle_t ,int ,void *,void *,csrcholInfo_t ,void *);
55 changes: 55 additions & 0 deletions tests/python/test_sparse_linear_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,58 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
x = solver.solve(b)
for i in range(n):
assert x[i] == test_utils.approx(res[i])


@test_utils.test(arch=ti.cuda)
def test_gpu_sparse_solver():
from scipy.sparse import coo_matrix

@ti.kernel
def init_b(b: ti.types.ndarray(), nrows: ti.i32):
for i in range(nrows):
b[i] = 1.0 + i / nrows

"""
Generate a positive definite matrix with a given number of rows and columns.
Reference: https://stackoverflow.com/questions/619335/a-simple-algorithm-for-generating-positive-semidefinite-matrices
"""
matrixSize = 10
A = np.random.rand(matrixSize, matrixSize)
A_psd = np.dot(A, A.transpose())

A_raw_coo = coo_matrix(A_psd)
nrows, ncols = A_raw_coo.shape
nnz = A_raw_coo.nnz

A_csr = A_raw_coo.tocsr()
b = ti.ndarray(shape=nrows, dtype=ti.f32)
init_b(b, nrows)

# solve Ax = b using cusolver
A_coo = A_csr.tocoo()
d_row_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
d_col_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
d_val_coo = ti.ndarray(shape=nnz, dtype=ti.f32)
d_row_coo.from_numpy(A_coo.row)
d_col_coo.from_numpy(A_coo.col)
d_val_coo.from_numpy(A_coo.data)

A_ti = ti.linalg.SparseMatrix(n=nrows, m=ncols, dtype=ti.float32)
A_ti.build_coo(d_row_coo, d_col_coo, d_val_coo)
x_ti = ti.ndarray(shape=ncols, dtype=ti.float32)
solver = ti.linalg.SparseSolver()
solver.solve_cu(A_ti, b, x_ti)
ti.sync()

# solve Ax=b using numpy
b_np = b.to_numpy()
x_np = np.linalg.solve(A_psd, b_np)
assert (np.allclose(x_ti.to_numpy(), x_np, atol=1e-1))

# solve Ax=b using cusolver refectorization
solver = ti.linalg.SparseSolver()
solver.analyze_pattern(A_ti)
solver.factorize(A_ti)
solver.solve_rf(A_ti, b, x_ti)
ti.sync()
assert (np.allclose(x_ti.to_numpy(), x_np, atol=1e-1))