From 33a606a0bc469d738b75d640a412c9acbd901494 Mon Sep 17 00:00:00 2001
From: pengyu <6712304+FantasyVR@users.noreply.github.com>
Date: Sat, 17 Sep 2022 20:54:48 +0800
Subject: [PATCH] [Lang] Sort coo to build correct csr format sparse matrix on
 GPU (#6050)

Related issue = #2906

When building a COO format sparse matrix, the indices are not necessarily
in order. To build a valid CSR format sparse matrix, we first need to sort
the COO index arrays.
---
 taichi/program/sparse_matrix.cpp         | 38 ++++++++++++++++++++++--
 taichi/rhi/cuda/cuda_types.h             |  2 ++
 taichi/rhi/cuda/cusparse_functions.inc.h |  6 ++++
 tests/python/test_sparse_matrix.py       |  6 ++--
 4 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp
index 3e8096a98d72f..f142766111c64 100644
--- a/taichi/program/sparse_matrix.cpp
+++ b/taichi/program/sparse_matrix.cpp
@@ -203,11 +203,40 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
                                         void *coo_values_ptr,
                                         int nnz) {
 #if defined(TI_WITH_CUDA)
+  // Step 1: Sort coo first
+  cusparseHandle_t cusparse_handle = NULL;
+  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
+  cusparseSpVecDescr_t vec_permutation;
+  cusparseDnVecDescr_t vec_values;
+  void *d_permutation = NULL, *d_values_sorted = NULL;
+  CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int));
+  CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float));
+  CUSPARSEDriver::get_instance().cpCreateSpVec(
+      &vec_permutation, nnz, nnz, d_permutation, d_values_sorted,
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+  CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_values, nnz, coo_values_ptr,
+                                               CUDA_R_32F);
+  size_t bufferSize = 0;
+  CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt(
+      cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr,
+      &bufferSize);
+  void *dbuffer = NULL;
+  if (bufferSize > 0)
+    CUDADriver::get_instance().malloc(&dbuffer, bufferSize);
+  // Setup permutation vector to identity
+  CUSPARSEDriver::get_instance().cpCreateIdentityPermutation(
+      cusparse_handle, nnz, d_permutation);
+  CUSPARSEDriver::get_instance().cpXcoosortByRow(cusparse_handle, rows_, cols_,
+                                                 nnz, coo_row_ptr, coo_col_ptr,
+                                                 d_permutation, dbuffer);
+  CUSPARSEDriver::get_instance().cpGather(cusparse_handle, vec_values,
+                                          vec_permutation);
+  CUDADriver::get_instance().memcpy_device_to_device(
+      coo_values_ptr, d_values_sorted, nnz * sizeof(float));
+  // Step 2: coo to csr
   void *csr_row_offset_ptr = NULL;
   CUDADriver::get_instance().malloc(&csr_row_offset_ptr,
                                     sizeof(int) * (rows_ + 1));
-  cusparseHandle_t cusparse_handle;
-  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
   CUSPARSEDriver::get_instance().cpCoo2Csr(
       cusparse_handle, (void *)coo_row_ptr, nnz, rows_,
       (void *)csr_row_offset_ptr, CUSPARSE_INDEX_BASE_ZERO);
@@ -216,9 +245,14 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
       &matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr,
       coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+  CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
+  CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
   CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
   // TODO: free csr_row_offset_ptr
   // CUDADriver::get_instance().mem_free(csr_row_offset_ptr);
+  CUDADriver::get_instance().mem_free(d_values_sorted);
+  CUDADriver::get_instance().mem_free(d_permutation);
+  CUDADriver::get_instance().mem_free(dbuffer);
 #endif
 }
 
diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h
index 64a369f5a76b5..88bb33951e3e7 100644
--- a/taichi/rhi/cuda/cuda_types.h
+++ b/taichi/rhi/cuda/cuda_types.h
@@ -441,8 +441,10 @@ typedef struct cusparseContext *cusparseHandle_t;
 struct cusparseMatDescr;
 typedef struct cusparseMatDescr *cusparseMatDescr_t;
 
+struct cusparseSpVecDescr;
 struct cusparseDnVecDescr;
 struct cusparseSpMatDescr;
+typedef struct cusparseSpVecDescr *cusparseSpVecDescr_t;
 typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;
 typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;
 typedef enum {
diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h
index 7476b83c09473..1a70e36de7883 100644
--- a/taichi/rhi/cuda/cusparse_functions.inc.h
+++ b/taichi/rhi/cuda/cusparse_functions.inc.h
@@ -13,6 +13,12 @@ PER_CUSPARSE_FUNCTION(cpCreateMatDescr, cusparseCreateMatDescr, cusparseMatDescr
 PER_CUSPARSE_FUNCTION(cpSetMatType, cusparseSetMatType, cusparseMatDescr_t, cusparseMatrixType_t);
 PER_CUSPARSE_FUNCTION(cpSetMatIndexBase, cusparseSetMatIndexBase, cusparseMatDescr_t, cusparseIndexBase_t);
 PER_CUSPARSE_FUNCTION(cpDestroySpMat, cusparseDestroySpMat, cusparseSpMatDescr_t);
+PER_CUSPARSE_FUNCTION(cpCreateSpVec, cusparseCreateSpVec, cusparseSpVecDescr_t* ,int ,int,void*,void*,cusparseIndexType_t,cusparseIndexBase_t,cudaDataType);
+PER_CUSPARSE_FUNCTION(cpDestroySpVec, cusparseDestroySpVec, cusparseSpVecDescr_t);
+PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermutation, cusparseHandle_t, int, void*);
+PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*);
+PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*);
+PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t);
 
 // cusparse dense vector description
 PER_CUSPARSE_FUNCTION(cpCreateDnVec, cusparseCreateDnVec, cusparseDnVecDescr_t*, int, void*, cudaDataType);
diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py
index 1a1973ee6fa35..10cf8451a25f0 100644
--- a/tests/python/test_sparse_matrix.py
+++ b/tests/python/test_sparse_matrix.py
@@ -379,9 +379,9 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
 
 @test_utils.test(arch=ti.cuda)
 def test_gpu_sparse_matrix():
-    h_coo_row = np.asarray([0, 0, 0, 1, 2, 2, 2, 3, 3], dtype=np.int32)
-    h_coo_col = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32)
-    h_coo_val = np.asarray([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
+    h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32)
+    h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32)
+    h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0],
                            dtype=np.float32)
     h_X = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
     h_Y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np.float32)