diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index e772c688d1..fbf12ca40a 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -1 +1,2 @@ ADD_SUBDIRECTORY(fenl) +#ADD_SUBDIRECTORY(graph) diff --git a/example/Makefile b/example/Makefile index cd4025b786..45830c1304 100644 --- a/example/Makefile +++ b/example/Makefile @@ -49,9 +49,7 @@ TARGETS = TEST_HEADERS += $(wildcard $(KOKKOSKERNELS_SRC_PATH)/example/hashmap_accumulator/*.hpp) EXECUTABLES += $(wildcard ${KOKKOSKERNELS_SRC_PATH}/example/hashmap_accumulator/*.cpp) -TEST_HEADERS += $(wildcard $(KOKKOSKERNELS_SRC_PATH)/example/graph/*.hpp) -EXECUTABLES += $(wildcard ${KOKKOSKERNELS_SRC_PATH}/example/graph/*.cpp) - +#EXECUTABLES += $(wildcard ${KOKKOSKERNELS_SRC_PATH}/example/graph/PartitioningExample.cpp) #======================================================================= #========================== TARGETS ==================================== diff --git a/example/graph/CMakeLists.txt b/example/graph/CMakeLists.txt new file mode 100644 index 0000000000..fad1391673 --- /dev/null +++ b/example/graph/CMakeLists.txt @@ -0,0 +1,10 @@ +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + +SET(SOURCES "PartitioningExample.cpp") + +TRIBITS_ADD_EXECUTABLE( + PartitioningExample + SOURCES ${SOURCES} + COMM serial + ) diff --git a/example/graph/PartitioningExample b/example/graph/PartitioningExample new file mode 100755 index 0000000000..88619a8d12 Binary files /dev/null and b/example/graph/PartitioningExample differ diff --git a/example/graph/PartitioningExample.cpp b/example/graph/PartitioningExample.cpp new file mode 100644 index 0000000000..0587220f40 --- /dev/null +++ b/example/graph/PartitioningExample.cpp @@ -0,0 +1,148 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions Contact: William McLendon (wcmclen@sandia.gov) or +// Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::vector; + +//#include "../../src/sparse/impl/KokkosSparse_partitioning_impl.hpp" + +int main(int argc, char *argv[]) +{ + /* + const int device_id = 0; + + Kokkos::initialize(Kokkos::InitArguments(1, -1, device_id)); + + if(params.verbose) + { + Kokkos::print_configuration(std::cout); + } + */ + + //Generate a square 2D mesh in serial, with edges in both diagonals + const int xy = 31; + const int nnodes = (xy + 1) * (xy + 1); + vector adjMat(nnodes * nnodes, false); + //Number of nodes is (n+1)^2 + for(int cellX = 0; cellX < xy; cellX++) + { + for(int cellY = 0; cellY < xy; cellY++) + { + int upLeft = cellX + (xy + 1) * cellY; + int upRight = cellX + 1 + (xy + 1) * cellY; + int downLeft = cellX + (xy + 1) * (cellY + 1); + int downRight = cellX + 1 + (xy + 1) * (cellY + 1); + #define CONNECT(n1, n2) \ + adjMat[n1 + n2 * nnodes] = true; \ + adjMat[n2 + n1 * nnodes] = true; + //Form this pattern in each cell: + // + // +------+ + // |\ /| + // | \ / | + // | \/ | + // | /\ | + // | / \ | + // |/ \| + // +------+ + // + CONNECT(upLeft, upRight); + CONNECT(upLeft, downLeft); + CONNECT(upLeft, downRight); + CONNECT(upRight, downRight); + CONNECT(downLeft, downRight); + CONNECT(downLeft, upRight); + } + } + + //Build a sparse (CRS) graph from the dense adjacency matrix + int numEdges = 0; + for(size_t i = 0; i < adjMat.size(); i++) + numEdges += (adjMat[i] ? 
1 : 0); + + /* + Kokkos::View rowmap("Rowmap", nnodes + 1); + Kokkos::View entries("Entries", numEdges); + int accum = 0; + for(int r = 0; r <= nnodes; r++) + { + rowmap(r) = accum; + if(r == nnodes) + break; + for(int c = 0; c < nnodes; c++) + { + if(adjMat[c + r * nnodes]) + entries(accum++) = c; + } + } + */ + + //Dump the graph to a graphviz file + FILE* g = fopen("graph.dot", "w"); + fprintf(g, "graph {\n"); + for(int r = 0; r < nnodes; r++) + { + for(int c = r; c < nnodes; c++) + { + if(adjMat[c + r * nnodes]) + fprintf(g, "n%d -- n%d\n", r, c); + } + } + fprintf(g, "}\n"); + fclose(g); + //Kokkos::finalize(); + return 0; +} + diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 77d7afec4f..f03590c5c2 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -6,6 +6,11 @@ TRIBITS_ADD_EXECUTABLE( SOURCES KokkosSparse_pcg.cpp ) +TRIBITS_ADD_EXECUTABLE( + sparse_block_pcg + SOURCES KokkosSparse_block_pcg.cpp + ) + TRIBITS_ADD_EXECUTABLE( sparse_spgemm SOURCES KokkosSparse_spgemm.cpp @@ -28,7 +33,12 @@ TRIBITS_ADD_EXECUTABLE( SOURCES KokkosSparse_sptrsv.cpp ) +TRIBITS_ADD_EXECUTABLE( + sparse_gs + SOURCES KokkosSparse_gs.cpp + ) + TRIBITS_ADD_EXECUTABLE( sparse_spiluk SOURCES KokkosSparse_spiluk.cpp - ) \ No newline at end of file + ) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp new file mode 100644 index 0000000000..c3ee9b1f06 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -0,0 +1,259 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::string; + +template +void runGS(string matrixPath, string devName, bool symmetric) +{ + typedef double scalar_t; + typedef typename device_t::execution_space exec_space; + typedef typename device_t::memory_space mem_space; + typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; + typedef typename KokkosSparse::CrsMatrix crsmat_t; + //typedef typename crsmat_t::StaticCrsGraphType graph_t; + typedef typename crsmat_t::values_type::non_const_type scalar_view_t; + //typedef typename graph_t::row_map_type::non_const_type lno_view_t; + //typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + crsmat_t A = KokkosKernels::Impl::read_kokkos_crst_matrix(matrixPath.c_str()); + lno_t nrows = A.numRows(); + lno_t ncols = A.numCols(); + if(nrows != ncols) + { + throw std::runtime_error("Gauss_Seidel only works for square matrices"); + } + //size_type nnz = A.nnz(); + KernelHandle kh; + //use a random RHS - uniformly distributed over (-5, 5) + scalar_view_t b("b", nrows); + { + srand(54321); + auto bhost = Kokkos::create_mirror_view(b); + for(lno_t i = 0; i < nrows; i++) + { + bhost(i) = 10.0 * rand() / RAND_MAX - 5.0; + } + Kokkos::deep_copy(b, bhost); + } + kh.set_team_work_size(16); + kh.set_dynamic_scheduling(true); + //initial LHS is 0 + scalar_view_t x("x", nrows); + //Data to dump to CSV + //cluster size sequence: 1, 2, 3, 5, 8, 12, 18, ... up to 1000, or < 96 clusters (whichever comes first) + std::vector clusterSizes; + //how long symbolic/numeric phases take (the graph reuse case isn't that interesting since numeric doesn't do much) + std::vector symbolicTimes; + std::vector numericTimes; + std::vector applyTimes; + std::vector scaledRes; + //the initial residual norm (for x = 0) is bnorm + double bnorm = KokkosBlas::nrm2(b); + Kokkos::Timer timer; + clusterSizes.push_back(1); + clusterSizes.push_back(4); + clusterSizes.push_back(8); + clusterSizes.push_back(16); + //for(int clusterSize = 1; clusterSize <= nrows / 64 && clusterSize <= 1000; clusterSize = ceil(1.5 * clusterSize)) + // clusterSizes.push_back(clusterSize); + for(int clusterSize : clusterSizes) + { + //cluster size of 1 is standard multicolor GS + if(clusterSize == 1) + { + std::cout << "\n\n***** RUNNING POINT COLORING SGS\n"; + //this constructor is for point coloring + kh.create_gs_handle(KokkosSparse::GS_DEFAULT); + } + else + { + std::cout << "\n\n***** RUNNING CLUSTER SGS, cluster size = " << clusterSize << "\n"; + //this constructor is for cluster (block) coloring + //kh.create_gs_handle(KokkosSparse::CLUSTER_CUTHILL_MCKEE, clusterSize); + //kh.create_gs_handle(KokkosSparse::CLUSTER_DEFAULT, clusterSize); + //kh.create_gs_handle(KokkosSparse::CLUSTER_DO_NOTHING, clusterSize); + kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, clusterSize); + } + timer.reset(); + KokkosSparse::Experimental::gauss_seidel_symbolic// + (&kh, nrows, nrows, A.graph.row_map, A.graph.entries, symmetric); + symbolicTimes.push_back(timer.seconds()); + std::cout << "\n*** symbolic time: " << symbolicTimes.back() << '\n'; + timer.reset(); + KokkosSparse::Experimental::gauss_seidel_numeric// + (&kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, symmetric); + numericTimes.push_back(timer.seconds()); + std::cout << 
"\n*** numeric time: " << numericTimes.back() << '\n'; + timer.reset(); + //Last two parameters are damping factor (should be 1) and sweeps + KokkosSparse::Experimental::symmetric_gauss_seidel_apply + (&kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, true, true, 1.0, 1); + applyTimes.push_back(timer.seconds()); + std::cout << "\n*** apply time: " << applyTimes.back() << '\n'; + //Now, compute the 2-norm of residual + scalar_view_t res("Ax-b", nrows); + Kokkos::deep_copy(res, b); + typedef Kokkos::Details::ArithTraits KAT; + scalar_t alpha = KAT::one(); + scalar_t beta = -KAT::one(); + KokkosSparse::spmv + ("N", alpha, A, x, beta, res); + double resnorm = KokkosBlas::nrm2(res); + //note: this still works if the solution diverges + scaledRes.push_back(resnorm / bnorm); + kh.destroy_gs_handle(); + } + string csvName = "gs_perf_" + devName + ".csv"; + std::cout << "Writing results to " << csvName << "\n"; + FILE* csvDump = fopen(csvName.c_str(), "w"); + fprintf(csvDump, "ClusterSize,Symbolic,Numeric,Apply,Residual\n"); + for(size_t i = 0; i < clusterSizes.size(); i++) + { + fprintf(csvDump, "%d,%.5e,%.5e,%.5e,%.5e\n", + clusterSizes[i], symbolicTimes[i], numericTimes[i], applyTimes[i], scaledRes[i]); + } + fclose(csvDump); +} + +int main(int argc, char** argv) +{ + //Expect two args: matrix name and device flag. + if(argc != 3 && argc != 4) + { + std::cout << "Usage: ./sparse_gs matrix.mtx [--device] [--symmetric]\n\n"; + std::cout << "device can be \"serial\", \"openmp\", \"cuda\" or \"threads\".\n"; + std::cout << "If device is not given, the default device is used.\n"; + std::cout << "Add the --symmetric flag if the matrix is known to be symmetric.\n"; + return 0; + } + string device; + string matrixPath; + bool sym = false; + for(int i = 1; i < argc; i++) + { + if(!strcmp(argv[i], "--symmetric")) + sym = true; + else if(!strcmp(argv[i], "--serial")) + device = "serial"; + else if(!strcmp(argv[i], "--openmp")) + device = "openmp"; + else if(!strcmp(argv[i], "--threads")) + device = "threads"; + else if(!strcmp(argv[i], "--cuda")) + device = "cuda"; + else + matrixPath = argv[i]; + } + //No device given, so use the default one + if(!device.length()) + { + #ifdef KOKKOS_ENABLE_SERIAL + if(std::is_same::value) + device = "serial"; + #endif + #ifdef KOKKOS_ENABLE_OPENMP + if(std::is_same::value) + device = "openmp"; + #endif + #ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + device = "cuda"; + #endif + #ifdef KOKKOS_ENABLE_THREADS + if(std::is_same::value) + device = "threads"; + #endif + } + Kokkos::initialize(); + bool run = false; + #ifdef KOKKOS_ENABLE_SERIAL + if(device == "serial") + { + runGS(matrixPath, device, sym); + run = true; + } + #endif + #ifdef KOKKOS_ENABLE_OPENMP + if(device == "openmp") + { + runGS(matrixPath, device, sym); + run = true; + } + #endif + #ifdef KOKKOS_ENABLE_THREADS + if(device == "threads") + { + runGS(matrixPath, device, sym); + run = true; + } + #endif + #ifdef KOKKOS_ENABLE_CUDA + if(device == "cuda") + { + runGS(matrixPath, device, sym); + run = true; + } + #endif + if(!run) + { + std::cerr << "Error: device " << device << " was requested but it's not enabled.\n"; + return 1; + } + Kokkos::finalize(); + return 0; +} + diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 062805d7a7..97a26e3964 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -85,37 +85,31 @@ vector_t create_y_vector(crsMat_t crsMat, vector_t x_vector){ template void 
run_experiment( - crsMat_t crsmat){ - - + crsMat_t crsmat, + int clusterSize, + bool useSequential) +{ typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename crsMat_t::StaticCrsGraphType::row_map_type::non_const_type lno_view_t; typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; - typedef typename lno_nnz_view_t::value_type lno_t; typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; INDEX_TYPE nv = crsmat.numRows(); scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); - - KokkosKernels::Impl::print_1Dview(kok_x_original); scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); //create X vector scalar_view_t kok_x_vector("kok_x_vector", nv); - double solve_time = 0; const unsigned cg_iteration_limit = 100000; const double cg_iteration_tolerance = 1e-7 ; KokkosKernels::Experimental::Example::CGSolveResult cg_result ; - - - typedef KokkosKernels::Experimental::KokkosKernelsHandle < size_type, lno_t, @@ -124,8 +118,10 @@ void run_experiment( KernelHandle kh; - - kh.create_gs_handle(); + if(clusterSize == 1) + kh.create_gs_handle(); + else + kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, clusterSize); Kokkos::Impl::Timer timer1; KokkosKernels::Experimental::Example::pcgsolve( kh @@ -136,13 +132,25 @@ void run_experiment( , cg_iteration_tolerance , & cg_result , true + , clusterSize + , useSequential ); Kokkos::fence(); solve_time = timer1.seconds(); + std::string algoSummary; + if(useSequential) + algoSummary = "SEQUENTIAL SGS"; + else + { + if(clusterSize == 1) + algoSummary = "POINT-COLORING SGS"; + else + algoSummary = "CLUSTER-COLORING SGS (CLUSTER SIZE " + std::to_string(clusterSize) + ")"; + } - std::cout << "DEFAULT SOLVE:" + std::cout << "DEFAULT SOLVE: " << algoSummary << " PRECONDITIONER" << "\n\t(P)CG_NUM_ITER [" << cg_result.iteration << "]" << "\n\tMATVEC_TIME [" << cg_result.matvec_time << "]" << "\n\tCG_RESIDUAL [" << cg_result.norm_res << "]" @@ -256,17 +264,17 @@ enum { CMD_USE_THREADS = 0 , CMD_USE_OPENMP , CMD_USE_CUDA_DEV , CMD_BIN_MTX + , CMD_CLUSTER_SIZE + , CMD_USE_SEQUENTIAL_SGS , CMD_ERROR , CMD_COUNT }; int main (int argc, char ** argv){ - int cmdline[ CMD_COUNT ] ; char *mtx_bin_file = NULL; for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; - for ( int i = 1 ; i < argc ; ++i ) { if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] ); @@ -286,6 +294,12 @@ int main (int argc, char ** argv){ cmdline[ CMD_USE_CUDA ] = 1 ; cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; } + else if ( 0 == strcasecmp( argv[i] , "--cluster-size" ) ) { + cmdline[CMD_CLUSTER_SIZE] = atoi(argv[++i]); + } + else if ( 0 == strcasecmp( argv[i] , "--seq-gs" ) ) { + cmdline[CMD_USE_SEQUENTIAL_SGS] = 1; + } else if ( 0 == strcasecmp( argv[i] , "--mtx" ) ) { mtx_bin_file = argv[++i]; @@ -298,6 +312,9 @@ int main (int argc, char ** argv){ return 0; } } + //default cluster size is always 1 (this runs point coloring GS) + if(cmdline[CMD_CLUSTER_SIZE] == 0) + cmdline[CMD_CLUSTER_SIZE] = 1; if (mtx_bin_file == NULL){ std::cerr << "Provide a mtx binary file" << std::endl ; @@ -307,10 +324,6 @@ int main (int argc, char ** argv){ } - - - - #if defined( KOKKOS_ENABLE_THREADS ) if ( cmdline[ CMD_USE_THREADS ] ) { @@ -328,36 +341,37 @@ int main (int argc, char ** argv){ Kokkos::initialize( init_args ); Kokkos::print_configuration(std::cout); + { + INDEX_TYPE nv = 0, ne = 0; + INDEX_TYPE *xadj, *adj; + SCALAR_TYPE 
*ew; - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef Kokkos::Threads myExecSpace; + typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + KokkosKernels::Impl::copy_vector(ne, ew, values_view); + KokkosKernels::Impl::copy_vector(ne, adj, columns_view); + KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + delete [] xadj; + delete [] adj; + delete [] ew; - run_experiment(crsmat); + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); + } Kokkos::finalize(); } @@ -380,40 +394,40 @@ int main (int argc, char ** argv){ Kokkos::initialize( init_args ); Kokkos::print_configuration(std::cout); + { + INDEX_TYPE nv = 0, ne = 0; + INDEX_TYPE *xadj, *adj; + SCALAR_TYPE *ew; - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; + typedef Kokkos::OpenMP myExecSpace; + typedef typename KokkosSparse::CrsMatrix crsMat_t; - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - 
KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + KokkosKernels::Impl::copy_vector(ne, ew, values_view); + KokkosKernels::Impl::copy_vector(ne, adj, columns_view); + KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - run_experiment(crsmat); + //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); + delete [] xadj; + delete [] adj; + delete [] ew; + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); + } Kokkos::finalize(); } #endif @@ -428,57 +442,57 @@ int main (int argc, char ** argv){ Kokkos::initialize( init_args ); Kokkos::print_configuration(std::cout); + { + INDEX_TYPE nv = 0, ne = 0; + INDEX_TYPE *xadj, *adj; + SCALAR_TYPE *ew; - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); + typedef Kokkos::Cuda myExecSpace; + typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); + { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + for (INDEX_TYPE i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + + for (INDEX_TYPE i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); + graph_t static_graph (columns_view, rowmap_view); + crsMat_t 
crsmat("CrsMatrix", nv, values_view, static_graph); + // typedef typename KokkosSparse::CrsMatrix crsMat_t; + // crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); + delete [] xadj; + delete [] adj; + delete [] ew; + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - -// typedef typename KokkosSparse::CrsMatrix crsMat_t; -// crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat); - Kokkos::finalize(); } #endif diff --git a/perf_test/sparse/KokkosSparse_pcg.hpp b/perf_test/sparse/KokkosSparse_pcg.hpp index 90b0895478..410633c661 100644 --- a/perf_test/sparse/KokkosSparse_pcg.hpp +++ b/perf_test/sparse/KokkosSparse_pcg.hpp @@ -57,6 +57,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -84,19 +85,18 @@ template< typename KernelHandle_t, void block_pcgsolve( KernelHandle_t &kh , const crsMatrix_t &point_crsMat - , const crsMatrix_t &_block_crsMat, int block_size + , const crsMatrix_t &_block_crsMat, int block_size , const y_vector_t &y_vector , x_vector_t x_vector , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits::epsilon() , CGSolveResult * result = 0 - , bool use_sgs = true) { - + , bool use_sgs = true) +{ using namespace KokkosSparse; using namespace KokkosSparse::Experimental; typedef typename KernelHandle_t::HandleExecSpace Space; - const size_t count_total = point_crsMat.numRows(); size_t iteration = 0 ; @@ -116,16 +116,21 @@ void block_pcgsolve( y_vector_t r ( "cg::r" , count_total ); y_vector_t Ap( "cg::Ap", count_total ); - /* r = b - A * x ; */ - /* p = x */ Kokkos::deep_copy( p , x_vector ); + // r = b - A * x ; + // p = x + Kokkos::deep_copy( p , x_vector ); - /* Ap = A * p */ KokkosSparse::spmv("N", 1, point_crsMat, pAll, 0, Ap); + // Ap = A * p + KokkosSparse::spmv("N", 1, point_crsMat, pAll, 0, Ap); - /* r = Ap */ Kokkos::deep_copy( r , Ap ); + // r = Ap + Kokkos::deep_copy( r , Ap ); - /* r = b - r */ KokkosBlas::axpby(1.0, y_vector, -1.0, r); + // r = b - r + KokkosBlas::axpby(1.0, y_vector, -1.0, r); - /* p = r */ Kokkos::deep_copy( p , r ); + // p = r + Kokkos::deep_copy( p , r ); ; double old_rdot = KokkosBlas::dot( r , r ); norm_res = sqrt( old_rdot ); @@ -138,7 +143,9 @@ void block_pcgsolve( bool owner_handle = false; - KernelHandle_t block_kh; block_kh.create_gs_handle(); block_kh.get_gs_handle()->set_block_size(block_size); + KernelHandle_t block_kh; + block_kh.create_gs_handle(); + block_kh.get_point_gs_handle()->set_block_size(block_size); //block_kh.set_shmem_size(8032); if (use_sgs){ if (kh.get_gs_handle() == NULL){ @@ -147,13 +154,12 @@ void block_pcgsolve( } timer.reset(); -/* - gauss_seidel_numeric - (&kh, count_total, count_total, point_crsMat.graph.row_map, point_crsMat.graph.entries, point_crsMat.values); - Space().fence(); - timer.reset(); -*/ + //gauss_seidel_numeric + // (&kh, count_total, count_total, point_crsMat.graph.row_map, point_crsMat.graph.entries, point_crsMat.values); + + //Space().fence(); + //timer.reset(); //block_kh.set_verbose(true); block_gauss_seidel_numeric @@ -166,11 +172,10 @@ void block_pcgsolve( timer.reset(); symmetric_block_gauss_seidel_apply (&block_kh, _block_crsMat.numRows(), _block_crsMat.numCols(),block_size, 
_block_crsMat.graph.row_map, _block_crsMat.graph.entries, _block_crsMat.values, - z, r, true, true, apply_count); -/* - symmetric_gauss_seidel_apply - (&kh, count_total, count_total, point_crsMat.graph.row_map, point_crsMat.graph.entries, point_crsMat.values, z, r, true, true, apply_count); -*/ + z, r, true, true, 1.0, apply_count); + + //symmetric_gauss_seidel_apply + // (&kh, count_total, count_total, point_crsMat.graph.row_map, point_crsMat.graph.entries, point_crsMat.values, z, r, true, true, apply_count); Space().fence(); precond_time += timer.seconds(); precond_old_rdot = KokkosBlas::dot( r , z ); @@ -188,7 +193,8 @@ void block_pcgsolve( timer.reset(); - /* Ap = A * p */ KokkosSparse::spmv("N", 1, point_crsMat, pAll, 0, Ap); + //Ap = A * p + KokkosSparse::spmv("N", 1, point_crsMat, pAll, 0, Ap); Space().fence(); @@ -197,7 +203,8 @@ void block_pcgsolve( //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm ); //const double pAp_dot = dot( count_total , p , Ap ) ; - /* pAp_dot = dot(Ap , p ) */ const double pAp_dot = KokkosBlas::dot( p , Ap ) ; + // pAp_dot = dot(Ap , p); + const double pAp_dot = KokkosBlas::dot( p , Ap ) ; double alpha = 0; @@ -208,9 +215,11 @@ void block_pcgsolve( alpha = old_rdot / pAp_dot ; } - /* x += alpha * p ; */ KokkosBlas::axpby(alpha, p, 1.0, x_vector); + // x += alpha * p ; + KokkosBlas::axpby(alpha, p, 1.0, x_vector); - /* r += -alpha * Ap ; */ KokkosBlas::axpby(-alpha, Ap, 1.0, r); + // r += -alpha * Ap ; + KokkosBlas::axpby(-alpha, Ap, 1.0, r); const double r_dot = KokkosBlas::dot( r , r ); @@ -222,17 +231,15 @@ void block_pcgsolve( timer.reset(); symmetric_block_gauss_seidel_apply (&block_kh, _block_crsMat.numRows(), _block_crsMat.numCols(),block_size, _block_crsMat.graph.row_map, _block_crsMat.graph.entries, _block_crsMat.values, - z, r, true, true, apply_count); - /* - - symmetric_gauss_seidel_apply( - &kh, - count_total, count_total, - point_crsMat.graph.row_map, - point_crsMat.graph.entries, - point_crsMat.values, z, r, true, - apply_count); - */ + z, r, true, true, 1.0, apply_count); + + //symmetric_gauss_seidel_apply( + // &kh, + // count_total, count_total, + // point_crsMat.graph.row_map, + // point_crsMat.graph.entries, + // point_crsMat.values, z, r, true, + // apply_count); Space().fence(); precond_time += timer.seconds(); @@ -243,7 +250,8 @@ void block_pcgsolve( double beta = 1; if (!use_sgs){ beta = beta_original; - /* p = r + beta * p ; */ KokkosBlas::axpby(1.0, r, beta, p); + // p = r + beta * p ; + KokkosBlas::axpby(1.0, r, beta, p); } else { beta = precond_beta; @@ -257,7 +265,6 @@ void block_pcgsolve( #endif - norm_res = sqrt( old_rdot = r_dot ); precond_old_rdot = precond_r_dot; @@ -285,7 +292,6 @@ void block_pcgsolve( } } - template< typename KernelHandle_t, typename crsMatrix_t, typename y_vector_t, @@ -297,16 +303,21 @@ void pcgsolve( , const y_vector_t &y_vector , x_vector_t x_vector , const size_t maximum_iteration = 200 - , const double tolerance = std::numeric_limits::epsilon() + , const double tolerance = std::numeric_limits::epsilon() , CGSolveResult * result = 0 - , bool use_sgs = true) { - + , bool use_sgs = true + , int clusterSize = 1 + , bool use_sequential_sgs = false) +{ using namespace KokkosSparse; using namespace KokkosSparse::Experimental; - typedef typename KernelHandle_t::HandleExecSpace Space; + using size_type = typename KernelHandle_t::size_type; + using nnz_lno_t = typename KernelHandle_t::nnz_lno_t; + using Space = typename KernelHandle_t::HandleExecSpace; + 
static_assert(std::is_same::value, + "The PCG performance test only works with scalar = double."); - - const size_t count_total = crsMat.numRows(); + const nnz_lno_t count_total = crsMat.numRows(); size_t iteration = 0 ; double iter_time = 0 ; @@ -335,7 +346,7 @@ void pcgsolve( /* r = b - r */ KokkosBlas::axpby(1.0, y_vector, -1.0, r); /* p = r */ Kokkos::deep_copy( p , r ); -; + double old_rdot = KokkosBlas::dot( r , r ); norm_res = sqrt( old_rdot ); @@ -345,33 +356,79 @@ void pcgsolve( double precond_old_rdot = 1; //Kokkos::deep_copy( p , z ); - bool owner_handle = false; - if (use_sgs){ - if (kh.get_gs_handle() == NULL){ - owner_handle = true; - kh.create_gs_handle(); + bool use_par_sgs = use_sgs && !use_sequential_sgs; + + auto ptrHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), crsMat.graph.row_map); + auto indHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), crsMat.graph.entries); + auto valHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), crsMat.values); + Kokkos::View diagHost; + if(use_sequential_sgs) + { + diagHost = Kokkos::View("Diag for Seq SOR", count_total); + for(int i = 0; i < count_total; i++) + { + for(size_type j = ptrHost(i); j < ptrHost(i + 1); j++) + { + if(indHost(j) == i) + diagHost(i) = 1.0 / valHost(j); + } } - + } + auto xHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_vector); + auto yHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y_vector); + + if(use_sgs) { timer.reset(); - //kh.set_verbose(true); - - gauss_seidel_numeric - (&kh, count_total, count_total, crsMat.graph.row_map, crsMat.graph.entries, crsMat.values); + z = y_vector_t( "pcg::z" , count_total ); + if (use_par_sgs) { + gauss_seidel_numeric + (&kh, count_total, count_total, crsMat.graph.row_map, crsMat.graph.entries, crsMat.values); - Space().fence(); + Space().fence(); - precond_init_time += timer.seconds(); - z = y_vector_t( "pcg::z" , count_total ); - Space().fence(); - timer.reset(); + precond_init_time += timer.seconds(); + Space().fence(); + timer.reset(); - symmetric_gauss_seidel_apply - (&kh, count_total, count_total, crsMat.graph.row_map, crsMat.graph.entries, crsMat.values, z, r, true, true, apply_count); + symmetric_gauss_seidel_apply + (&kh, count_total, count_total, crsMat.graph.row_map, crsMat.graph.entries, crsMat.values, z, r, true, true, 1.0, apply_count); - Space().fence(); + Space().fence(); + } + else if(use_sequential_sgs) { + //z = LHS (aka x), r RHS (aka y or b) + Kokkos::deep_copy(z, 0.0); + auto zhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), z); + auto rhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + //as with par_sgs, init unknown to 0 + timer.reset(); + for(int sweep = 0; sweep < apply_count; sweep++) + { + KokkosSparse::Impl::Sequential::gaussSeidel + (count_total, // rows = cols of the matrix + 1, // number of vectors in X and B + ptrHost.data(), indHost.data(), valHost.data(), + rhost.data(), count_total, //raw ptr to B vector, and B column stride (for when multiple RHS gets added to MTSGS) + zhost.data(), count_total, //raw ptr to X vector, and X column stride + diagHost.data(), + 1.0, + "F"); + KokkosSparse::Impl::Sequential::gaussSeidel + (count_total, 1, + ptrHost.data(), indHost.data(), valHost.data(), + rhost.data(), count_total, + zhost.data(), count_total, + diagHost.data(), + 1.0, + "B"); + } + //result is in z (but r doesn't change) + Kokkos::deep_copy(z, zhost); + Kokkos::deep_copy(r, rhost); + } precond_time += timer.seconds(); - 
precond_old_rdot = KokkosBlas::dot( r , z ); - Kokkos::deep_copy( p , z ); + precond_old_rdot = KokkosBlas::dot(r , z); + Kokkos::deep_copy(p, z); } iteration = 0 ; @@ -382,6 +439,7 @@ void pcgsolve( #endif while (tolerance < norm_res && iteration < maximum_iteration ) { + std::cout << "Running CG iteration " << iteration << ", current resnorm = " << norm_res << '\n'; timer.reset(); @@ -396,7 +454,6 @@ void pcgsolve( /* pAp_dot = dot(Ap , p ) */ const double pAp_dot = KokkosBlas::dot( p , Ap ) ; - double alpha = 0; if (use_sgs){ alpha = precond_old_rdot / pAp_dot ; @@ -414,25 +471,55 @@ void pcgsolve( const double beta_original = r_dot / old_rdot ; double precond_r_dot = 1; double precond_beta = 1; - if (use_sgs){ + if(use_sgs) + { Space().fence(); timer.reset(); - symmetric_gauss_seidel_apply( - &kh, - count_total, count_total, - crsMat.graph.row_map, - crsMat.graph.entries, - crsMat.values, z, r, true, - apply_count); - - Space().fence(); + if (use_par_sgs) + { + symmetric_gauss_seidel_apply( + &kh, + count_total, count_total, + crsMat.graph.row_map, + crsMat.graph.entries, + crsMat.values, z, r, true, true, + 1.0, apply_count); + } + else if(use_sequential_sgs) + { + //z = LHS (aka x), r RHS (aka y or b) + Kokkos::deep_copy(z, 0.0); + auto zhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), z); + auto rhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + //as with the par_sgs version, init unknown (here, zhost) to 0 + for(int sweep = 0; sweep < apply_count; sweep++) + { + KokkosSparse::Impl::Sequential::gaussSeidel + (count_total, 1, + ptrHost.data(), indHost.data(), valHost.data(), + rhost.data(), count_total, + zhost.data(), count_total, + diagHost.data(), + 1.0, + "F"); + KokkosSparse::Impl::Sequential::gaussSeidel + (count_total, 1, + ptrHost.data(), indHost.data(), valHost.data(), + rhost.data(), count_total, + zhost.data(), count_total, + diagHost.data(), + 1.0, + "B"); + } + Kokkos::deep_copy(z, zhost); + Kokkos::deep_copy(r, rhost); + } precond_time += timer.seconds(); precond_r_dot = KokkosBlas::dot(r , z ); precond_beta = precond_r_dot / precond_old_rdot ; } - - double beta = 1; - if (!use_sgs){ + double beta = 1; + if (!use_sgs) { beta = beta_original; /* p = r + beta * p ; */ KokkosBlas::axpby(1.0, r, beta, p); } @@ -448,7 +535,6 @@ void pcgsolve( #endif - norm_res = sqrt( old_rdot = r_dot ); precond_old_rdot = precond_r_dot; @@ -469,11 +555,6 @@ void pcgsolve( result->precond_time = precond_time; result->precond_init_time = precond_init_time; } - - if (use_sgs & owner_handle ){ - - kh.destroy_gs_handle(); - } } } // namespace Example diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 82b17821d0..27739af0a6 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -173,6 +173,14 @@ class KokkosKernelsHandle GaussSeidelHandle GaussSeidelHandleType; + //These are subclasses of GaussSeidelHandleType. 
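+  //get_point_gs_handle() and get_cluster_gs_handle() below downcast the stored handle to these
+  //subclasses, and throw if a handle of the other kind has already been created via create_gs_handle().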
+ typedef typename KokkosSparse:: + PointGaussSeidelHandle + PointGaussSeidelHandleType; + typedef typename KokkosSparse:: + ClusterGaussSeidelHandle + ClusterGaussSeidelHandleType; + typedef typename KokkosSparse:: SPGEMMHandle SPGEMMHandleType; @@ -489,14 +497,30 @@ class KokkosKernelsHandle - GaussSeidelHandleType *get_gs_handle(){ + GaussSeidelHandleType *get_gs_handle() { return this->gsHandle; } - void create_gs_handle( - KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT){ + PointGaussSeidelHandleType *get_point_gs_handle() { + auto pgs = dynamic_cast(this->gsHandle); + if(this->gsHandle && !pgs) + throw std::runtime_error("GaussSeidelHandle exists but is not set up for point-coloring GS."); + return pgs; + } + ClusterGaussSeidelHandleType *get_cluster_gs_handle() { + auto cgs = dynamic_cast(this->gsHandle); + if(this->gsHandle && !cgs) + throw std::runtime_error("GaussSeidelHandle exists but is not set up for cluster-coloring GS."); + return cgs; + } + void create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; - this->gsHandle = new GaussSeidelHandleType(gs_algorithm); + this->gsHandle = new PointGaussSeidelHandleType(gs_algorithm); + } + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t verts_per_cluster) { + this->destroy_gs_handle(); + this->is_owner_of_the_gs_handle = true; + this->gsHandle = new ClusterGaussSeidelHandleType(clusterAlgo, verts_per_cluster); } void destroy_gs_handle(){ if (is_owner_of_the_gs_handle && this->gsHandle != NULL){ @@ -508,8 +532,6 @@ class KokkosKernelsHandle } } - - SPADDHandleType *get_spadd_handle(){ return this->spaddHandle; } diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index 13ceaf99fd..81dc603a82 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -233,6 +233,22 @@ int get_suggested_team_size(Functor& f, int vector_size) #endif //ifdef KOKKOS_ENABLE_DEPRECATED_CODE ... else +template +int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) +{ +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + { + team_policy_t temp = team_policy_t(1, 1, vector_size). 
+ set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); + return temp.team_size_recommended(f, ParallelTag()); + } + else +#endif + { + return 1; + } +} template void zero_vector( typename value_array_type::value_type num_elements, diff --git a/src/sparse/KokkosSparse_gauss_seidel.hpp b/src/sparse/KokkosSparse_gauss_seidel.hpp index df61c2345d..5ee15d6008 100644 --- a/src/sparse/KokkosSparse_gauss_seidel.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel.hpp @@ -116,10 +116,14 @@ namespace KokkosSparse{ typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, - bool is_graph_symmetric = true){ - - - handle->get_gs_handle()->set_block_size(block_size); + bool is_graph_symmetric = true) + { + auto gsHandle = handle->get_point_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + throw std::runtime_error("Block versions of Gauss-Seidel are incompatible with algorithm GS_CLUSTER"); + } + gsHandle->set_block_size(block_size); gauss_seidel_symbolic(handle, num_rows, num_cols, @@ -277,7 +281,12 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, bool is_graph_symmetric = true ){ - handle->get_gs_handle()->set_block_size(block_size); + auto gsHandle = handle->get_point_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + throw std::runtime_error("Block versions of Gauss-Seidel are incompatible with algorithm GS_CLUSTER"); + } + gsHandle->set_block_size(block_size); gauss_seidel_numeric(handle, num_rows,num_cols, @@ -300,10 +309,10 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, - bool init_zero_x_vector = false, - bool update_y_vector = true, - typename KernelHandle::nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - int numIter = 1){ + bool init_zero_x_vector, + bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, + int numIter){ static_assert (std::is_same::value, @@ -419,11 +428,16 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, - bool init_zero_x_vector = false, - bool update_y_vector = true, - typename KernelHandle::nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - int numIter = 1){ - handle->get_gs_handle()->set_block_size(block_size); + bool init_zero_x_vector, + bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, + int numIter){ + auto gsHandle = handle->get_point_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + throw std::runtime_error("Block versions of Gauss-Seidel are incompatible with algorithm GS_CLUSTER"); + } + gsHandle->set_block_size(block_size); symmetric_gauss_seidel_apply(handle,num_rows,num_cols, row_map, entries, @@ -449,10 +463,10 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, - bool init_zero_x_vector = false, - bool update_y_vector = true, - typename KernelHandle::nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - int numIter = 1){ + bool init_zero_x_vector, + bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, + int numIter){ static_assert (std::is_same::value, @@ -570,11 +584,16 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, - bool init_zero_x_vector = false, - bool update_y_vector = true, - typename KernelHandle::nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - int 
numIter = 1){ - handle->get_gs_handle()->set_block_size(block_size); + bool init_zero_x_vector, + bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, + int numIter){ + auto gsHandle = handle->get_point_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + throw std::runtime_error("Block versions of Gauss-Seidel are incompatible with algorithm GS_CLUSTER"); + } + gsHandle->set_block_size(block_size); forward_sweep_gauss_seidel_apply(handle,num_rows,num_cols, row_map, entries, @@ -600,10 +619,10 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, - bool init_zero_x_vector = false, - bool update_y_vector = true, - typename KernelHandle::nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - int numIter = 1){ + bool init_zero_x_vector, + bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, + int numIter){ static_assert (std::is_same::value, @@ -718,11 +737,16 @@ namespace KokkosSparse{ scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, - bool init_zero_x_vector = false, - bool update_y_vector = true, - typename KernelHandle::nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - int numIter = 1){ - handle->get_gs_handle()->set_block_size(block_size); + bool init_zero_x_vector, + bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, + int numIter){ + auto gsHandle = handle->get_point_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + throw std::runtime_error("Block versions of Gauss-Seidel are incompatible with algorithm GS_CLUSTER"); + } + gsHandle->set_block_size(block_size); backward_sweep_gauss_seidel_apply(handle,num_rows,num_cols, row_map, entries, diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 1c70e44a2d..33b1f4d017 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -50,7 +50,25 @@ namespace KokkosSparse{ - enum GSAlgorithm{GS_DEFAULT, GS_PERMUTED, GS_TEAM}; + enum GSAlgorithm{GS_DEFAULT, GS_PERMUTED, GS_TEAM, GS_CLUSTER}; + enum ClusteringAlgorithm{CLUSTER_DEFAULT, CLUSTER_BALLOON, CLUSTER_CUTHILL_MCKEE, CLUSTER_DO_NOTHING, NUM_CLUSTERING_ALGORITHMS}; + + inline const char* getClusterAlgoName(ClusteringAlgorithm ca) + { + switch(ca) + { + case CLUSTER_DEFAULT: + return "Default"; + case CLUSTER_BALLOON: + return "Balloon"; + case CLUSTER_CUTHILL_MCKEE: + return "Cuthill-McKee"; + case CLUSTER_DO_NOTHING: + return "No-op"; + default:; + } + return "INVALID CLUSTERING ALGORITHM"; + } template algorithm_type;} + + virtual bool is_owner_of_coloring() const {return false;} + + nnz_lno_persistent_work_host_view_t get_color_xadj() const { + return this->color_xadj; + } + nnz_lno_persistent_work_view_t get_color_adj() const { + return this->color_adj; + } + nnz_lno_t get_num_colors() const { + return this->numColors; + } + + bool is_symbolic_called() const { + return this->called_symbolic; + } + bool is_numeric_called() const { + return this->called_numeric; + } + + //setters + void set_algorithm_type(const GSAlgorithm sgs_algo){ + this->algorithm_type = sgs_algo; + this->called_symbolic = false; + } + + void set_call_symbolic(bool call = true) { + this->called_symbolic = call; + } + void set_call_numeric(bool call = true) { + this->called_numeric = call; + } + + void set_color_xadj(const nnz_lno_persistent_work_host_view_t &color_xadj_) { + this->color_xadj = 
color_xadj_; + } + void set_color_adj(const nnz_lno_persistent_work_view_t &color_adj_) { + this->color_adj = color_adj_; + } + void set_num_colors(const nnz_lno_t &numColors_) { + this->numColors = numColors_; + } + + void vector_team_size( + int max_allowed_team_size, + int &suggested_vector_size_, //output + int &suggested_team_size_, //output + size_type nr, size_type nnz){ + if (this->suggested_team_size && this->suggested_vector_size) { + suggested_vector_size_ = this->suggested_vector_size; + suggested_team_size_ = this->suggested_team_size; + return; + } + else { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + KokkosKernels::Impl::get_suggested_vector_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_, nr, nnz); +#else + KokkosKernels::Impl::get_suggested_vector_size(suggested_vector_size_, nr, nnz); + KokkosKernels::Impl::get_suggested_team_size(max_allowed_team_size, suggested_vector_size_, suggested_team_size_); +#endif + this->suggested_team_size = suggested_vector_size_; + this->suggested_vector_size = suggested_vector_size_; + + } + } + }; + + template + class PointGaussSeidelHandle + : public GaussSeidelHandle + { + public: + typedef GaussSeidelHandle GSHandle; + typedef ExecutionSpace HandleExecSpace; + typedef TemporaryMemorySpace HandleTempMemorySpace; + typedef PersistentMemorySpace HandlePersistentMemorySpace; + + typedef typename std::remove_const::type size_type; + typedef const size_type const_size_type; + + typedef typename std::remove_const::type nnz_lno_t; + typedef const nnz_lno_t const_nnz_lno_t; + + typedef typename std::remove_const::type nnz_scalar_t; + typedef const nnz_scalar_t const_nnz_scalar_t; + + + typedef typename Kokkos::View row_lno_temp_work_view_t; + typedef typename Kokkos::View row_lno_persistent_work_view_t; + typedef typename row_lno_persistent_work_view_t::HostMirror row_lno_persistent_work_host_view_t; //Host view type + + typedef typename Kokkos::View scalar_temp_work_view_t; + typedef typename Kokkos::View scalar_persistent_work_view_t; + typedef typename scalar_persistent_work_view_t::HostMirror scalar_persistent_work_host_view_t; //Host view type + + typedef typename Kokkos::View nnz_lno_temp_work_view_t; + typedef typename Kokkos::View nnz_lno_persistent_work_view_t; + typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type + + private: row_lno_persistent_work_view_t permuted_xadj; nnz_lno_persistent_work_view_t permuted_adj; scalar_persistent_work_view_t permuted_adj_vals; nnz_lno_persistent_work_view_t old_to_new_map; - bool called_symbolic; - bool called_numeric; - - scalar_persistent_work_view_t permuted_y_vector; scalar_persistent_work_view_t permuted_x_vector; - int suggested_vector_size; - int suggested_team_size; - scalar_persistent_work_view_t permuted_inverse_diagonal; nnz_lno_t block_size; //this is for block sgs - nnz_lno_t max_nnz_input_row, num_values_in_l1, num_values_in_l2, num_big_rows; + nnz_lno_t max_nnz_input_row; + + nnz_lno_t num_values_in_l1, num_values_in_l2, num_big_rows; size_t level_1_mem, level_2_mem; + bool owner_of_coloring; public: /** * \brief Default constructor. 
*/ - GaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT): - owner_of_coloring(false), - algorithm_type(gs), - color_set_xadj(), color_sets(), numColors(0), - permuted_xadj(), permuted_adj(), permuted_adj_vals(), old_to_new_map(), - called_symbolic(false), called_numeric(false), permuted_y_vector(), permuted_x_vector(), - suggested_vector_size(0), suggested_team_size(0), permuted_inverse_diagonal(), block_size(1), max_nnz_input_row(-1), - num_values_in_l1(-1), num_values_in_l2(-1),num_big_rows(0), level_1_mem(0), level_2_mem(0) + PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT) : + GSHandle(gs), + permuted_xadj(), permuted_adj(), permuted_adj_vals(), old_to_new_map(), + permuted_y_vector(), permuted_x_vector(), + permuted_inverse_diagonal(), block_size(1), + max_nnz_input_row(-1), + num_values_in_l1(-1), num_values_in_l2(-1),num_big_rows(0), level_1_mem(0), level_2_mem(0), + owner_of_coloring(false) { - if (gs == GS_DEFAULT){ + if (gs == GS_DEFAULT) this->choose_default_algorithm(); - } - - } + bool is_owner_of_coloring() const override {return this->owner_of_coloring;} + void set_owner_of_coloring(bool owner = true) {this->owner_of_coloring = owner;} + void set_block_size(nnz_lno_t bs){this->block_size = bs; } - nnz_lno_t get_block_size(){return this->block_size;} + nnz_lno_t get_block_size() const {return this->block_size;} /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. */ @@ -186,51 +324,28 @@ namespace KokkosSparse{ #endif } - virtual ~GaussSeidelHandle(){}; + ~PointGaussSeidelHandle() = default; //getters - GSAlgorithm get_algorithm_type() const {return this->algorithm_type;} - bool is_owner_of_coloring() const {return this->owner_of_coloring;} - - nnz_lno_persistent_work_host_view_t get_color_xadj() { - return this->color_set_xadj; - } - nnz_lno_persistent_work_view_t get_color_adj() { - return this->color_sets; - } - nnz_lno_t get_num_colors() { - return this->numColors; - } - - row_lno_persistent_work_view_t get_new_xadj() { + row_lno_persistent_work_view_t get_new_xadj() const { return this->permuted_xadj; } - nnz_lno_persistent_work_view_t get_new_adj() { + nnz_lno_persistent_work_view_t get_new_adj() const { return this->permuted_adj; } - scalar_persistent_work_view_t get_new_adj_val() { + scalar_persistent_work_view_t get_new_adj_val() const { return this->permuted_adj_vals; } - nnz_lno_persistent_work_view_t get_old_to_new_map() { + nnz_lno_persistent_work_view_t get_old_to_new_map() const { return this->old_to_new_map; } - bool is_symbolic_called(){return this->called_symbolic;} - bool is_numeric_called(){return this->called_numeric;} - //setters void set_algorithm_type(const GSAlgorithm &sgs_algo){this->algorithm_type = sgs_algo;} - void set_owner_of_coloring(bool owner = true){this->owner_of_coloring = owner;} void set_call_symbolic(bool call = true){this->called_symbolic = call;} void set_call_numeric(bool call = true){this->called_numeric = call;} - void set_color_set_xadj(const nnz_lno_persistent_work_host_view_t &color_set_xadj_) { - this->color_set_xadj = color_set_xadj_; - } - void set_color_set_adj(const nnz_lno_persistent_work_view_t &color_sets_) { - this->color_sets = color_sets_; - } void set_num_colors(const nnz_lno_t &numColors_) { this->numColors = numColors_; } @@ -251,7 +366,7 @@ namespace KokkosSparse{ this->permuted_inverse_diagonal = permuted_inverse_diagonal_; } - scalar_persistent_work_view_t get_permuted_inverse_diagonal (){ + scalar_persistent_work_view_t get_permuted_inverse_diagonal() const { 
return this->permuted_inverse_diagonal; } @@ -277,31 +392,30 @@ namespace KokkosSparse{ size_t get_level_1_mem() const { return this->level_1_mem; } - size_t get_level_2_mem()const { + size_t get_level_2_mem() const { return this->level_2_mem; } - nnz_lno_t get_num_values_in_l1()const { + nnz_lno_t get_num_values_in_l1() const { return this->num_values_in_l1 ; } - nnz_lno_t get_num_values_in_l2()const { - return this->num_values_in_l2 ; + nnz_lno_t get_num_values_in_l2() const { + return this->num_values_in_l2; } - nnz_lno_t get_num_big_rows()const { + nnz_lno_t get_num_big_rows() const { return this->num_big_rows; } - - nnz_lno_t get_max_nnz() const{ - return this->max_nnz_input_row ; + nnz_lno_t get_max_nnz() const { + if(max_nnz_input_row == static_cast(-1)) + throw std::runtime_error("Requested max nnz per input row, but this has not been set in the PointGS handle."); + return this->max_nnz_input_row; } - - void set_max_nnz(nnz_lno_t num_result_nnz_){ + void set_max_nnz(nnz_lno_t num_result_nnz_) { this->max_nnz_input_row = num_result_nnz_; } - void allocate_x_y_vectors(nnz_lno_t num_rows, nnz_lno_t num_cols){ if(permuted_y_vector.extent(0) != size_t(num_rows)){ permuted_y_vector = scalar_persistent_work_view_t("PERMUTED Y VECTOR", num_rows); @@ -311,8 +425,8 @@ namespace KokkosSparse{ } } - scalar_persistent_work_view_t get_permuted_y_vector (){return this->permuted_y_vector;} - scalar_persistent_work_view_t get_permuted_x_vector (){return this->permuted_x_vector;} + scalar_persistent_work_view_t get_permuted_y_vector() const {return this->permuted_y_vector;} + scalar_persistent_work_view_t get_permuted_x_vector() const {return this->permuted_x_vector;} void vector_team_size( int max_allowed_team_size, int &suggested_vector_size_, @@ -339,8 +453,150 @@ namespace KokkosSparse{ } } + }; + + template + class ClusterGaussSeidelHandle + : public GaussSeidelHandle + { + public: + typedef GaussSeidelHandle GSHandle; + typedef ExecutionSpace HandleExecSpace; + typedef TemporaryMemorySpace HandleTempMemorySpace; + typedef PersistentMemorySpace HandlePersistentMemorySpace; + + typedef typename std::remove_const::type size_type; + typedef const size_type const_size_type; + + typedef typename std::remove_const::type nnz_lno_t; + typedef const nnz_lno_t const_nnz_lno_t; + + typedef typename std::remove_const::type nnz_scalar_t; + typedef const nnz_scalar_t const_nnz_scalar_t; + + typedef typename Kokkos::View row_lno_temp_work_view_t; + typedef typename Kokkos::View row_lno_persistent_work_view_t; + typedef typename row_lno_persistent_work_view_t::HostMirror row_lno_persistent_work_host_view_t; //Host view type + + typedef typename Kokkos::View scalar_temp_work_view_t; + typedef typename Kokkos::View scalar_persistent_work_view_t; + typedef typename scalar_persistent_work_view_t::HostMirror scalar_persistent_work_host_view_t; //Host view type + + typedef typename Kokkos::View nnz_lno_temp_work_view_t; + typedef typename Kokkos::View nnz_lno_persistent_work_view_t; + typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type + + private: + + ClusteringAlgorithm cluster_algo; + //This is the user-configurable target cluster size. + //Some clusters may be slightly larger or smaller, + //but cluster_xadj always has the exact size of each. 
+ nnz_lno_t cluster_size; + + int suggested_vector_size; + int suggested_team_size; + + scalar_persistent_work_view_t inverse_diagonal; + + //cluster_xadj and cluster_adj encode the vertices in each cluster + nnz_lno_persistent_work_view_t cluster_xadj; + nnz_lno_persistent_work_view_t cluster_adj; + //vert_clusters(i) is the cluster that vertex i belongs to + nnz_lno_persistent_work_view_t vert_clusters; + + public: + + /** + * \brief Default constructor. + */ + + //Constructor for cluster-coloring based GS and SGS + ClusterGaussSeidelHandle(ClusteringAlgorithm cluster_algo_, nnz_lno_t cluster_size_) + : GSHandle(GS_CLUSTER), cluster_algo(cluster_algo_), cluster_size(cluster_size_), + inverse_diagonal(), cluster_xadj(), cluster_adj(), vert_clusters() + {} + + void set_cluster_size(nnz_lno_t cs) {this->cluster_size = cs;} + nnz_lno_t get_cluster_size() const {return this->cluster_size;} + + void set_vert_clusters(nnz_lno_persistent_work_view_t& vert_clusters_) { + this->vert_clusters = vert_clusters_; + } + void set_cluster_xadj(nnz_lno_persistent_work_view_t& cluster_xadj_) { + this->cluster_xadj = cluster_xadj_; + } + void set_cluster_adj(nnz_lno_persistent_work_view_t& cluster_adj_) { + this->cluster_adj = cluster_adj_; + } + + nnz_lno_persistent_work_view_t get_vert_clusters() const { + if(!this->is_symbolic_called()) + throw std::runtime_error("vert_clusters does not exist until after symbolic setup."); + return vert_clusters; + } + + nnz_lno_persistent_work_view_t get_cluster_xadj() const { + if(!this->is_symbolic_called()) + throw std::runtime_error("cluster_xadj does not exist until after symbolic setup."); + return cluster_xadj; + } + + nnz_lno_persistent_work_view_t get_cluster_adj() const { + if(!this->is_symbolic_called()) + throw std::runtime_error("cluster_adj does not exist until after symbolic setup."); + return cluster_adj; + } + + void set_inverse_diagonal(scalar_persistent_work_view_t& inv_diag) { + this->inverse_diagonal = inv_diag; + } + + scalar_persistent_work_view_t get_inverse_diagonal() const { + if(!this->is_symbolic_called()) + throw std::runtime_error("inverse diagonal does not exist until after numeric setup."); + return inverse_diagonal; + } + + bool use_teams() const + { +#if defined( KOKKOS_ENABLE_SERIAL ) + if (Kokkos::Impl::is_same< Kokkos::Serial , ExecutionSpace >::value) { + return false; + } +#endif +#if defined( KOKKOS_ENABLE_THREADS ) + if (Kokkos::Impl::is_same< Kokkos::Threads , ExecutionSpace >::value){ + return false; + } +#endif +#if defined( KOKKOS_ENABLE_OPENMP ) + if (Kokkos::Impl::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ + return false; + } +#endif +#if defined( KOKKOS_ENABLE_CUDA ) + if (Kokkos::Impl::is_same::value){ + return true; + } +#endif +#if defined( KOKKOS_ENABLE_QTHREAD) + if (Kokkos::Impl::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + return false; + } +#endif + return false; + } + + ~ClusterGaussSeidelHandle() = default; + + ClusteringAlgorithm get_clustering_algo() const {return this->cluster_algo;} }; } #endif + diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp new file mode 100644 index 0000000000..782f8247ab --- /dev/null +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -0,0 +1,1186 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of 
Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSCGSIMP_HPP +#define _KOKKOSCGSIMP_HPP + +#include "KokkosKernels_Utils.hpp" +#include +#include +#include +#include +#include +#include +#include "KokkosGraph_Distance1Color.hpp" +#include "KokkosKernels_BitUtils.hpp" +#include "KokkosKernels_SimpleUtils.hpp" +#include "KokkosSparse_partitioning_impl.hpp" + +namespace KokkosSparse{ + namespace Impl{ + + template + class ClusterGaussSeidel{ + + public: + + typedef lno_row_view_t_ in_lno_row_view_t; + typedef lno_nnz_view_t_ in_lno_nnz_view_t; + typedef scalar_nnz_view_t_ in_scalar_nnz_view_t; + + typedef typename HandleType::HandleExecSpace MyExecSpace; + typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + + + typedef typename in_lno_row_view_t::non_const_value_type row_lno_t; + + typedef typename HandleType::size_type size_type; + typedef typename HandleType::nnz_lno_t nnz_lno_t; + typedef typename HandleType::nnz_scalar_t nnz_scalar_t; + + + typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; + typedef typename in_lno_row_view_t::non_const_type non_const_lno_row_view_t; + + typedef typename lno_nnz_view_t_::const_type const_lno_nnz_view_t; + typedef typename lno_nnz_view_t_::non_const_type non_const_lno_nnz_view_t; + + typedef typename scalar_nnz_view_t_::const_type const_scalar_nnz_view_t; + typedef typename scalar_nnz_view_t_::non_const_type non_const_scalar_nnz_view_t; + + typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; + typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; + typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; 
//Host view type + + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type + + typedef typename HandleType::scalar_temp_work_view_t scalar_temp_work_view_t; + typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; + + typedef Kokkos::RangePolicy my_exec_space; + typedef nnz_lno_t color_t; + typedef Kokkos::View color_view_t; + typedef Kokkos::Bitset bitset_t; + typedef Kokkos::ConstBitset const_bitset_t; + + typedef Kokkos::TeamPolicy team_policy_t ; + typedef typename team_policy_t::member_type team_member_t ; + + private: + HandleType *handle; + + //Get the specialized ClusterGaussSeidel handle from the main handle + typename HandleType::ClusterGaussSeidelHandleType* get_gs_handle() + { + auto *gsHandle = dynamic_cast(this->handle->get_gs_handle()); + if(!gsHandle) + { + throw std::runtime_error("ClusterGaussSeidel: GS handle has not been created, or is set up for Point GS."); + } + return gsHandle; + } + + nnz_lno_t num_rows, num_cols; + + const_lno_row_view_t row_map; + const_lno_nnz_view_t entries; + const_scalar_nnz_view_t values; + + const_scalar_nnz_view_t given_inverse_diagonal; + + bool have_diagonal_given; + bool is_symmetric; + + public: + + struct PSGS_ForwardTag {}; + struct PSGS_BackwardTag {}; + + template + struct PSGS + { + // CSR storage of the matrix. + const_lno_row_view_t _xadj; + const_lno_nnz_view_t _adj; + const_scalar_nnz_view_t _adj_vals; + + //Input/output vectors, as in Ax = y + x_value_array_type _Xvector; + y_value_array_type _Yvector; + nnz_lno_persistent_work_view_t _color_adj; + nnz_lno_persistent_work_view_t _cluster_offsets; + nnz_lno_persistent_work_view_t _cluster_verts; + scalar_persistent_work_view_t _inverse_diagonal; + nnz_scalar_t _omega; + + nnz_lno_t _color_set_begin; + nnz_lno_t _color_set_end; + bool _forward_direction; + + PSGS(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, const_scalar_nnz_view_t adj_vals_, + x_value_array_type Xvector_, y_value_array_type Yvector_, + nnz_lno_persistent_work_view_t color_adj_, + nnz_lno_persistent_work_view_t cluster_offsets_, nnz_lno_persistent_work_view_t cluster_verts_, + nnz_scalar_t omega_, + scalar_persistent_work_view_t inverse_diagonal_) + : + _xadj (xadj_), + _adj (adj_), + _adj_vals (adj_vals_), + _Xvector (Xvector_), + _Yvector (Yvector_), + _color_adj (color_adj_), + _cluster_offsets (cluster_offsets_), + _cluster_verts (cluster_verts_), + _inverse_diagonal (inverse_diagonal_), + _omega (omega_), + _color_set_begin (0), + _color_set_end (0), + _forward_direction(true) + {} + + KOKKOS_FORCEINLINE_FUNCTION + void rowApply(const nnz_lno_t row) const + { + size_type row_begin = _xadj(row); + size_type row_end = _xadj(row + 1); + nnz_scalar_t sum = _Yvector(row); + for (size_type adjind = row_begin; adjind < row_end; ++adjind) + { + nnz_lno_t col = _adj(adjind); + nnz_scalar_t val = _adj_vals(adjind); + sum -= val * _Xvector(col); + } + _Xvector(row) += _omega * sum * _inverse_diagonal(row); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const PSGS_ForwardTag, const nnz_lno_t ii) const { + //color_adj(ii) is a cluster in the current color set. 
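// Each work item relaxes one whole cluster: the rows inside it are updated one after
// another (Gauss-Seidel coupling within the cluster), while clusters of the same color
// are non-adjacent in the cluster graph and can safely be relaxed in parallel.
// rowApply() above performs the usual damped update
//   x(row) += omega * (y(row) - (A*x)(row)) * inverse_diagonal(row).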
+ nnz_lno_t clusterColorsetIndex = _color_set_begin + ii; + nnz_lno_t cluster = _color_adj(clusterColorsetIndex); + for(nnz_lno_t j = _cluster_offsets(cluster); j < _cluster_offsets(cluster + 1); j++) + { + rowApply(_cluster_verts(j)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const PSGS_BackwardTag, const nnz_lno_t ii) const { + //color_adj(ii) is a cluster in the current color set. + nnz_lno_t clusterColorsetIndex = _color_set_end - 1 - ii; + nnz_lno_t cluster = _color_adj(clusterColorsetIndex); + for(nnz_lno_t j = _cluster_offsets(cluster + 1); j > _cluster_offsets(cluster); j--) + { + rowApply(_cluster_verts(j - 1)); + } + } + }; + + template + struct Team_PSGS + { + //CSR storage of the matrix + const_lno_row_view_t _xadj; + const_lno_nnz_view_t _adj; + const_scalar_nnz_view_t _adj_vals; + + //X,Y vectors, as in Ax = y + x_value_array_type _Xvector; + y_value_array_type _Yvector; + nnz_lno_t _color_set_begin; + nnz_lno_t _color_set_end; + nnz_lno_persistent_work_view_t _color_adj; + nnz_lno_persistent_work_view_t _cluster_offsets; + nnz_lno_persistent_work_view_t _cluster_verts; + + //_clusters_per_team tries to reach the same total work per + //team by dividing the handle's heuristic get_ + nnz_lno_t _clusters_per_team; + + scalar_persistent_work_view_t _inverse_diagonal; + + bool _is_backward; + + nnz_scalar_t _omega; + + Team_PSGS(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, const_scalar_nnz_view_t adj_vals_, + x_value_array_type Xvector_, y_value_array_type Yvector_, + nnz_lno_t color_set_begin_, nnz_lno_t color_set_end_, + nnz_lno_persistent_work_view_t color_adj_, + nnz_lno_persistent_work_view_t cluster_offsets_, + nnz_lno_persistent_work_view_t cluster_verts_, + scalar_persistent_work_view_t inverse_diagonal_, + nnz_lno_t clusters_per_team_, + nnz_scalar_t omega_ = Kokkos::Details::ArithTraits::one()) : + _xadj( xadj_), + _adj( adj_), + _adj_vals( adj_vals_), + _Xvector(Xvector_), + _Yvector(Yvector_), + _color_set_begin(color_set_begin_), + _color_set_end(color_set_end_), + _color_adj(color_adj_), + _cluster_offsets(cluster_offsets_), + _cluster_verts(cluster_verts_), + _clusters_per_team(clusters_per_team_), + _inverse_diagonal(inverse_diagonal_), + _is_backward(false), + _omega(omega_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member_t& teamMember) const + { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, _clusters_per_team), + [&](const nnz_lno_t work) + { + nnz_lno_t ii = _color_set_begin + (teamMember.league_rank() * _clusters_per_team) + work; + if (ii >= _color_set_end) + return; + nnz_lno_t cluster = _color_adj(ii); + for(nnz_lno_t j = _cluster_offsets(cluster); j < _cluster_offsets(cluster + 1); j++) + { + nnz_lno_t row = _cluster_verts(j); + size_type row_begin = _xadj(row); + size_type row_end = _xadj(row + 1); + nnz_scalar_t dotprod = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), + [&] (size_type i, nnz_scalar_t& lsum) + { + size_type adjind = i + row_begin; + nnz_lno_t colIndex = _adj(adjind); + nnz_scalar_t val = _adj_vals(adjind); + lsum += val * _Xvector(colIndex); + }, dotprod); + Kokkos::single(Kokkos::PerThread(teamMember),[=] () { + _Xvector(row) += _omega * (_Yvector(row) - dotprod) * _inverse_diagonal(row); + }); + } + }); + } + }; + + /** + * \brief constructor + */ + + ClusterGaussSeidel(HandleType *handle_, + nnz_lno_t num_rows_, + nnz_lno_t num_cols_, + const_lno_row_view_t row_map_, + const_lno_nnz_view_t entries_, + const_scalar_nnz_view_t values_): + 
handle(handle_), num_rows(num_rows_), num_cols(num_cols_), + row_map(row_map_), entries(entries_), values(values_), + have_diagonal_given(false), + is_symmetric(true) + {} + + ClusterGaussSeidel(HandleType *handle_, + nnz_lno_t num_rows_, + nnz_lno_t num_cols_, + const_lno_row_view_t row_map_, + const_lno_nnz_view_t entries_, + bool is_symmetric_ = true): + handle(handle_), + num_rows(num_rows_), num_cols(num_cols_), + row_map(row_map_), + entries(entries_), + values(), + have_diagonal_given(false), + is_symmetric(is_symmetric_) + {} + + /** + * \brief constructor + */ + ClusterGaussSeidel(HandleType *handle_, + nnz_lno_t num_rows_, + nnz_lno_t num_cols_, + const_lno_row_view_t row_map_, + const_lno_nnz_view_t entries_, + const_scalar_nnz_view_t values_, + bool is_symmetric_): + handle(handle_), + num_rows(num_rows_), num_cols(num_cols_), + row_map(row_map_), entries(entries_), values(values_), + have_diagonal_given(false), + is_symmetric(is_symmetric_) + {} + + ClusterGaussSeidel( + HandleType *handle_, + nnz_lno_t num_rows_, + nnz_lno_t num_cols_, + const_lno_row_view_t row_map_, + const_lno_nnz_view_t entries_, + const_scalar_nnz_view_t values_, + const_scalar_nnz_view_t given_inverse_diagonal_, + bool is_symmetric_): + handle(handle_), + num_rows(num_rows_), num_cols(num_cols_), + row_map(row_map_), entries(entries_), values(values_), + given_inverse_diagonal(given_inverse_diagonal_), + have_diagonal_given(true), + is_symmetric(is_symmetric_) + {} + + //Functor to swap the numbers of two colors, + //so that the last cluster has the last color. + //Except, doesn't touch the color of the last cluster, + //since that value is needed the entire time this is running. + struct ClusterColorRelabelFunctor + { + typedef typename HandleType::GraphColoringHandleType GCHandle; + typedef typename GCHandle::color_view_t ColorView; + typedef Kokkos::View RowmapView; + typedef Kokkos::View EntriesView; + ClusterColorRelabelFunctor(ColorView& colors_, color_t numClusterColors_, nnz_lno_t numClusters_) + : colors(colors_), numClusterColors(numClusterColors_), numClusters(numClusters_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const + { + if(colors(i) == numClusterColors) + colors(i) = colors(numClusters - 1); + else if(colors(i) == colors(numClusters - 1)) + colors(i) = numClusterColors; + } + + ColorView colors; + color_t numClusterColors; + nnz_lno_t numClusters; + }; + + //Relabel the last cluster, after running ClusterColorRelabelFunctor. + //Call with a one-element range policy. 
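// A minimal sketch of how the two relabeling functors might be chained after coloring
// the cluster graph (hypothetical call site; colors, numClusterColors and numClusters
// would come from the coloring handle):
//   Kokkos::parallel_for(my_exec_space(0, numClusters),
//       ClusterColorRelabelFunctor(colors, numClusterColors, numClusters));
//   Kokkos::parallel_for(my_exec_space(0, 1),
//       RelabelLastColorFunctor(colors, numClusterColors, numClusters));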
+ struct RelabelLastColorFunctor + { + typedef typename HandleType::GraphColoringHandleType GCHandle; + typedef typename GCHandle::color_view_t ColorView; + + RelabelLastColorFunctor(ColorView& colors_, color_t numClusterColors_, nnz_lno_t numClusters_) + : colors(colors_), numClusterColors(numClusterColors_), numClusters(numClusters_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const size_type) const + { + colors(numClusters - 1) = numClusterColors; + } + + ColorView colors; + color_t numClusterColors; + nnz_lno_t numClusters; + }; + + struct ClusterToVertexColoring + { + typedef typename HandleType::GraphColoringHandleType GCHandle; + typedef typename GCHandle::color_view_t ColorView; + + ClusterToVertexColoring(ColorView& clusterColors_, ColorView& vertexColors_, nnz_lno_t numRows_, nnz_lno_t numClusters_, nnz_lno_t clusterSize_) + : clusterColors(clusterColors_), vertexColors(vertexColors_), numRows(numRows_), numClusters(numClusters_), clusterSize(clusterSize_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const + { + size_type cluster = i / clusterSize; + size_type clusterOffset = i - cluster * clusterSize; + vertexColors(i) = ((clusterColors(cluster) - 1) * clusterSize) + clusterOffset + 1; + } + + ColorView clusterColors; + ColorView vertexColors; + nnz_lno_t numRows; + nnz_lno_t numClusters; + nnz_lno_t clusterSize; + }; + + template + struct ClusterSizeFunctor + { + ClusterSizeFunctor(nnz_view_t& counts_, nnz_view_t& vertClusters_) + : counts(counts_), vertClusters(vertClusters_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const + { + Kokkos::atomic_increment(&counts(vertClusters(i))); + } + nnz_view_t counts; + nnz_view_t vertClusters; + }; + + template + struct FillClusterVertsFunctor + { + FillClusterVertsFunctor(nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, nnz_view_t& insertCounts_) + : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const + { + nnz_lno_t cluster = vertClusters(i); + nnz_lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); + clusterVerts(offset) = i; + } + nnz_view_t clusterOffsets; + nnz_view_t clusterVerts; + nnz_view_t vertClusters; + nnz_view_t insertCounts; + }; + + template + struct BuildCrossClusterMaskFunctor + { + BuildCrossClusterMaskFunctor(Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& mask_) + : rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) + {} + + //Used a fixed-size hash set in shared memory + KOKKOS_INLINE_FUNCTION constexpr int tableSize() const + { + //Should always be a power-of-two, so that X % tableSize() reduces to a bitwise and. + return 512; + } + + //Given a cluster index, get the hash table index. + //This is the 32-bit xorshift RNG, but it works as a hash function. 
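// lookup() and insert() below probe just two consecutive slots, h % tableSize() and
// (h + 1) % tableSize(). A failed insert is harmless: the caller still sets the edge
// bit in the mask, so the cluster graph can only be overallocated, never undercounted.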
+ KOKKOS_INLINE_FUNCTION unsigned xorshiftHash(nnz_lno_t cluster) const + { + unsigned x = cluster; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x; + } + + KOKKOS_INLINE_FUNCTION bool lookup(nnz_lno_t cluster, int* table) const + { + unsigned h = xorshiftHash(cluster); + for(unsigned i = h; i < h + 2; i++) + { + if(table[i % tableSize()] == cluster) + return true; + } + return false; + } + + //Try to insert the edge between cluster (team's cluster) and neighbor (neighboring cluster) + //by inserting nei into the table. + KOKKOS_INLINE_FUNCTION bool insert(nnz_lno_t cluster, nnz_lno_t nei, int* table) const + { + unsigned h = xorshiftHash(nei); + for(unsigned i = h; i < h + 2; i++) + { + if(Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) + return true; + } + return false; + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t t) const + { + nnz_lno_t cluster = t.league_rank(); + nnz_lno_t clusterSize = clusterOffsets(cluster + 1) - clusterOffsets(cluster); + //Use a fixed-size hash table per thread to accumulate neighbor of the cluster. + //If it fills up (very unlikely) then just count every remaining edge going to another cluster + //not already in the table; this provides a reasonable upper bound for overallocating the cluster graph. + //each thread handles a cluster + int* table = (int*) t.team_shmem().get_shmem(tableSize() * sizeof(int)); + //mark every entry as cluster (self-loop) to represent free/empty + Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), + [&](const nnz_lno_t i) + { + table[i] = cluster; + }); + t.team_barrier(); + //now, for each row belonging to the cluster, iterate through the neighbors + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), + [&] (const nnz_lno_t i) + { + nnz_lno_t row = clusterVerts(clusterOffsets(cluster) + i); + nnz_lno_t rowDeg = rowmap(row + 1) - rowmap(row); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), + [&] (const nnz_lno_t j) + { + nnz_lno_t nei = colinds(rowmap(row) + j); + nnz_lno_t neiCluster = vertClusters(nei); + if(neiCluster != cluster) + { + //Have a neighbor. Try to find it in the table. + if(!lookup(neiCluster, table)) + { + //Not in the table. Try to insert it. 
+ insert(cluster, neiCluster, table); + //Whether or not insertion succeeded, + //this is a cross-cluster edge possibly not seen before + mask.set(rowmap(row) + j); + } + } + }); + }); + } + + size_t team_shmem_size(int teamSize) const + { + return tableSize() * sizeof(int); + } + + Rowmap rowmap; + Colinds colinds; + nnz_view_t clusterOffsets; + nnz_view_t clusterVerts; + nnz_view_t vertClusters; + bitset_t mask; + }; + + template + struct FillClusterEntriesFunctor + { + FillClusterEntriesFunctor( + Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterRowmap_, nnz_view_t& clusterEntries_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& edgeMask_) + : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), clusterEntries(clusterEntries_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), edgeMask(edgeMask_) + {} + //Run this scan over entries in clusterVerts (reordered point rows) + KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i, nnz_lno_t& lcount, const bool& finalPass) const + { + nnz_lno_t numRows = rowmap.extent(0) - 1; + nnz_lno_t row = clusterVerts(i); + size_type rowStart = rowmap(row); + size_type rowEnd = rowmap(row + 1); + nnz_lno_t cluster = vertClusters(row); + nnz_lno_t clusterStart = clusterOffsets(cluster); + //Count the number of entries in this row. + //This is how much lcount will be increased by, + //yielding the offset corresponding to + //these point entries in the cluster entries. + nnz_lno_t rowEntries = 0; + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + rowEntries++; + } + if(finalPass) + { + //if this is the last row in the cluster, update the upper bound in clusterRowmap + if(i == clusterStart) + { + clusterRowmap(cluster) = lcount; + } + nnz_lno_t clusterEdge = lcount; + //populate clusterEntries for these edges + for(size_type j = rowStart; j < rowEnd; j++) + { + if(edgeMask.test(j)) + { + clusterEntries(clusterEdge++) = vertClusters(colinds(j)); + } + } + } + //update the scan result at the end (exclusive) + lcount += rowEntries; + if(i == numRows - 1 && finalPass) + { + //on the very last row, set the last entry of the cluster rowmap + clusterRowmap(clusterRowmap.extent(0) - 1) = lcount; + } + } + Rowmap rowmap; + Colinds colinds; + nnz_view_t clusterRowmap; + nnz_view_t clusterEntries; + nnz_view_t clusterOffsets; + nnz_view_t clusterVerts; + nnz_view_t vertClusters; + const_bitset_t edgeMask; + }; + + //Assign cluster labels to vertices, given that the vertices are naturally + //ordered so that contiguous groups of vertices form decent clusters. 
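// For example, with clusterSize = 4 this labels vertices 0-3 as cluster 0, vertices 4-7
// as cluster 1, and so on (vertClusters(i) = i / clusterSize); this is the functor used
// by the CLUSTER_DO_NOTHING option below.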
+ template + struct NopVertClusteringFunctor + { + NopVertClusteringFunctor(View& vertClusters_, nnz_lno_t clusterSize_) : + vertClusters(vertClusters_), + numRows(vertClusters.extent(0)), + clusterSize(clusterSize_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const + { + vertClusters(i) = i / clusterSize; + } + View vertClusters; + nnz_lno_t numRows; + nnz_lno_t clusterSize; + }; + + template + struct ReorderedClusteringFunctor + { + ReorderedClusteringFunctor(View& vertClusters_, View& ordering_, nnz_lno_t clusterSize_) : + vertClusters(vertClusters_), + ordering(ordering_), + numRows(vertClusters.extent(0)), + clusterSize(clusterSize_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t i) const + { + vertClusters(i) = ordering(i) / clusterSize; + } + View vertClusters; + View ordering; + nnz_lno_t numRows; + nnz_lno_t clusterSize; + }; + + + void initialize_symbolic() + { + using nnz_view_t = nnz_lno_persistent_work_view_t; + using in_rowmap_t = const_lno_row_view_t; + using in_colinds_t = const_lno_nnz_view_t; + using rowmap_t = Kokkos::View; + using colinds_t = Kokkos::View; + using raw_rowmap_t = Kokkos::View>; + using raw_colinds_t = Kokkos::View>; + auto gsHandle = get_gs_handle(); +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + Kokkos::Impl::Timer timer; +#endif + //sym_xadj/sym_adj is only used here for clustering. + //Create them as non-const, unmanaged views to avoid + //duplicating a bunch of code between the + //symmetric and non-symmetric input cases. + rowmap_t sym_xadj; + colinds_t sym_adj; + if(!this->is_symmetric) + { + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (num_rows, this->row_map, this->entries, sym_xadj, sym_adj); +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + std::cout << "SYMMETRIZING TIME: " << timer.seconds() << std::endl; + timer.reset(); +#endif + } + //Now that a symmetric graph is available, build the cluster graph (also symmetric) + nnz_lno_t clusterSize = gsHandle->get_cluster_size(); + nnz_lno_t numClusters = (num_rows + clusterSize - 1) / clusterSize; + nnz_view_t clusterOffsets("Cluster offsets", numClusters + 1); + nnz_view_t clusterVerts("Cluster -> vertices", num_rows); + raw_rowmap_t raw_sym_xadj; + raw_colinds_t raw_sym_adj; + if(this->is_symmetric) + { + raw_sym_xadj = raw_rowmap_t(this->row_map.data(), this->row_map.extent(0)); + raw_sym_adj = raw_colinds_t(this->entries.data(), this->entries.extent(0)); + } + else + { + raw_sym_xadj = raw_rowmap_t(sym_xadj.data(), sym_xadj.extent(0)); + raw_sym_adj = raw_colinds_t(sym_adj.data(), sym_adj.extent(0)); + } + bool onCuda = false; +#ifdef KOKKOS_ENABLE_CUDA + onCuda = std::is_same::value; +#endif + nnz_view_t vertClusters; + auto clusterAlgo = gsHandle->get_clustering_algo(); + if(clusterAlgo == CLUSTER_DEFAULT) + { + //Use CM if > 50 entries per row, otherwise balloon clustering. + //CM is quite fast on CPUs if the level sets fan out quickly, otherwise slow and non-scalable. 
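// (For instance, a 3D 27-point-stencil matrix averages about 27 entries per row, so the
// default would fall through to balloon clustering even on the host, while a much denser
// problem on a CPU backend would pick Cuthill-McKee; on CUDA the balloon path is always
// chosen here.)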
+ if(!onCuda && (raw_sym_adj.extent(0) / num_rows > 50)) + clusterAlgo = CLUSTER_CUTHILL_MCKEE; + else + clusterAlgo = CLUSTER_BALLOON; + } + switch(clusterAlgo) + { + case CLUSTER_CUTHILL_MCKEE: + { + RCM rcm(num_rows, raw_sym_xadj, raw_sym_adj); + nnz_view_t cmOrder = rcm.cuthill_mckee(); + vertClusters = nnz_view_t("Cluster labels", num_rows); + Kokkos::parallel_for(my_exec_space(0, num_rows), ReorderedClusteringFunctor(vertClusters, cmOrder, clusterSize)); + break; + } + case CLUSTER_BALLOON: + { + BalloonClustering balloon(num_rows, raw_sym_xadj, raw_sym_adj); + vertClusters = balloon.run(clusterSize); + break; + } + case CLUSTER_DO_NOTHING: + { + vertClusters = nnz_view_t("Cluster labels", num_rows); + Kokkos::parallel_for(my_exec_space(0, num_rows), NopVertClusteringFunctor(vertClusters, clusterSize)); + break; + } + case CLUSTER_DEFAULT: + { + throw std::logic_error("Logic to choose default clustering algorithm is incorrect"); + } + default: + throw std::runtime_error("Clustering algo " + std::to_string((int) clusterAlgo) + " is not implemented"); + } +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + std::cout << "Graph clustering: " << timer.seconds() << '\n'; + timer.reset(); +#endif + //Construct the cluster offset and vertex array. These allow fast iteration over all vertices in a given cluster. + Kokkos::parallel_for(my_exec_space(0, num_rows), ClusterSizeFunctor(clusterOffsets, vertClusters)); + KokkosKernels::Impl::exclusive_parallel_prefix_sum(numClusters + 1, clusterOffsets); + { + nnz_view_t tempInsertCounts("Temporary cluster insert counts", numClusters); + Kokkos::parallel_for(my_exec_space(0, num_rows), FillClusterVertsFunctor(clusterOffsets, clusterVerts, vertClusters, tempInsertCounts)); + } +#if KOKKOSSPARSE_IMPL_PRINTDEBUG + { + auto clusterOffsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterOffsets); + auto clusterVertsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterVerts); + puts("Clusters (cluster #, and vertex #s):"); + for(nnz_lno_t i = 0; i < numClusters; i++) + { + printf("%d: ", (int) i); + for(nnz_lno_t j = clusterOffsetsHost(i); j < clusterOffsetsHost(i + 1); j++) + { + printf("%d ", (int) clusterVerts(j)); + } + putchar('\n'); + } + printf("\n\n\n"); + } +#endif + //Determine the set of edges (in the point graph) that cross between two distinct clusters + int vectorSize = this->handle->get_suggested_vector_size(num_rows, raw_sym_adj.extent(0)); + bitset_t crossClusterEdgeMask(raw_sym_adj.extent(0)); + size_type numClusterEdges; + { + BuildCrossClusterMaskFunctor + buildEdgeMask(raw_sym_xadj, raw_sym_adj, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask); + int sharedPerTeam = buildEdgeMask.team_shmem_size(0); //using team-size = 0 for since no per-thread shared is used. 
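// One team is launched per cluster below; each bit set in crossClusterEdgeMask marks a
// point-graph entry that crosses clusters, and count() of those bits sizes the explicit
// cluster graph built next (duplicate cluster-to-cluster entries can remain, which is
// harmless for the coloring).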
+ int teamSize = KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); + Kokkos::parallel_for(team_policy_t(numClusters, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); + numClusterEdges = crossClusterEdgeMask.count(); + } + nnz_view_t clusterRowmap = nnz_view_t("Cluster graph rowmap", numClusters + 1); + nnz_view_t clusterEntries = nnz_view_t("Cluster graph colinds", numClusterEdges); + Kokkos::parallel_scan(my_exec_space(0, num_rows), FillClusterEntriesFunctor + (raw_sym_xadj, raw_sym_adj, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, vertClusters, crossClusterEdgeMask)); +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n'; + timer.reset(); +#endif +#if KOKKOSSPARSE_IMPL_PRINTDEBUG + { + auto clusterRowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterRowmap); + auto clusterEntriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), clusterEntries); + puts("Cluster graph (cluster #, and neighbors):"); + for(nnz_lno_t i = 0; i < numClusters; i++) + { + printf("%d: ", (int) i); + for(nnz_lno_t j = clusterRowmapHost(i); j < clusterRowmapHost(i + 1); j++) + { + printf("%d ", (int) clusterEntriesHost(j)); + } + putchar('\n'); + } + printf("\n\n\n"); + } +#endif + //Get the coloring of the cluster graph. + typename HandleType::GraphColoringHandleType::color_view_t colors; + color_t numColors; +#if KOKKOSSPARSE_IMPL_RUNSEQUENTIAL + numColors = numClusters; + std::cout << "SEQUENTIAL CGS: numColors = numClusters = " << numClusters << '\n'; + typename HandleType::GraphColoringHandleType::color_view_t::HostMirror h_colors = Kokkos::create_mirror_view(colors); + for(int i = 0; i < numClusters; ++i){ + h_colors(i) = i + 1; + } + Kokkos::deep_copy(colors, h_colors); +#else + //Create a handle that uses nnz_lno_t as the size_type, since the cluster graph should never be larger than 2^31 entries. + KokkosKernels::Experimental::KokkosKernelsHandle kh; + kh.create_graph_coloring_handle(KokkosGraph::COLORING_DEFAULT); + KokkosGraph::Experimental::graph_color_symbolic(&kh, numClusters, numClusters, clusterRowmap, clusterEntries); + //retrieve colors + auto coloringHandle = kh.get_graph_coloring_handle(); + colors = coloringHandle->get_vertex_colors(); + numColors = coloringHandle->get_num_colors(); + kh.destroy_graph_coloring_handle(); +#endif +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + std::cout << "Coloring: " << timer.seconds() << '\n'; + timer.reset(); +#endif + nnz_lno_persistent_work_view_t color_xadj; + nnz_lno_persistent_work_view_t color_adj; + KokkosKernels::Impl::create_reverse_map + + (numClusters, numColors, colors, color_xadj, color_adj); + MyExecSpace().fence(); +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; + timer.reset(); +#endif + nnz_lno_persistent_work_host_view_t color_xadj_host(Kokkos::ViewAllocateWithoutInitializing("Color xadj"), color_xadj.extent(0)); + Kokkos::deep_copy(color_xadj_host, color_xadj); + gsHandle->set_color_xadj(color_xadj_host); + gsHandle->set_color_adj(color_adj); + gsHandle->set_num_colors(numColors); + gsHandle->set_cluster_xadj(clusterOffsets); + gsHandle->set_cluster_adj(clusterVerts); + gsHandle->set_call_symbolic(true); + } + + struct Get_Matrix_Diagonals + { + const_lno_row_view_t _xadj; + const_lno_nnz_view_t _adj; // CSR storage of the graph. 
+ const_scalar_nnz_view_t _adj_vals; // CSR storage of the graph. + scalar_persistent_work_view_t _diagonals; + + nnz_lno_t num_total_rows; + nnz_lno_t rows_per_team; + + nnz_scalar_t one; + + Get_Matrix_Diagonals( + const_lno_row_view_t xadj_, + const_lno_nnz_view_t adj_, + const_scalar_nnz_view_t adj_vals_, + scalar_persistent_work_view_t diagonals_, + nnz_lno_t num_total_rows_, + nnz_lno_t rows_per_team_) : + _xadj(xadj_), + _adj(adj_), + _adj_vals(adj_vals_), _diagonals(diagonals_), + num_total_rows(num_total_rows_), rows_per_team(rows_per_team_), + one(Kokkos::Details::ArithTraits::one()) + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const nnz_lno_t row_id) const { + size_type row_begin = _xadj(row_id); + size_type row_end = _xadj(row_id + 1); + for(size_type j = row_begin; j < row_end; j++) + { + nnz_lno_t column_id = _adj(j); + if(column_id == row_id) + { + nnz_scalar_t val = _adj_vals(j); + _diagonals(row_id) = one / val; + break; + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member_t &team) const + { + const nnz_lno_t i_begin = team.league_rank() * rows_per_team; + const nnz_lno_t i_end = i_begin + rows_per_team <= num_total_rows ? i_begin + rows_per_team : num_total_rows; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,i_begin,i_end), + [&] (const nnz_lno_t row_id) + { + size_type row_begin = _xadj(row_id); + size_type row_end = _xadj(row_id + 1); + nnz_lno_t row_size = row_end - row_begin; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, row_size), + [&] (const nnz_lno_t col_ind) + { + size_type val_index = col_ind + row_begin; + nnz_lno_t column_id = _adj(val_index); + if(column_id == row_id) + { + _diagonals(row_id) = one / _adj_vals(val_index); + return; + } + }); + }); + } + }; + + void initialize_numeric() + { + auto gsHandle = get_gs_handle(); + if(!gsHandle->is_symbolic_called()) + { + this->initialize_symbolic(); + } + //Timer for whole numeric +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + Kokkos::Impl::Timer timer; +#endif + size_type nnz = this->entries.extent(0); + + int suggested_vector_size = this->handle->get_suggested_vector_size(num_rows, nnz); + int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); + + scalar_persistent_work_view_t inverse_diagonal(Kokkos::ViewAllocateWithoutInitializing("Aii^-1"), num_rows); + nnz_lno_t rows_per_team = this->handle->get_team_work_size(suggested_team_size, MyExecSpace::concurrency(), num_rows); + + if(have_diagonal_given) { + Kokkos::deep_copy(inverse_diagonal, this->given_inverse_diagonal); + } + else { + //extract inverse diagonal from matrix + Get_Matrix_Diagonals gmd( + this->row_map, this->entries, this->values, + inverse_diagonal, + num_rows, rows_per_team); + if(gsHandle->use_teams()) + { + Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", + team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); + } + else + { + Kokkos::parallel_for("KokkosSparse::GaussSeidel::get_matrix_diagonals", + my_exec_space(0, num_rows), gmd); + } + } + gsHandle->set_inverse_diagonal(inverse_diagonal); + gsHandle->set_call_numeric(true); + MyExecSpace().fence(); +#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + std::cout << "NUMERIC:" << timer.seconds() << std::endl; +#endif + } + + template + void apply( + x_value_array_type x_lhs_output_vec, + y_value_array_type y_rhs_input_vec, + bool init_zero_x_vector = false, + int numIter = 1, + nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), + bool 
apply_forward = true, + bool apply_backward = true, + bool update_y_vector = true) + { + auto gsHandle = get_gs_handle(); + + size_type nnz = entries.extent(0); + nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); + nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj(); + + color_t numColors = gsHandle->get_num_colors(); + + if(init_zero_x_vector){ + KokkosKernels::Impl::zero_vector(num_cols, x_lhs_output_vec); + } + + scalar_persistent_work_view_t inverse_diagonal = gsHandle->get_inverse_diagonal(); + + if(gsHandle->use_teams()) + { + int suggested_vector_size = this->handle->get_suggested_vector_size(num_rows, nnz); + int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); + + nnz_lno_t rows_per_team = this->handle->get_team_work_size(suggested_team_size, MyExecSpace::concurrency(), num_rows); + //Get clusters per team. Round down to favor finer granularity, since this is sensitive to load imbalance + nnz_lno_t clusters_per_team = rows_per_team / gsHandle->get_cluster_size(); + if(clusters_per_team == 0) + clusters_per_team = 1; + + Team_PSGS gs( + this->row_map, this->entries, this->values, + x_lhs_output_vec, y_rhs_input_vec, + 0, 0, //color set range is set right before launch for each color set + color_adj, + gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(), + inverse_diagonal, + clusters_per_team, + omega); + + this->IterativeTeamPSGS( + gs, + numColors, + h_color_xadj, + suggested_team_size, + suggested_vector_size, + numIter, + apply_forward, + apply_backward); + } + else + { + PSGS gs( + this->row_map, this->entries, this->values, + x_lhs_output_vec, y_rhs_input_vec, + color_adj, + gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(), + omega, inverse_diagonal); + + this->IterativePSGS( + gs, + numColors, + h_color_xadj, + numIter, + apply_forward, + apply_backward); + } + MyExecSpace().fence(); + } + + template + void IterativeTeamPSGS( + TPSGS& gs, + color_t numColors, + nnz_lno_persistent_work_host_view_t h_color_xadj, + nnz_lno_t team_size, + nnz_lno_t vec_size, + int num_iteration, + bool apply_forward, + bool apply_backward) + { + for (int i = 0; i < num_iteration; ++i) + this->DoTeamPSGS(gs, numColors, h_color_xadj, team_size, vec_size, apply_forward, apply_backward); + } + + template + void DoTeamPSGS( + TPSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, + nnz_lno_t team_size, nnz_lno_t vec_size, + bool apply_forward, + bool apply_backward) + { + if (apply_forward) + { + gs._is_backward = false; + for (color_t i = 0; i < numColors; ++i){ + nnz_lno_t color_index_begin = h_color_xadj(i); + nnz_lno_t color_index_end = h_color_xadj(i + 1); + int overall_work = color_index_end - color_index_begin;// /256 + 1; + gs._color_set_begin = color_index_begin; + gs._color_set_end = color_index_end; + Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::forward", + team_policy_t((overall_work + gs._clusters_per_team - 1) / gs._clusters_per_team, team_size, vec_size), + gs); + MyExecSpace().fence(); + } + } + if (apply_backward) + { + gs._is_backward = true; + if (numColors > 0) + for (color_t i = numColors - 1; ; --i) { + nnz_lno_t color_index_begin = h_color_xadj(i); + nnz_lno_t color_index_end = h_color_xadj(i + 1); + nnz_lno_t overall_work = color_index_end - color_index_begin;// /256 + 1; + gs._color_set_begin = color_index_begin; + gs._color_set_end = color_index_end; + Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::forward", + 
team_policy_t((overall_work + gs._clusters_per_team - 1) / gs._clusters_per_team, team_size, vec_size), + gs); + MyExecSpace().fence(); + if (i == 0){ + break; + } + } + } + } + + template + void IterativePSGS( + PSGS& gs, + color_t numColors, + nnz_lno_persistent_work_host_view_t h_color_xadj, + int num_iteration, + bool apply_forward, + bool apply_backward) + { + for (int i = 0; i < num_iteration; ++i){ + this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward); + } + } + + template + void DoPSGS( + PSGS &gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, + bool apply_forward, + bool apply_backward) + { + if (apply_forward){ + for (color_t i = 0; i < numColors; ++i){ + nnz_lno_t color_index_begin = h_color_xadj(i); + nnz_lno_t color_index_end = h_color_xadj(i + 1); + gs._color_set_begin = color_index_begin; + gs._color_set_end = color_index_end; + Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::forward", + Kokkos::RangePolicy + (0, color_index_end - color_index_begin), gs); + MyExecSpace().fence(); + } + } + if (apply_backward && numColors){ + for (size_type i = numColors - 1; ; --i){ + nnz_lno_t color_index_begin = h_color_xadj(i); + nnz_lno_t color_index_end = h_color_xadj(i + 1); + gs._color_set_begin = color_index_begin; + gs._color_set_end = color_index_end; + Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::backward", + Kokkos::RangePolicy + (0, color_index_end - color_index_begin), gs); + MyExecSpace().fence(); + if (i == 0){ + break; + } + } + } + } + }; //class ClusterGaussSeidel + } //namespace Impl +} //namespace KokkosSparse + +#endif + diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 7a8128575b..d408e93102 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -41,25 +41,26 @@ //@HEADER */ +#ifndef _KOKKOSGSIMP_HPP +#define _KOKKOSGSIMP_HPP + #include "KokkosKernels_Utils.hpp" #include #include #include +#include #include #include #include "KokkosGraph_Distance1Color.hpp" #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" -#ifndef _KOKKOSGSIMP_HPP -#define _KOKKOSGSIMP_HPP +#include "KokkosKernels_BitUtils.hpp" +#include "KokkosKernels_SimpleUtils.hpp" namespace KokkosSparse{ - - namespace Impl{ - template - class GaussSeidel{ + class PointGaussSeidel{ public: @@ -71,14 +72,12 @@ namespace KokkosSparse{ typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - typedef typename in_lno_row_view_t::non_const_value_type row_lno_t; typedef typename HandleType::size_type size_type; typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::nnz_scalar_t nnz_scalar_t; - typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; typedef typename in_lno_row_view_t::non_const_type non_const_lno_row_view_t; @@ -88,26 +87,22 @@ namespace KokkosSparse{ typedef typename scalar_nnz_view_t_::const_type const_scalar_nnz_view_t; typedef typename scalar_nnz_view_t_::non_const_type non_const_scalar_nnz_view_t; - - - typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type - - typedef typename HandleType::nnz_lno_temp_work_view_t 
nnz_lno_temp_work_view_t; typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type - typedef typename HandleType::scalar_temp_work_view_t scalar_temp_work_view_t; typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; typedef Kokkos::RangePolicy my_exec_space; typedef nnz_lno_t color_t; typedef Kokkos::View color_view_t; + typedef Kokkos::Bitset bitset_t; + typedef Kokkos::ConstBitset const_bitset_t; typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; @@ -121,6 +116,18 @@ namespace KokkosSparse{ private: HandleType *handle; + + //Get the specialized PointGaussSeidel handle from the main handle + typename HandleType::PointGaussSeidelHandleType* get_gs_handle() + { + auto gsHandle = dynamic_cast(this->handle->get_gs_handle()); + if(!gsHandle) + { + throw std::runtime_error("PointGaussSeidel: GS handle has not been created, or is set up for Cluster GS."); + } + return gsHandle; + } + nnz_lno_t num_rows, num_cols; const_lno_row_view_t row_map; @@ -157,20 +164,20 @@ namespace KokkosSparse{ omega(omega_){} KOKKOS_INLINE_FUNCTION - void operator()(const nnz_lno_t &ii) const { - - size_type row_begin = _xadj[ii]; - size_type row_end = _xadj[ii + 1]; + void operator()(const nnz_lno_t ii) const { + size_type row_begin = _xadj(ii); + size_type row_end = _xadj(ii + 1); - nnz_scalar_t sum = _Yvector[ii]; + nnz_scalar_t sum = _Yvector(ii); for (size_type adjind = row_begin; adjind < row_end; ++adjind){ - nnz_lno_t colIndex = _adj[adjind]; - nnz_scalar_t val = _adj_vals[adjind]; - sum -= val * _Xvector[colIndex]; + nnz_lno_t colIndex = _adj(adjind); + nnz_scalar_t val = _adj_vals(adjind); + sum -= val * _Xvector(colIndex); } - nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal[ii]; - _Xvector[ii] = _Xvector[ii] + omega * sum * invDiagonalVal; + nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(ii); + //std::cout << "Updating X(" << ii << ") by " << omega * sum * invDiagonalVal << '\n'; + _Xvector(ii) += omega * sum * invDiagonalVal; } }; @@ -199,7 +206,6 @@ namespace KokkosSparse{ nnz_scalar_t omega; - Team_PSGS(row_lno_persistent_work_view_t xadj_, nnz_lno_persistent_work_view_t adj_, scalar_persistent_work_view_t adj_vals_, scalar_persistent_work_view_t Xvector_, scalar_persistent_work_view_t Yvector_, nnz_lno_t color_set_begin, nnz_lno_t color_set_end, @@ -232,30 +238,27 @@ namespace KokkosSparse{ KOKKOS_INLINE_FUNCTION void operator()(const team_member_t & teamMember) const { - nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size()+ teamMember.team_rank() + _color_set_begin; if (ii >= _color_set_end) return; + size_type row_begin = _xadj(ii); + size_type row_end = _xadj(ii + 1); - - size_type row_begin = _xadj[ii]; - size_type row_end = _xadj[ii + 1]; - - nnz_scalar_t product = 0 ; + nnz_scalar_t product = 0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&] (size_type i, nnz_scalar_t & valueToUpdate) { size_type adjind = i + row_begin; - nnz_lno_t colIndex = _adj[adjind]; - nnz_scalar_t val = _adj_vals[adjind]; - valueToUpdate += val * _Xvector[colIndex]; + nnz_lno_t colIndex = _adj(adjind); + nnz_scalar_t val = _adj_vals(adjind); + valueToUpdate += val * _Xvector(colIndex); }, product); Kokkos::single(Kokkos::PerThread(teamMember),[=] () { - nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal[ii]; 
- _Xvector[ii] += omega * (_Yvector[ii] - product) * invDiagonalVal; + nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(ii); + _Xvector(ii) += omega * (_Yvector(ii) - product) * invDiagonalVal; }); } @@ -277,7 +280,7 @@ namespace KokkosSparse{ nnz_scalar_t *all_global_memory = NULL; - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&] (const nnz_lno_t& ii) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&] (const nnz_lno_t ii) { Kokkos::parallel_for( @@ -286,8 +289,8 @@ namespace KokkosSparse{ diagonal_positions[i] = -1; }); - size_type row_begin = _xadj[ii]; - size_type row_end = _xadj[ii + 1]; + size_type row_begin = _xadj(ii); + size_type row_end = _xadj(ii + 1); nnz_lno_t row_size = row_end - row_begin; nnz_lno_t l1_val_size = row_size * block_size, l2_val_size = 0; @@ -314,7 +317,7 @@ namespace KokkosSparse{ if (colIndex == ii){ diagonal_positions[i % block_size] = i; } - all_shared_memory[i] = _Xvector[colIndex * block_size + i % block_size]; + all_shared_memory[i] = _Xvector(colIndex * block_size + i % block_size); }); //bring values to l2 vector. Kokkos::parallel_for( @@ -323,12 +326,12 @@ namespace KokkosSparse{ nnz_lno_t i = l1_val_size + k; size_type adjind = i / block_size + row_begin; - nnz_lno_t colIndex = _adj[adjind]; + nnz_lno_t colIndex = _adj(adjind); if (colIndex == ii){ diagonal_positions[i % block_size] = i; } - all_global_memory[k] = _Xvector[colIndex * block_size + i % block_size]; + all_global_memory[k] = _Xvector(colIndex * block_size + i % block_size); }); row_begin = row_begin * block_size * block_size; @@ -364,15 +367,15 @@ namespace KokkosSparse{ //update the new vector entries. Kokkos::single(Kokkos::PerThread(teamMember),[=] () { nnz_lno_t block_row_index = ii * block_size + i; - nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal[block_row_index]; - _Xvector[block_row_index] += omega * (_Yvector[block_row_index] - product) * invDiagonalVal; + nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); + _Xvector(block_row_index) += omega * (_Yvector(block_row_index) - product) * invDiagonalVal; //we need to update the values of the vector entries if they are already brought to shared memory to sync with global memory. 
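// (The remaining rows of the same block reuse the staged copy of X, so the freshly
// updated entry has to be written back to whichever cache level, shared or global,
// it was placed in; diagonal_positions[] records where.)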
if (diagonal_positions[i] != -1){ if (diagonal_positions[i] < l1_val_size) - all_shared_memory[diagonal_positions[i]] = _Xvector[block_row_index]; + all_shared_memory[diagonal_positions[i]] = _Xvector(block_row_index); else - all_global_memory[diagonal_positions[i] - l1_val_size] = _Xvector[block_row_index]; + all_global_memory[diagonal_positions[i] - l1_val_size] = _Xvector(block_row_index); } }); @@ -383,7 +386,7 @@ namespace KokkosSparse{ std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; for (int z = 0; z < int (row_size); ++z){ - std::cout << _adj[_xadj[ii] + z] << " "; + std::cout << _adj(_xadj(ii) + z) << " "; } std::cout <<"\n\nrow-0:X -- all-shared-memory:"; @@ -391,10 +394,10 @@ namespace KokkosSparse{ std::cout << all_shared_memory[z] << " "; } std::cout << std::endl << "product:" << product << std::endl; - std::cout << "diagonal" << _permuted_inverse_diagonal[ii * block_size + i] << std::endl; - std::cout << "_Yvector" << _Yvector[ii * block_size + i] << std::endl; + std::cout << "diagonal" << _permuted_inverse_diagonal(ii * block_size + i) << std::endl; + std::cout << "_Yvector" << _Yvector(ii * block_size + i) << std::endl; - std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector[block_row_index]:" << _Xvector[ii * block_size + i] << std::endl; + std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i) << std::endl; } #endif } @@ -438,8 +441,8 @@ namespace KokkosSparse{ diagonal_positions[i] = -1; }); - size_type row_begin = _xadj[ii]; - size_type row_end = _xadj[ii + 1]; + size_type row_begin = _xadj(ii); + size_type row_end = _xadj(ii + 1); nnz_lno_t row_size = row_end - row_begin; Kokkos::parallel_for( @@ -448,12 +451,12 @@ namespace KokkosSparse{ size_type adjind = i / block_size + row_begin; - nnz_lno_t colIndex = _adj[adjind]; + nnz_lno_t colIndex = _adj(adjind); if (colIndex == ii){ diagonal_positions[i % block_size] = i; } - all_shared_memory[i] = _Xvector[colIndex * block_size + i % block_size]; + all_shared_memory[i] = _Xvector(colIndex * block_size + i % block_size); }); @@ -477,12 +480,12 @@ namespace KokkosSparse{ Kokkos::single(Kokkos::PerThread(teamMember),[=] () { nnz_lno_t block_row_index = ii * block_size + i; - nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal[block_row_index]; - _Xvector[block_row_index] += omega * (_Yvector[block_row_index] - product) * invDiagonalVal; + nnz_scalar_t invDiagonalVal = _permuted_inverse_diagonal(block_row_index); + _Xvector(block_row_index) += omega * (_Yvector(block_row_index) - product) * invDiagonalVal; if (diagonal_positions[i] != -1){ - all_shared_memory[diagonal_positions[i]] = _Xvector[block_row_index]; + all_shared_memory[diagonal_positions[i]] = _Xvector(block_row_index); } }); @@ -493,7 +496,7 @@ namespace KokkosSparse{ std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; for (int z = 0; z < int (row_size); ++z){ - std::cout << _adj[_xadj[ii] + z] << " "; + std::cout << _adj(_xadj(ii) + z) << " "; } std::cout <<"\n\nrow-0:X -- all-shared-memory:"; @@ -501,18 +504,16 @@ namespace KokkosSparse{ std::cout << all_shared_memory[z] << " "; } std::cout << std::endl << "product:" << product << std::endl; - std::cout << "diagonal" << _permuted_inverse_diagonal[ii * block_size + i] << std::endl; - std::cout << "_Yvector" << _Yvector[ii * block_size + i] << std::endl; + std::cout << "diagonal" << _permuted_inverse_diagonal(ii * block_size + i) << std::endl; + 
std::cout << "_Yvector" << _Yvector(ii * block_size + i) << std::endl; - std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector[block_row_index]:" << _Xvector[ii * block_size + i] << std::endl << std::endl<< std::endl; + std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i) << std::endl << std::endl<< std::endl; } #endif #endif //row_begin += row_size * block_size; } }); - - } size_t team_shmem_size (int team_size) const { @@ -520,13 +521,11 @@ namespace KokkosSparse{ } }; - - /** * \brief constructor */ - GaussSeidel(HandleType *handle_, + PointGaussSeidel(HandleType *handle_, nnz_lno_t num_rows_, nnz_lno_t num_cols_, const_lno_row_view_t row_map_, @@ -538,7 +537,7 @@ namespace KokkosSparse{ is_symmetric(true){} - GaussSeidel(HandleType *handle_, + PointGaussSeidel(HandleType *handle_, nnz_lno_t num_rows_, nnz_lno_t num_cols_, const_lno_row_view_t row_map_, @@ -557,7 +556,7 @@ namespace KokkosSparse{ /** * \brief constructor */ - GaussSeidel(HandleType *handle_, + PointGaussSeidel(HandleType *handle_, nnz_lno_t num_rows_, nnz_lno_t num_cols_, const_lno_row_view_t row_map_, @@ -571,7 +570,7 @@ namespace KokkosSparse{ is_symmetric(is_symmetric_){} - GaussSeidel(HandleType *handle_, + PointGaussSeidel(HandleType *handle_, nnz_lno_t num_rows_, nnz_lno_t num_cols_, const_lno_row_view_t row_map_, @@ -586,22 +585,18 @@ namespace KokkosSparse{ have_diagonal_given(true), is_symmetric(is_symmetric_){} - - void initialize_symbolic() { + auto gsHandle = get_gs_handle(); typename HandleType::GraphColoringHandleType *gchandle = this->handle->get_graph_coloring_handle(); - if (gchandle == NULL) { - this->handle->create_graph_coloring_handle(); - //this->handle->create_gs_handle(); - this->handle->get_gs_handle()->set_owner_of_coloring(); - gchandle = this->handle->get_graph_coloring_handle(); + this->handle->create_graph_coloring_handle(); + gsHandle->set_owner_of_coloring(true); + gchandle = this->handle->get_graph_coloring_handle(); } - const_lno_row_view_t xadj = this->row_map; const_lno_nnz_view_t adj = this->entries; size_type nnz = adj.extent(0); @@ -609,37 +604,38 @@ namespace KokkosSparse{ #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Impl::Timer timer; #endif - { - if (!is_symmetric){ - - if (gchandle->get_coloring_algo_type() == KokkosGraph::COLORING_EB){ - - gchandle->symmetrize_and_calculate_lower_diagonal_edge_list(num_rows, xadj, adj); - KokkosGraph::Experimental::graph_color_symbolic - (this->handle, num_rows, num_rows, xadj , adj); - } - else { - row_lno_temp_work_view_t tmp_xadj; - nnz_lno_temp_work_view_t tmp_adj; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - < const_lno_row_view_t, const_lno_nnz_view_t, - row_lno_temp_work_view_t, nnz_lno_temp_work_view_t, - MyExecSpace> - (num_rows, xadj, adj, tmp_xadj, tmp_adj ); - KokkosGraph::Experimental::graph_color_symbolic (this->handle, num_rows, num_rows, tmp_xadj , tmp_adj); - } + typename HandleType::GraphColoringHandleType::color_view_t colors; + color_t numColors; + if (!is_symmetric) { + if (gchandle->get_coloring_algo_type() == KokkosGraph::COLORING_EB) { + + gchandle->symmetrize_and_calculate_lower_diagonal_edge_list(num_rows, xadj, adj); + KokkosGraph::Experimental::graph_color_symbolic + (this->handle, num_rows, num_rows, xadj, adj); } else { - KokkosGraph::Experimental::graph_color_symbolic (this->handle, num_rows, num_rows, xadj , adj); + row_lno_temp_work_view_t tmp_xadj; + nnz_lno_temp_work_view_t tmp_adj; + 
KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + < const_lno_row_view_t, const_lno_nnz_view_t, + row_lno_temp_work_view_t, nnz_lno_temp_work_view_t, + MyExecSpace> + (num_rows, xadj, adj, tmp_xadj, tmp_adj); + KokkosGraph::Experimental::graph_color_symbolic + (this->handle, num_rows, num_rows, tmp_xadj, tmp_adj); } } - color_t numColors = gchandle->get_num_colors(); + else { + KokkosGraph::Experimental::graph_color_symbolic + (this->handle, num_rows, num_rows, xadj, adj); + } + colors = gchandle->get_vertex_colors(); + numColors = gchandle->get_num_colors(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "COLORING_TIME:" << timer.seconds() << std::endl; + timer.reset(); #endif - - typename HandleType::GraphColoringHandleType::color_view_t colors = gchandle->get_vertex_colors(); #if KOKKOSSPARSE_IMPL_RUNSEQUENTIAL numColors = num_rows; KokkosKernels::Impl::print_1Dview(colors); @@ -651,21 +647,16 @@ namespace KokkosSparse{ Kokkos::deep_copy(colors, h_colors); #endif nnz_lno_persistent_work_view_t color_xadj; - nnz_lno_persistent_work_view_t color_adj; - - #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE timer.reset(); #endif - KokkosKernels::Impl::create_reverse_map (num_rows, numColors, colors, color_xadj, color_adj); MyExecSpace().fence(); - #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; timer.reset(); @@ -700,8 +691,6 @@ namespace KokkosSparse{ } #endif - - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "SORT_TIME:" << timer.seconds() << std::endl; @@ -713,7 +702,7 @@ namespace KokkosSparse{ nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows ); nnz_lno_persistent_work_view_t permuted_adj ("newadj_", nnz ); - Kokkos::parallel_for( "KokkosSparse::GaussSeidel::create_permuted_xadj", my_exec_space(0,num_rows), + Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", my_exec_space(0,num_rows), create_permuted_xadj( color_adj, xadj, @@ -728,7 +717,6 @@ namespace KokkosSparse{ timer.reset(); #endif - KokkosKernels::Impl::inclusive_parallel_prefix_sum (num_rows + 1, permuted_xadj); @@ -740,7 +728,7 @@ namespace KokkosSparse{ #endif - Kokkos::parallel_for( "KokkosSparse::GaussSeidel::fill_matrix_symbolic",my_exec_space(0,num_rows), + Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::fill_matrix_symbolic",my_exec_space(0,num_rows), fill_matrix_symbolic( num_rows, color_adj, @@ -758,129 +746,122 @@ namespace KokkosSparse{ timer.reset(); #endif - typename HandleType::GaussSeidelHandleType *gsHandler = this->handle->get_gs_handle(); - nnz_lno_t block_size = this->handle->get_gs_handle()->get_block_size(); + nnz_lno_t block_size = get_gs_handle()->get_block_size(); //MD: if block size is larger than 1; //the algorithm copies the vector entries into shared memory and reuses this small shared memory for vector entries. if (block_size > 1) - { - //first calculate max row size. 
- size_type max_row_size = 0; - KokkosKernels::Impl::kk_view_reduce_max_row_size(num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, max_row_size); - gsHandler->set_max_nnz(max_row_size); - - - nnz_lno_t brows = permuted_xadj.extent(0) - 1; - size_type bnnz = permuted_adj.extent(0) * block_size * block_size; - - int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); - int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); - size_t shmem_size_to_use = this->handle->get_shmem_size(); - //nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size(suggested_team_size,MyExecSpace::concurrency(), brows); - - //MD: now we calculate how much memory is needed for shared memory. - //we have two-level vectors: as in spgemm hashmaps. - //we try to fit everything into shared memory. - //if they fit, we can use BlockTeam function in Team_SGS functor. - //on CPUs, we make L1 vector big enough so that it will always hold it. - //on GPUs, we have a upper bound for shared memory: handle->get_shmem_size(): this is set to 32128 bytes. - //If things do not fit into shared memory, we allocate vectors in global memory and run BigBlockTeam in Team_SGS functor. - size_t level_1_mem = max_row_size * block_size * sizeof(nnz_scalar_t) + ((block_size / 8 ) + 1) * 8 * sizeof(nnz_lno_t); - level_1_mem = suggested_team_size * level_1_mem; - size_t level_2_mem = 0; - nnz_lno_t num_values_in_l1 = max_row_size; - nnz_lno_t num_values_in_l2 = 0; - nnz_lno_t num_big_rows = 0; - - KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); - if (ex_sp != KokkosKernels::Impl::Exec_CUDA){ - //again, if it is on CPUs, we make L1 as big as we need. - size_t l1mem = 1; - while(l1mem < level_1_mem){ - l1mem *= 2; - } - gsHandler->set_level_1_mem(l1mem); - level_1_mem = l1mem; + { + //first calculate max row size. + size_type max_row_size = 0; + KokkosKernels::Impl::kk_view_reduce_max_row_size(num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, max_row_size); + gsHandle->set_max_nnz(max_row_size); + + nnz_lno_t brows = permuted_xadj.extent(0) - 1; + size_type bnnz = permuted_adj.extent(0) * block_size * block_size; + + int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); + int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); + size_t shmem_size_to_use = this->handle->get_shmem_size(); + //nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size(suggested_team_size,MyExecSpace::concurrency(), brows); + + //MD: now we calculate how much memory is needed for shared memory. + //we have two-level vectors: as in spgemm hashmaps. + //we try to fit everything into shared memory. + //if they fit, we can use BlockTeam function in Team_SGS functor. + //on CPUs, we make L1 vector big enough so that it will always hold it. + //on GPUs, we have a upper bound for shared memory: handle->get_shmem_size(): this is set to 32128 bytes. + //If things do not fit into shared memory, we allocate vectors in global memory and run BigBlockTeam in Team_SGS functor. 
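
The comment block above describes a two-level staging scheme for the block case: each team keeps one row's worth of X entries in shared memory (level 1) and, on GPUs, spills whatever does not fit into a global-memory pool (level 2), with both sizes rounded to powers of two. The same sizing arithmetic, restated as a standalone sketch with illustrative names (nothing below is part of this file):

#include <cstddef>

struct ShmemPlan {
  std::size_t level1Bytes;  // per-team shared memory actually used
  std::size_t level2Bytes;  // per-row spill buffer in the global pool (0 if none)
  int valuesInL1;           // vector entries per row kept in shared memory
  int valuesInL2;           // vector entries per row spilled to global memory
};

inline std::size_t nextPow2(std::size_t x) {
  std::size_t p = 1;
  while (p < x) p *= 2;
  return p;
}

inline ShmemPlan planBlockSgsShmem(int maxRowSize, int blockSize, int teamSize,
                                   std::size_t shmemBytes, bool onGPU,
                                   std::size_t scalarBytes, std::size_t ordinalBytes) {
  // per-thread need: one X entry per column of the longest row (blockSize scalars each),
  // plus the diagonal-position list, padded to a multiple of 8 ordinals
  std::size_t perThread = std::size_t(maxRowSize) * blockSize * scalarBytes
                        + ((blockSize / 8) + 1) * 8 * ordinalBytes;
  ShmemPlan p{teamSize * perThread, 0, maxRowSize, 0};
  if (!onGPU) {
    p.level1Bytes = nextPow2(p.level1Bytes);  // CPUs: grow L1 until everything fits
    return p;
  }
  // GPUs: L1 is capped by the shared-memory budget; the rest of each row spills to L2.
  // (The real code throws if even the padded index list exceeds the per-thread budget.)
  p.level1Bytes = shmemBytes;
  p.valuesInL1  = int((shmemBytes / teamSize - ((blockSize / 8) + 1) * 8 * ordinalBytes)
                      / scalarBytes / blockSize);
  if (p.valuesInL1 < maxRowSize) {
    p.valuesInL2  = maxRowSize - p.valuesInL1;
    p.level2Bytes = nextPow2(std::size_t(p.valuesInL2) * blockSize * scalarBytes);
  } else {
    p.valuesInL1 = maxRowSize;
  }
  return p;
}

The code that follows additionally counts num_big_rows, the number of pooled level-2 buffers, which the CUDA branch further down caps by the free device memory.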
+ size_t level_1_mem = max_row_size * block_size * sizeof(nnz_scalar_t) + ((block_size / 8 ) + 1) * 8 * sizeof(nnz_lno_t); + level_1_mem = suggested_team_size * level_1_mem; + size_t level_2_mem = 0; + nnz_lno_t num_values_in_l1 = max_row_size; + nnz_lno_t num_values_in_l2 = 0; + nnz_lno_t num_big_rows = 0; + + KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); + if (ex_sp != KokkosKernels::Impl::Exec_CUDA){ + //again, if it is on CPUs, we make L1 as big as we need. + size_t l1mem = 1; + while(l1mem < level_1_mem){ + l1mem *= 2; + } + gsHandle->set_level_1_mem(l1mem); + level_1_mem = l1mem; + level_2_mem = 0; + } + else { + //on GPUs set the L1 size to max shmem and calculate how much we need for L2. + //we try to shift with 8 always because of the errors we experience with memory shifts on GPUs. + level_1_mem = shmem_size_to_use; + num_values_in_l1 = (shmem_size_to_use / suggested_team_size - ((block_size / 8 ) + 1) * 8 * sizeof(nnz_lno_t)) / sizeof(nnz_scalar_t) / block_size; + if (((block_size / 8 ) + 1) * 8 * sizeof(nnz_lno_t) > shmem_size_to_use / suggested_team_size ) throw "Shared memory size is to small for the given block size\n"; + if (num_values_in_l1 >= (nnz_lno_t) (max_row_size) ){ + num_values_in_l2 = 0; level_2_mem = 0; + num_big_rows = 0; } else { - //on GPUs set the L1 size to max shmem and calculate how much we need for L2. - //we try to shift with 8 always because of the errors we experience with memory shifts on GPUs. - level_1_mem = shmem_size_to_use; - num_values_in_l1 = (shmem_size_to_use / suggested_team_size - ((block_size / 8 ) + 1) * 8 * sizeof(nnz_lno_t)) / sizeof(nnz_scalar_t) / block_size; - if (((block_size / 8 ) + 1) * 8 * sizeof(nnz_lno_t) > shmem_size_to_use / suggested_team_size ) throw "Shared memory size is to small for the given block size\n"; - if (num_values_in_l1 >= (nnz_lno_t) (max_row_size) ){ - num_values_in_l2 = 0; - level_2_mem = 0; - num_big_rows = 0; - } - else { - num_values_in_l2 = max_row_size - num_values_in_l1; - level_2_mem = num_values_in_l2 * block_size * sizeof(nnz_scalar_t); - //std::cout << "level_2_mem:" << level_2_mem << std::endl; - size_t l2mem = 1; - while(l2mem < level_2_mem){ - l2mem *= 2; - } - level_2_mem = l2mem; - //std::cout << "level_2_mem:" << level_2_mem << std::endl; + num_values_in_l2 = max_row_size - num_values_in_l1; + level_2_mem = num_values_in_l2 * block_size * sizeof(nnz_scalar_t); + //std::cout << "level_2_mem:" << level_2_mem << std::endl; + size_t l2mem = 1; + while(l2mem < level_2_mem){ + l2mem *= 2; + } + level_2_mem = l2mem; + //std::cout << "level_2_mem:" << level_2_mem << std::endl; - size_type num_large_rows = 0; - KokkosKernels::Impl::kk_reduce_numrows_larger_than_threshold(brows, permuted_xadj, num_values_in_l1, num_large_rows); - num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size)); - //std::cout << "num_big_rows:" << num_big_rows << std::endl; + size_type num_large_rows = 0; + KokkosKernels::Impl::kk_reduce_numrows_larger_than_threshold(brows, permuted_xadj, num_values_in_l1, num_large_rows); + num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size)); + //std::cout << "num_big_rows:" << num_big_rows << std::endl; #if defined( KOKKOS_ENABLE_CUDA ) - if (ex_sp == KokkosKernels::Impl::Exec_CUDA) { - //check if we have enough memory for this. lower the concurrency if we do not have enugh memory. 
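
The CUDA branch that follows caps the number of pooled level-2 buffers by the memory actually free on the device, then rounds the count down to a power of two. A hypothetical helper expressing the same calculation (free_bytes would come from cudaMemGetInfo, exactly as below, and is assumed to exceed the small bookkeeping overhead, as the original code also assumes):

#include <cstddef>

inline std::size_t capSpillChunks(std::size_t requestedChunks,
                                  std::size_t level2Bytes,
                                  std::size_t freeBytes) {
  const std::size_t required = requestedChunks * level2Bytes + requestedChunks * sizeof(int);
  if (required > freeBytes) {
    // keep roughly 80% of what remains after the per-chunk bookkeeping ints
    const double usable = (freeBytes - requestedChunks * sizeof(int)) * 0.8;
    requestedChunks = (std::size_t(usable / 8) * 8) / level2Bytes;
  }
  std::size_t p = 1;                        // round the chunk count down to a power of two,
  while (p * 2 <= requestedChunks) p *= 2;  // mirroring the min_chunk_size loop below
  return p;
}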
- size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_big_rows) * level_2_mem; - if (required_size + num_big_rows * sizeof(int) > free_byte){ - num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_big_rows) { - min_chunk_size *= 2; - } - num_big_rows = min_chunk_size; + if (ex_sp == KokkosKernels::Impl::Exec_CUDA) { + //check if we have enough memory for this. lower the concurrency if we do not have enugh memory. + size_t free_byte ; + size_t total_byte ; + cudaMemGetInfo( &free_byte, &total_byte ) ; + size_t required_size = size_t (num_big_rows) * level_2_mem; + if (required_size + num_big_rows * sizeof(int) > free_byte){ + num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 <= num_big_rows) { + min_chunk_size *= 2; } + num_big_rows = min_chunk_size; } -#endif } - +#endif } - - gsHandler->set_max_nnz(max_row_size); - gsHandler->set_level_1_mem(level_1_mem); - gsHandler->set_level_2_mem(level_2_mem); - - gsHandler->set_num_values_in_l1(num_values_in_l1); - gsHandler->set_num_values_in_l2(num_values_in_l2); - gsHandler->set_num_big_rows(num_big_rows); - } + gsHandle->set_max_nnz(max_row_size); + gsHandle->set_level_1_mem(level_1_mem); + gsHandle->set_level_2_mem(level_2_mem); - gsHandler->set_color_set_xadj(h_color_xadj); - gsHandler->set_color_set_adj(color_adj); - gsHandler->set_num_colors(numColors); - gsHandler->set_new_xadj(permuted_xadj); - gsHandler->set_new_adj(permuted_adj); - //gsHandler->set_new_adj_val(newvals_); - gsHandler->set_old_to_new_map(old_to_new_map); - if (this->handle->get_gs_handle()->is_owner_of_coloring()){ - this->handle->destroy_graph_coloring_handle(); - this->handle->get_gs_handle()->set_owner_of_coloring(false); + gsHandle->set_num_values_in_l1(num_values_in_l1); + gsHandle->set_num_values_in_l2(num_values_in_l2); + gsHandle->set_num_big_rows(num_big_rows); } - this->handle->get_gs_handle()->set_call_symbolic(true); + gsHandle->set_color_xadj(h_color_xadj); + gsHandle->set_color_adj(color_adj); + gsHandle->set_num_colors(numColors); + gsHandle->set_new_xadj(permuted_xadj); + gsHandle->set_new_adj(permuted_adj); + gsHandle->set_old_to_new_map(old_to_new_map); + if(gsHandle->is_owner_of_coloring()) { + this->handle->destroy_graph_coloring_handle(); + gsHandle->set_owner_of_coloring(false); + } + gsHandle->set_call_symbolic(true); - this->handle->get_gs_handle()->allocate_x_y_vectors(this->num_rows * block_size, this->num_cols * block_size); + gsHandle->allocate_x_y_vectors(this->num_rows * block_size, this->num_cols * block_size); //std::cout << "all end" << std::endl; #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "ALLOC:" << timer.seconds() << std::endl; @@ -1080,8 +1061,8 @@ namespace KokkosSparse{ }; void initialize_numeric(){ - - if (this->handle->get_gs_handle()->is_symbolic_called() == false){ + auto gsHandle = this->get_gs_handle(); + if (gsHandle->is_symbolic_called() == false){ this->initialize_symbolic(); } //else @@ -1097,15 +1078,11 @@ namespace KokkosSparse{ size_type nnz = adj_vals.extent(0); - typename HandleType::GaussSeidelHandleType *gsHandler = this->handle->get_gs_handle(); - + row_lno_persistent_work_view_t newxadj_ = gsHandle->get_new_xadj(); + nnz_lno_persistent_work_view_t old_to_new_map = gsHandle->get_old_to_new_map(); + 
nnz_lno_persistent_work_view_t newadj_ = gsHandle->get_new_adj(); - - row_lno_persistent_work_view_t newxadj_ = gsHandler->get_new_xadj(); - nnz_lno_persistent_work_view_t old_to_new_map = gsHandler->get_old_to_new_map(); - nnz_lno_persistent_work_view_t newadj_ = gsHandler->get_new_adj(); - - nnz_lno_persistent_work_view_t color_adj = gsHandler->get_color_adj(); + nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); scalar_persistent_work_view_t permuted_adj_vals (Kokkos::ViewAllocateWithoutInitializing("newvals_"), nnz ); @@ -1113,9 +1090,8 @@ namespace KokkosSparse{ int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size(suggested_team_size,MyExecSpace::concurrency(), num_rows); - - nnz_lno_t block_size = this->handle->get_gs_handle()->get_block_size(); - nnz_lno_t block_matrix_size = block_size * block_size ; + nnz_lno_t block_size = gsHandle->get_block_size(); + nnz_lno_t block_matrix_size = block_size * block_size ; //MD NOTE: 03/27/2018: below fill matrix operations will work fine with block size 1. //If the block size is more than 1, below code assumes that the rows are sorted similar to point crs. @@ -1164,7 +1140,7 @@ namespace KokkosSparse{ )); } MyExecSpace().fence(); - gsHandler->set_new_adj_val(permuted_adj_vals); + gsHandle->set_new_adj_val(permuted_adj_vals); scalar_persistent_work_view_t permuted_inverse_diagonal (Kokkos::ViewAllocateWithoutInitializing("permuted_inverse_diagonal"), num_rows * block_size ); if (!have_diagonal_given) { @@ -1211,10 +1187,8 @@ namespace KokkosSparse{ } MyExecSpace().fence(); - this->handle->get_gs_handle()->set_permuted_inverse_diagonal(permuted_inverse_diagonal); - - this->handle->get_gs_handle()->set_call_numeric(true); - + gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal); + gsHandle->set_call_numeric(true); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "NUMERIC:" << timer.seconds() << std::endl; @@ -1230,34 +1204,28 @@ namespace KokkosSparse{ nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, - bool update_y_vector = true){ - if (this->handle->get_gs_handle()->is_numeric_called() == false){ + bool update_y_vector = true) { + auto gsHandle = this->get_gs_handle(); + if(gsHandle->is_numeric_called() == false){ this->initialize_numeric(); } - typename HandleType::GaussSeidelHandleType *gsHandler = this->handle->get_gs_handle(); - - nnz_lno_t block_size = this->handle->get_gs_handle()->get_block_size(); + nnz_lno_t block_size = gsHandle->get_block_size(); //nnz_lno_t block_matrix_size = block_size * block_size ; + scalar_persistent_work_view_t Permuted_Yvector = gsHandle->get_permuted_y_vector(); + scalar_persistent_work_view_t Permuted_Xvector = gsHandle->get_permuted_x_vector(); - scalar_persistent_work_view_t Permuted_Yvector = gsHandler->get_permuted_y_vector(); - scalar_persistent_work_view_t Permuted_Xvector = gsHandler->get_permuted_x_vector(); - - - - row_lno_persistent_work_view_t newxadj_ = gsHandler->get_new_xadj(); - nnz_lno_persistent_work_view_t old_to_new_map = gsHandler->get_old_to_new_map(); - nnz_lno_persistent_work_view_t newadj_ = gsHandler->get_new_adj(); - nnz_lno_persistent_work_view_t color_adj = gsHandler->get_color_adj(); - - color_t numColors = gsHandler->get_num_colors(); - - - - if (update_y_vector){ + row_lno_persistent_work_view_t newxadj = gsHandle->get_new_xadj(); + nnz_lno_persistent_work_view_t newadj = 
gsHandle->get_new_adj(); + scalar_persistent_work_view_t newadj_vals = gsHandle->get_new_adj_val(); + nnz_lno_persistent_work_view_t old_to_new_map = gsHandle->get_old_to_new_map(); + nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); + scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandle->get_permuted_inverse_diagonal(); + color_t numColors = gsHandle->get_num_colors(); + if (update_y_vector) { KokkosKernels::Impl::permute_block_vector (num_cols * block_size, Permuted_Xvector); } else{ KokkosKernels::Impl::permute_block_vector ( - num_cols, block_size, - old_to_new_map, - x_lhs_output_vec, - Permuted_Xvector - ); + num_cols, block_size, + old_to_new_map, + x_lhs_output_vec, + Permuted_Xvector + ); } MyExecSpace().fence(); - - - row_lno_persistent_work_view_t permuted_xadj = gsHandler->get_new_xadj(); - nnz_lno_persistent_work_view_t permuted_adj = gsHandler->get_new_adj(); - scalar_persistent_work_view_t permuted_adj_vals = gsHandler->get_new_adj_val(); - scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandler->get_permuted_inverse_diagonal(); - #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "Y:"; KokkosKernels::Impl::print_1Dview(Permuted_Yvector); @@ -1299,17 +1260,15 @@ namespace KokkosSparse{ std::cout << "X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); - std::cout << "permuted_xadj:"; KokkosKernels::Impl::print_1Dview(permuted_xadj); - std::cout << "permuted_adj:"; KokkosKernels::Impl::print_1Dview(permuted_adj); - std::cout << "permuted_adj_vals:"; KokkosKernels::Impl::print_1Dview(permuted_adj_vals); + std::cout << "permuted_xadj:"; KokkosKernels::Impl::print_1Dview(newxadj); + std::cout << "permuted_adj:"; KokkosKernels::Impl::print_1Dview(newadj); + std::cout << "permuted_adj_vals:"; KokkosKernels::Impl::print_1Dview(newadj_vals); std::cout << "permuted_diagonals:"; KokkosKernels::Impl::print_1Dview(permuted_inverse_diagonal); #endif - nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandler->get_color_xadj(); - + nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj(); - - nnz_lno_t brows = permuted_xadj.extent(0) - 1; - size_type bnnz = permuted_adj_vals.extent(0); + nnz_lno_t brows = newxadj.extent(0) - 1; + size_type bnnz = newadj_vals.extent(0); int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); @@ -1317,12 +1276,12 @@ namespace KokkosSparse{ //size_t shmem_size_to_use = this->handle->get_shmem_size(); - size_t l1_shmem_size = gsHandler->get_level_1_mem(); - nnz_lno_t num_values_in_l1 = gsHandler->get_num_values_in_l1(); + size_t l1_shmem_size = gsHandle->get_level_1_mem(); + nnz_lno_t num_values_in_l1 = gsHandle->get_num_values_in_l1(); - size_t level_2_mem = gsHandler->get_level_2_mem(); - nnz_lno_t num_values_in_l2 = gsHandler->get_num_values_in_l2(); - nnz_lno_t num_chunks = gsHandler->get_num_big_rows(); + size_t level_2_mem = gsHandle->get_level_2_mem(); + nnz_lno_t num_values_in_l2 = gsHandle->get_num_values_in_l2(); + nnz_lno_t num_chunks = gsHandle->get_num_big_rows(); pool_memory_space m_space(num_chunks, level_2_mem / sizeof(nnz_scalar_t), 0, KokkosKernels::Impl::ManyThread2OneChunk, false); @@ -1332,7 +1291,7 @@ namespace KokkosSparse{ << " num_chunks:" << num_chunks << std::endl; #endif - Team_PSGS gs(permuted_xadj, permuted_adj, permuted_adj_vals, + Team_PSGS gs(newxadj, newadj, newadj_vals, Permuted_Xvector, Permuted_Yvector,0,0, permuted_inverse_diagonal, m_space, 
num_values_in_l1, num_values_in_l2, omega, @@ -1367,9 +1326,7 @@ namespace KokkosSparse{ KokkosKernels::Impl::print_1Dview(x_lhs_output_vec); std::cout << "Y:"; KokkosKernels::Impl::print_1Dview(Permuted_Yvector); - #endif - } template @@ -1381,55 +1338,48 @@ namespace KokkosSparse{ nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, - bool update_y_vector = true){ - - typename HandleType::GaussSeidelHandleType *gsHandler = this->handle->get_gs_handle(); - scalar_persistent_work_view_t Permuted_Yvector = gsHandler->get_permuted_y_vector(); - scalar_persistent_work_view_t Permuted_Xvector = gsHandler->get_permuted_x_vector(); - - - row_lno_persistent_work_view_t newxadj_ = gsHandler->get_new_xadj(); - nnz_lno_persistent_work_view_t old_to_new_map = gsHandler->get_old_to_new_map(); - nnz_lno_persistent_work_view_t newadj_ = gsHandler->get_new_adj(); - nnz_lno_persistent_work_view_t color_adj = gsHandler->get_color_adj(); - - color_t numColors = gsHandler->get_num_colors(); + bool update_y_vector = true) + { + auto gsHandle = get_gs_handle(); + scalar_persistent_work_view_t Permuted_Yvector = gsHandle->get_permuted_y_vector(); + scalar_persistent_work_view_t Permuted_Xvector = gsHandle->get_permuted_x_vector(); + row_lno_persistent_work_view_t newxadj = gsHandle->get_new_xadj(); + nnz_lno_persistent_work_view_t newadj = gsHandle->get_new_adj(); + scalar_persistent_work_view_t newadj_vals = gsHandle->get_new_adj_val(); + nnz_lno_persistent_work_view_t old_to_new_map = gsHandle->get_old_to_new_map(); + nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); + scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandle->get_permuted_inverse_diagonal(); + color_t numColors = gsHandle->get_num_colors(); - if (update_y_vector){ + if (update_y_vector) { KokkosKernels::Impl::permute_vector ( - num_rows, - old_to_new_map, - y_rhs_input_vec, - Permuted_Yvector - ); + num_rows, + old_to_new_map, + y_rhs_input_vec, + Permuted_Yvector + ); } MyExecSpace().fence(); - if(init_zero_x_vector){ + if(init_zero_x_vector) { KokkosKernels::Impl::zero_vector(num_cols, Permuted_Xvector); } - else{ + else { KokkosKernels::Impl::permute_vector ( - num_cols, - old_to_new_map, - x_lhs_output_vec, - Permuted_Xvector - ); + num_cols, + old_to_new_map, + x_lhs_output_vec, + Permuted_Xvector + ); } MyExecSpace().fence(); - row_lno_persistent_work_view_t permuted_xadj = gsHandler->get_new_xadj(); - nnz_lno_persistent_work_view_t permuted_adj = gsHandler->get_new_adj(); - scalar_persistent_work_view_t permuted_adj_vals = gsHandler->get_new_adj_val(); - scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandler->get_permuted_inverse_diagonal(); - - nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandler->get_color_xadj(); - + nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj(); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "--point Before X:"; @@ -1438,10 +1388,9 @@ namespace KokkosSparse{ KokkosKernels::Impl::print_1Dview(Permuted_Yvector,true); #endif - if (gsHandler->get_algorithm_type()== GS_PERMUTED){ - PSGS gs(permuted_xadj, permuted_adj, permuted_adj_vals, + if(gsHandle->get_algorithm_type() == GS_PERMUTED) { + PSGS gs(newxadj, newadj, newadj_vals, Permuted_Xvector, Permuted_Yvector, color_adj, omega, permuted_inverse_diagonal); - this->IterativePSGS( gs, numColors, @@ -1450,12 +1399,11 @@ namespace KokkosSparse{ apply_forward, apply_backward); } - else{ - - pool_memory_space m_space(0, 0, 0, 
KokkosKernels::Impl::ManyThread2OneChunk, false); + else { + pool_memory_space m_space(0, 0, 0, KokkosKernels::Impl::ManyThread2OneChunk, false); - Team_PSGS gs(permuted_xadj, permuted_adj, permuted_adj_vals, - Permuted_Xvector, Permuted_Yvector,0,0, permuted_inverse_diagonal, m_space,0,0,omega); + Team_PSGS gs(newxadj, newadj, newadj_vals, + Permuted_Xvector, Permuted_Yvector, 0, 0, permuted_inverse_diagonal, m_space, 0, 0, omega); this->IterativePSGS( gs, @@ -1468,14 +1416,13 @@ namespace KokkosSparse{ //Kokkos::parallel_for( my_exec_space(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_vector - ( - num_cols, - color_adj, - Permuted_Xvector, - x_lhs_output_vec - ); + ( + num_cols, + color_adj, + Permuted_Xvector, + x_lhs_output_vec + ); MyExecSpace().fence(); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "--point After X:"; @@ -1495,11 +1442,13 @@ namespace KokkosSparse{ nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, - bool update_y_vector = true){ - if (this->handle->get_gs_handle()->is_numeric_called() == false){ + bool update_y_vector = true) + { + auto gsHandle = get_gs_handle(); + if (gsHandle->is_numeric_called() == false){ this->initialize_numeric(); } - nnz_lno_t block_size = this->handle->get_gs_handle()->get_block_size(); + nnz_lno_t block_size = gsHandle->get_block_size(); if (block_size == 1){ this->point_apply( x_lhs_output_vec, y_rhs_input_vec, @@ -1511,7 +1460,8 @@ namespace KokkosSparse{ else { this->block_apply( x_lhs_output_vec, y_rhs_input_vec, - init_zero_x_vector, numIter, omega, + init_zero_x_vector, numIter, + omega, apply_forward, apply_backward, update_y_vector); } @@ -1537,7 +1487,7 @@ namespace KokkosSparse{ nnz_lno_t suggested_team_size = gs.suggested_team_size; nnz_lno_t team_row_chunk_size = gs.team_work_size; int vector_size = gs.vector_size; - nnz_lno_t block_size = this->handle->get_gs_handle()->get_block_size(); + nnz_lno_t block_size = get_gs_handle()->get_block_size(); /* size_type nnz = this->values.extent(0); @@ -1570,7 +1520,6 @@ namespace KokkosSparse{ } else { //if (i == 0) std::cout << "big block_team" << std::endl; - Kokkos::parallel_for("KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::forward", bigblock_team_fill_policy_t(overall_work / team_row_chunk_size + 1 , suggested_team_size, vector_size), gs ); @@ -1589,7 +1538,6 @@ namespace KokkosSparse{ gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; if (block_size == 1){ - Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::backward", team_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size), gs ); @@ -1630,17 +1578,30 @@ namespace KokkosSparse{ } } - - void DoPSGS(PSGS &gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, bool apply_forward, bool apply_backward){ + + /* + std::cout << "Running PSGS.\n"; + std::cout << "Xvec (output):\n"; + KokkosKernels::Impl::print_1Dview(gs._Xvector); + std::cout << "Yvec (input):\n"; + KokkosKernels::Impl::print_1Dview(gs._Yvector); + std::cout << "Color xadj (host, mapping for clusters):\n"; + KokkosKernels::Impl::print_1Dview(h_color_xadj); + std::cout << "xadj (device)\n"; + KokkosKernels::Impl::print_1Dview(gs._xadj); + std::cout << "adj (device)\n"; + KokkosKernels::Impl::print_1Dview(gs._adj); + std::cout << "adjvals (device)\n"; + KokkosKernels::Impl::print_1Dview(gs._adj_vals); + */ + if (apply_forward){ - //std::cout << "numColors:" << numColors << 
std::endl; for (color_t i = 0; i < numColors; ++i){ nnz_lno_t color_index_begin = h_color_xadj(i); nnz_lno_t color_index_end = h_color_xadj(i + 1); - //std::cout << "i:" << i << " color_index_begin:" << color_index_begin << " color_index_end:" << color_index_end << std::endl; Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::forward", my_exec_space (color_index_begin, color_index_end) , gs); MyExecSpace().fence(); @@ -1651,7 +1612,7 @@ namespace KokkosSparse{ nnz_lno_t color_index_begin = h_color_xadj(i); nnz_lno_t color_index_end = h_color_xadj(i + 1); Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::backward", - my_exec_space (color_index_begin, color_index_end) , gs); + my_exec_space (color_index_begin, color_index_end), gs); MyExecSpace().fence(); if (i == 0){ break; @@ -1660,7 +1621,6 @@ namespace KokkosSparse{ } } }; - } } #endif diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 20cacb2b72..8ed77ff019 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -51,6 +51,7 @@ // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include "KokkosSparse_gauss_seidel_impl.hpp" +#include "KokkosSparse_cluster_gauss_seidel_impl.hpp" #endif namespace KokkosSparse { @@ -236,10 +237,21 @@ namespace KokkosSparse { a_lno_view_t entries, bool is_graph_symmetric){ - typedef typename Impl::GaussSeidel SGS; - SGS sgs(handle,num_rows, num_cols, row_map, entries, is_graph_symmetric); - sgs.initialize_symbolic(); + auto gsHandle = handle->get_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + using SGS = typename Impl::ClusterGaussSeidel + ; + SGS sgs(handle,num_rows, num_cols, row_map, entries, is_graph_symmetric); + sgs.initialize_symbolic(); + } + else + { + using SGS = typename Impl::PointGaussSeidel + ; + SGS sgs(handle,num_rows, num_cols, row_map, entries, is_graph_symmetric); + sgs.initialize_symbolic(); + } } }; @@ -255,13 +267,23 @@ namespace KokkosSparse { a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, - bool is_graph_symmetric - ){ - typedef typename Impl::GaussSeidel - SGS; - SGS sgs(handle, num_rows, num_cols, row_map, entries, values, is_graph_symmetric); - sgs.initialize_numeric(); + bool is_graph_symmetric) + { + auto gsHandle = handle->get_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + using SGS = typename Impl::ClusterGaussSeidel + ; + SGS sgs(handle, num_rows, num_cols, row_map, entries, values, is_graph_symmetric); + sgs.initialize_numeric(); + } + else + { + using SGS = typename Impl::PointGaussSeidel + ; + SGS sgs(handle, num_rows, num_cols, row_map, entries, values, is_graph_symmetric); + sgs.initialize_numeric(); + } } static void @@ -272,13 +294,23 @@ namespace KokkosSparse { a_lno_view_t entries, a_scalar_view_t values, a_scalar_view_t given_inverse_diagonal, - bool is_graph_symmetric - ){ - typedef typename Impl::GaussSeidel - SGS; - SGS sgs(handle, num_rows, num_cols, row_map, entries, values, given_inverse_diagonal, is_graph_symmetric); - sgs.initialize_numeric(); + bool is_graph_symmetric) + { + auto gsHandle = handle->get_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + using SGS = typename Impl::ClusterGaussSeidel + ; + SGS sgs(handle, num_rows, num_cols, row_map, entries, values, given_inverse_diagonal, is_graph_symmetric); + sgs.initialize_numeric(); + } + else + { + using SGS = 
typename Impl::PointGaussSeidel + ; + SGS sgs(handle, num_rows, num_cols, row_map, entries, values, given_inverse_diagonal, is_graph_symmetric); + sgs.initialize_numeric(); + } } }; @@ -299,19 +331,35 @@ namespace KokkosSparse { y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, - typename KernelHandle::nnz_scalar_t omega, int numIter, bool apply_forward, bool apply_backward){ - - typedef typename Impl::GaussSeidel SGS; - SGS sgs(handle, num_rows, num_cols, row_map, entries, values); - sgs.apply( - x_lhs_output_vec, - y_rhs_input_vec, - init_zero_x_vector, - numIter, - omega, - apply_forward, - apply_backward, update_y_vector); + typename KernelHandle::nnz_scalar_t omega, int numIter, bool apply_forward, bool apply_backward) + { + auto gsHandle = handle->get_gs_handle(); + if(gsHandle->get_algorithm_type() == GS_CLUSTER) + { + using SGS = typename Impl::ClusterGaussSeidel ; + SGS sgs(handle, num_rows, num_cols, row_map, entries, values); + sgs.apply( + x_lhs_output_vec, + y_rhs_input_vec, + init_zero_x_vector, + numIter, + omega, + apply_forward, + apply_backward, update_y_vector); + } + else + { + using SGS = typename Impl::PointGaussSeidel ; + SGS sgs(handle, num_rows, num_cols, row_map, entries, values); + sgs.apply( + x_lhs_output_vec, + y_rhs_input_vec, + init_zero_x_vector, + numIter, + omega, + apply_forward, + apply_backward, update_y_vector); + } } }; #endif diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp new file mode 100644 index 0000000000..64e5d804d4 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -0,0 +1,891 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosKernels_Utils.hpp" +#include +#include +#include +#include +#include +#include +#include +#include "KokkosBlas1_fill.hpp" +#include "KokkosGraph_Distance1Color.hpp" +#include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" + +#ifndef _KOKKOS_PARTITIONING_IMP_HPP +#define _KOKKOS_PARTITIONING_IMP_HPP + +namespace KokkosSparse{ + +namespace Impl{ + +//Fill a view such that v(i) = i +//Does the same thing as std::iota(begin, end) +template +struct IotaFunctor +{ + IotaFunctor(View& v_) : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(const Ordinal i) const + { + v(i) = i; + } + View v; +}; + +template +struct RCM +{ + typedef typename HandleType::HandleExecSpace MyExecSpace; + typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + + typedef typename HandleType::size_type size_type; + typedef typename HandleType::nnz_lno_t nnz_lno_t; + + typedef typename lno_row_view_t::const_type const_lno_row_view_t; + typedef typename lno_row_view_t::non_const_type non_const_lno_row_view_t; + typedef typename non_const_lno_row_view_t::value_type offset_t; + + typedef typename lno_nnz_view_t::const_type const_lno_nnz_view_t; + typedef typename lno_nnz_view_t::non_const_type non_const_lno_nnz_view_t; + + typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; + typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; + typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type + + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type + + typedef nnz_lno_persistent_work_view_t nnz_view_t; + typedef Kokkos::View> single_view_t; + typedef Kokkos::View> single_view_host_t; + + typedef Kokkos::RangePolicy my_exec_space; + + typedef Kokkos::RangePolicy range_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; + typedef typename team_policy_t::member_type team_member_t ; + + typedef nnz_lno_t LO; + + RCM(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) + : numRows(numRows_), rowmap(rowmap_), colinds(colinds_) + {} + + nnz_lno_t numRows; + const_lno_row_view_t rowmap; + const_lno_nnz_view_t colinds; + + template + struct MaxDegreeFunctor + { + typedef typename std::remove_cv::type size_type; + MaxDegreeFunctor(Rowmap& rowmap_) : r(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(const size_type i, size_type& lmax) const + { + size_type ideg = r(i + 1) - r(i); + if(ideg > lmax) + lmax = ideg; + } + Rowmap r; + }; + + //simple parallel reduction to find max degree in graph + size_type find_max_degree() + { + size_type maxDeg = 0; + Kokkos::parallel_reduce(range_policy_t(0, numRows), MaxDegreeFunctor(rowmap), Kokkos::Max(maxDeg)); + //max degree should be computed as an offset_t, + //but must fit in a nnz_lno_t + return maxDeg; + } + + //radix sort keys according to their corresponding values ascending. 
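
The routine documented here is an LSD radix sort on (key, value) pairs: 4 bits per pass, counts gathered into 16 buckets, and a ping-pong between the primary arrays and the *Aux buffers. A minimal serial sketch of the same scheme (standalone and illustrative only; std:: containers, 32-bit keys, and none of the min-key bias or team/vector parallelism that the real routine adds):

#include <algorithm>
#include <cstdint>
#include <vector>

// sorts (key, value) pairs ascending by key, one 4-bit digit per pass
inline void radixSortPairs(std::vector<std::uint32_t>& keys, std::vector<int>& vals) {
  const std::size_t n = keys.size();
  if (n <= 1) return;
  std::vector<std::uint32_t> keysAux(n);
  std::vector<int> valsAux(n);
  const std::uint64_t maxKey = *std::max_element(keys.begin(), keys.end());
  int shift = 0;
  do {
    std::size_t offset[17] = {0};                             // bucket boundaries
    for (std::size_t i = 0; i < n; i++) offset[((keys[i] >> shift) & 0xF) + 1]++;
    for (int b = 0; b < 16; b++) offset[b + 1] += offset[b];  // prefix sum of counts
    for (std::size_t i = 0; i < n; i++) {                     // stable scatter into aux
      const std::size_t pos = offset[(keys[i] >> shift) & 0xF]++;
      keysAux[pos] = keys[i];
      valsAux[pos] = vals[i];
    }
    keys.swap(keysAux);
    vals.swap(valsAux);
    shift += 4;
  } while ((maxKey >> shift) != 0);
}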
+ //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting + template + KOKKOS_INLINE_FUNCTION static void + radixSortKeysAndValues(KeyType* keys, KeyType* keysAux, ValueType* values, ValueType* valuesAux, IndexType n, const member_t& mem) + { + if(n <= 1) + return; + //sort 4 bits at a time + KeyType mask = 0xF; + bool inAux = false; + //maskPos counts the low bit index of mask (0, 4, 8, ...) + IndexType maskPos = 0; + IndexType sortBits = 0; + KeyType minKey = Kokkos::ArithTraits::max(); + KeyType maxKey = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), + [=](size_type i, KeyType& lminkey) + { + if(keys[i] < lminkey) + lminkey = keys[i]; + }, Kokkos::Min(minKey)); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), + [=](size_type i, KeyType& lmaxkey) + { + if(keys[i] > lmaxkey) + lmaxkey = keys[i]; + }, Kokkos::Max(maxKey)); + //apply a bias so that key range always starts at 0 + //also invert key values here for a descending sort + Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), + [=](size_type i) + { + keys[i] -= minKey; + }); + KeyType upperBound = maxKey - minKey; + while(upperBound) + { + upperBound >>= 1; + sortBits++; + } + for(IndexType s = 0; s < (sortBits + 3) / 4; s++) + { + //Count the number of elements in each bucket + IndexType count[16] = {0}; + IndexType offset[17]; + if(!inAux) + { + for(IndexType i = 0; i < n; i++) + { + count[(keys[i] & mask) >> maskPos]++; + } + } + else + { + for(IndexType i = 0; i < n; i++) + { + count[(keysAux[i] & mask) >> maskPos]++; + } + } + offset[0] = 0; + //get offset as the prefix sum for count + for(IndexType i = 0; i < 16; i++) + { + offset[i + 1] = offset[i] + count[i]; + } + //now for each element in [lo, hi), move it to its offset in the other buffer + //this branch should be ok because whichBuf is the same on all threads + if(!inAux) + { + //copy from *Over to *Aux + for(IndexType i = 0; i < n; i++) + { + IndexType bucket = (keys[i] & mask) >> maskPos; + keysAux[offset[bucket + 1] - count[bucket]] = keys[i]; + valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; + count[bucket]--; + } + } + else + { + //copy from *Aux to *Over + for(IndexType i = 0; i < n; i++) + { + IndexType bucket = (keysAux[i] & mask) >> maskPos; + keys[offset[bucket + 1] - count[bucket]] = keysAux[i]; + values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; + count[bucket]--; + } + } + inAux = !inAux; + mask = mask << 4; + maskPos += 4; + } + //move keys/values back from aux if they are currently in aux, + //and remove bias + if(inAux) + { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), + [=](size_type i) + { + //TODO: when everything works, is safe to remove next line + //since keys (BFS visit scores) will never be needed again + keys[i] = keysAux[i]; + values[i] = valuesAux[i]; + }); + } + } + + //Functor that does breadth-first search on a sparse graph. 
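
The functor that follows produces a BFS level structure that create_reverse_map later packs into a CRS-like layout: xadj(l) gives the offset where level l begins, and adj lists each level's vertices contiguously. A serial sketch of that layout, assuming a single connected component and plain std:: containers (the real functor also restarts from an unvisited vertex when the graph is disconnected, and records levels starting at 1 because create_reverse_map expects values starting at 1):

#include <algorithm>
#include <queue>
#include <vector>

struct LevelSets { std::vector<int> xadj, adj; };

inline LevelSets bfsLevels(const std::vector<int>& rowmap, const std::vector<int>& colinds,
                           int numRows, int start) {
  std::vector<int> level(numRows, -1);
  std::vector<int> order;               // vertices in visitation order (grouped by level)
  order.reserve(numRows);
  std::queue<int> q;
  q.push(start);
  level[start] = 0;
  int maxLevel = 0;
  while (!q.empty()) {
    const int v = q.front(); q.pop();
    order.push_back(v);
    maxLevel = std::max(maxLevel, level[v]);
    for (int j = rowmap[v]; j < rowmap[v + 1]; j++) {
      const int nei = colinds[j];
      if (level[nei] < 0) { level[nei] = level[v] + 1; q.push(nei); }
    }
  }
  LevelSets ls;                         // pack into CRS form: one offset per level
  ls.xadj.assign(maxLevel + 2, 0);
  for (int v : order) ls.xadj[level[v] + 1]++;
  for (int l = 0; l <= maxLevel; l++) ls.xadj[l + 1] += ls.xadj[l];
  ls.adj = order;                       // BFS order already groups vertices by level
  return ls;
}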
+ struct BfsFunctor + { + typedef Kokkos::View> WorkView; + + BfsFunctor(WorkView& workQueue_, WorkView& scratch_, nnz_view_t& visit_, const_lno_row_view_t& rowmap_, const_lno_nnz_view_t& colinds_, single_view_t& numLevels_, nnz_view_t threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) + : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const + { + const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); + const nnz_lno_t NOT_VISITED = LNO_MAX; + const nnz_lno_t QUEUED = NOT_VISITED - 1; + int nthreads = mem.team_size(); + nnz_lno_t tid = mem.team_rank(); + auto neighborList = Kokkos::subview(scratch, tid, Kokkos::ALL()); + //active and next indicate which buffer in workQueue holds the nodes in current/next frontiers, respectively + //active, next and visitCounter are thread-local, but always kept consistent across threads + int active = 0; + int next = 1; + nnz_lno_t visitCounter = 0; + Kokkos::single(Kokkos::PerTeam(mem), + [=]() + { + workQueue(active, 0) = start; + visit(start) = QUEUED; + }); + nnz_lno_t activeQSize = 1; + nnz_lno_t nextQSize = 0; + //KK create_reverse_map() expects incoming values to start at 1 + nnz_lno_t level = 1; + //do this until all nodes have been visited and added to a level + while(visitCounter < numRows) + { + mem.team_barrier(); + //each thread works on a contiguous block of nodes in queue (for locality) + //compute in size_t to avoid possible 32-bit overflow + nnz_lno_t workStart = tid * activeQSize / nthreads; + nnz_lno_t workEnd = (tid + 1) * activeQSize / nthreads; + //the maximum work batch size (among all threads) + //the following loop contains barriers so all threads must iterate same # of times + nnz_lno_t maxBatch = (activeQSize + nthreads - 1) / nthreads; + for(nnz_lno_t loop = 0; loop < maxBatch; loop++) + { + //this thread may not actually have anything to work on (if nthreads doesn't divide qSize) + bool busy = loop < workEnd - workStart; + nnz_lno_t neiCount = 0; + nnz_lno_t process = LNO_MAX; + if(busy) + { + process = workQueue(active, workStart + loop); + offset_t rowStart = rowmap(process); + offset_t rowEnd = rowmap(process + 1); + //build a list of all non-visited neighbors + for(offset_t j = rowStart; j < rowEnd; j++) + { + nnz_lno_t col = colinds(j); + //use atomic here to guarantee neighbors are added to neighborList exactly once + if(Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) + { + //this thread is the first to see that col needs to be queued + neighborList(neiCount) = col; + neiCount++; + } + } + } + threadNeighborCounts(tid) = neiCount; + mem.team_barrier(); + size_type queueUpdateOffset = 0; + for(nnz_lno_t i = 0; i < tid; i++) + { + queueUpdateOffset += threadNeighborCounts(i); + } + //write out all updates to next queue in parallel + if(busy) + { + nnz_lno_t nextQueueIter = 0; + for(nnz_lno_t i = 0; i < neiCount; i++) + { + nnz_lno_t toQueue = neighborList(i); + visit(toQueue) = QUEUED; + workQueue(next, nextQSize + queueUpdateOffset + nextQueueIter) = toQueue; + nextQueueIter++; + } + //assign level to to process + visit(process) = level; + } + nnz_lno_t totalAdded = 0; + for(nnz_lno_t i = 0; i < nthreads; i++) + { + totalAdded += threadNeighborCounts(i); + } + nextQSize += totalAdded; + mem.team_barrier(); + } + //swap queue buffers + active = next; + next = 1 - next; + 
//all threads have a consistent value of qSize here. + //update visitCounter in preparation for next frontier + visitCounter += activeQSize; + activeQSize = nextQSize; + nextQSize = 0; + if(visitCounter < numRows && activeQSize == 0) + { + Kokkos::single(Kokkos::PerTeam(mem), + [=]() + { + //Some nodes are unreachable from start (graph not connected) + //Find an unvisited node to resume BFS + for(nnz_lno_t search = numRows - 1; search >= 0; search--) + { + if(visit(search) == NOT_VISITED) + { + workQueue(active, 0) = search; + visit(search) = QUEUED; + break; + } + } + }); + activeQSize = 1; + } + level++; + } + Kokkos::single(Kokkos::PerTeam(mem), + [=] + { + numLevels() = level - 1; + }); + } + + WorkView workQueue; + WorkView scratch; + nnz_view_t visit; + const_lno_row_view_t rowmap; + const_lno_nnz_view_t colinds; + single_view_t numLevels; + nnz_view_t threadNeighborCounts; + nnz_lno_t start; + nnz_lno_t numRows; + }; + + //parallel breadth-first search, producing level structure in (xadj, adj) form: + //xadj(level) gives index in adj where level begins + //also return the total number of levels + nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) + { + //need to know maximum degree to allocate scratch space for threads + maxDeg = find_max_degree(); + //view for storing the visit timestamps + nnz_view_t visit("BFS visited nodes", numRows); + const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); + const nnz_lno_t NOT_VISITED = LNO_MAX; + KokkosBlas::fill(visit, NOT_VISITED); + //the visit queue + //one of q1,q2 is active at a time and holds the nodes to process in next BFS level + //elements which are LNO_MAX are just placeholders (nothing to process) + Kokkos::View> workQueue("BFS queue (double buffered)", 2, numRows); + nnz_view_t threadNeighborCounts("Number of nodes to queue on each thread", nthreads); + single_view_t numLevels("# of BFS levels"); + single_view_host_t numLevelsHost("# of BFS levels"); + Kokkos::View> scratch("Scratch buffer shared by threads", nthreads, maxDeg); + Kokkos::parallel_for(team_policy_t(1, nthreads), BfsFunctor(workQueue, scratch, visit, rowmap, colinds, numLevels, threadNeighborCounts, start, numRows)); + Kokkos::deep_copy(numLevelsHost, numLevels); + //now that level structure has been computed, construct xadj/adj + KokkosKernels::Impl::create_reverse_map + (numRows, numLevelsHost(), visit, xadj, adj); + return numLevelsHost(); + } + + struct CuthillMcKeeFunctor + { + typedef Kokkos::View> ScoreView; + + CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const_lno_row_view_t& rowmap_, const_lno_nnz_view_t& colinds_, ScoreView& scores_, ScoreView& scoresAux_, nnz_view_t& visit_, nnz_view_t& xadj_, nnz_view_t& adj_, nnz_view_t& adjAux_) + : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const + { + int tid = mem.team_rank(); + int nthreads = mem.team_size(); + const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); + nnz_lno_t visitCounter = 0; + for(nnz_lno_t level = 0; level < numLevels; level++) + { + //iterate over vertices in this level and compute + //min predecessors (minimum-labeled vertices from previous level) + nnz_lno_t levelOffset = xadj(level); + nnz_lno_t levelSize = xadj(level + 1) - levelOffset; + //compute as offset_t to avoid overflow, but the upper bound on + 
//the scores is approx. numRows * maxDegree, which should be representable + nnz_lno_t workStart = tid * levelSize / nthreads; + nnz_lno_t workEnd = (tid + 1) * levelSize / nthreads; + for(nnz_lno_t i = workStart; i < workEnd; i++) + { + nnz_lno_t process = adj(levelOffset + i); + nnz_lno_t minNeighbor = LNO_MAX; + offset_t rowStart = rowmap(process); + offset_t rowEnd = rowmap(process + 1); + for(offset_t j = rowStart; j < rowEnd; j++) + { + nnz_lno_t neighbor = colinds(j); + nnz_lno_t neighborVisit = visit(neighbor); + if(neighborVisit < minNeighbor) + minNeighbor = neighborVisit; + } + scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); + } + mem.team_barrier(); + Kokkos::single(Kokkos::PerTeam(mem), + [=]() + { + radixSortKeysAndValues + (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); + }); + mem.team_barrier(); + //label all vertices (which are now in label order within their level) + for(nnz_lno_t i = workStart; i < workEnd; i++) + { + nnz_lno_t process = adj(levelOffset + i); + //visit counter increases with levels, so flip the range for the "reverse" in RCM + visit(process) = visitCounter + i; + } + visitCounter += levelSize; + } + } + + nnz_lno_t numLevels; + nnz_lno_t maxDegree; + const_lno_row_view_t rowmap; + const_lno_nnz_view_t colinds; + ScoreView scores; + ScoreView scoresAux; + nnz_view_t visit; + //The levels, stored in CRS format. + //xadj stores offsets for each level, and adj stores the rows in each level. + nnz_view_t xadj; + nnz_view_t adj; + nnz_view_t adjAux; + }; + + //Does the reversing in "reverse Cuthill-McKee") + struct OrderReverseFunctor + { + OrderReverseFunctor(nnz_view_t& visit_, nnz_lno_t numRows_) + : visit(visit_), numRows(numRows_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const + { + visit(i) = numRows - visit(i) - 1; + } + nnz_view_t visit; + nnz_lno_t numRows; + }; + + //breadth-first search, producing a reverse Cuthill-McKee ordering + nnz_view_t parallel_cuthill_mckee(nnz_lno_t start) + { + size_type nthreads = MyExecSpace::concurrency(); + if(nthreads > 64) + nthreads = 64; + #ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + { + nthreads = 256; + } + #endif + nnz_view_t xadj, adj; + nnz_lno_t maxDegree = 0; + //parallel_bfs will compute maxDegree + auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); + nnz_lno_t maxLevelSize = 0; + Kokkos::parallel_reduce(range_policy_t(0, numLevels), MaxDegreeFunctor(xadj), Kokkos::Max(maxLevelSize)); + //visit (to be returned) contains the RCM numberings of each row + nnz_view_t visit("RCM labels", numRows); + //Populate visit wth LNO_MAX so that the "min-labeled neighbor" + //is always a node in the previous level + const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); + KokkosBlas::fill(visit, LNO_MAX); + //the "score" of a node is a single value that provides an ordering equivalent + //to sorting by min predecessor and then by min degree + //reduce nthreads to be a power of 2 + Kokkos::View> scores("RCM scores for sorting", maxLevelSize); + Kokkos::View> scoresAux("RCM scores for sorting (radix sort aux)", maxLevelSize); + nnz_view_t adjAux("RCM scores for sorting (radix sort aux)", maxLevelSize); + Kokkos::parallel_for(team_policy_t(1, nthreads), CuthillMcKeeFunctor(numLevels, maxDegree, rowmap, colinds, scores, scoresAux, visit, xadj, adj, adjAux)); + //reverse the visit order (for the 'R' in RCM) + Kokkos::parallel_for(range_policy_t(0, numRows), 
OrderReverseFunctor(visit, numRows)); + return visit; + } + + template + struct MinDegreeRowFunctor + { + typedef typename Reducer::value_type Value; + MinDegreeRowFunctor(const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} + KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const + { + size_type ideg = rowmap(i + 1) - rowmap(i); + if(ideg < lval.val) + { + lval.val = ideg; + lval.loc = i; + } + } + const_lno_row_view_t rowmap; + }; + + //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) + struct OrderToClusterFunctor + { + OrderToClusterFunctor(const nnz_view_t& ordering_, nnz_view_t vertClusters_, nnz_lno_t clusterSize_) + : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const + { + vertClusters(i) = ordering(i) / clusterSize; + } + + const nnz_view_t ordering; + nnz_view_t vertClusters; + nnz_lno_t clusterSize; + }; + + //Find a peripheral node (one of minimal degree), suitable for starting RCM or BFS + nnz_lno_t find_peripheral() + { + typedef Kokkos::MinLoc MinLocReducer; + typedef typename MinLocReducer::value_type MinLocVal; + MinLocVal v; + Kokkos::parallel_reduce(range_policy_t(0, numRows), + MinDegreeRowFunctor(rowmap), MinLocReducer(v)); + return v.loc; + } + + size_type countBlockDiagEntries(nnz_view_t labels, nnz_lno_t blockSize) + { + auto labelsHost = Kokkos::create_mirror_view(labels); + Kokkos::deep_copy(labelsHost, labels); + auto rowmapHost = Kokkos::create_mirror_view(rowmap); + Kokkos::deep_copy(rowmapHost, rowmap); + auto colindsHost = Kokkos::create_mirror_view(colinds); + Kokkos::deep_copy(colindsHost, colinds); + size_type num = 0; + for(nnz_lno_t row = 0; row < numRows; row++) + { + nnz_lno_t rowLabel = labelsHost(row); + for(size_type j = rowmapHost(row); j < rowmapHost(row + 1); j++) + { + nnz_lno_t colLabel = labelsHost(colindsHost(j)); + if(rowLabel / blockSize == colLabel / blockSize) + num++; + } + } + return num; + } + + nnz_view_t cuthill_mckee() + { + nnz_lno_t periph = find_peripheral(); + //run Cuthill-McKee BFS from periph + auto ordering = parallel_cuthill_mckee(periph); + return ordering; + } + + nnz_view_t rcm() + { + nnz_view_t cm = cuthill_mckee(); + //reverse the visit order (for the 'R' in RCM) + Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(cm, numRows)); + return cm; + } + + nnz_view_t cm_cluster(nnz_lno_t clusterSize) + { + nnz_view_t cm = cuthill_mckee(); + nnz_view_t vertClusters("Vert to cluster", numRows); + OrderToClusterFunctor makeClusters(cm, vertClusters, clusterSize); + Kokkos::parallel_for(range_policy_t(0, numRows), makeClusters); + return vertClusters; + } +}; + +template +struct BalloonClustering +{ + typedef typename HandleType::HandleExecSpace MyExecSpace; + typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + + typedef typename HandleType::size_type size_type; + typedef typename HandleType::nnz_lno_t nnz_lno_t; + + typedef typename lno_row_view_t::value_type offset_t; + + typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; + typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; + typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type + + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + 
typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type + + typedef nnz_lno_persistent_work_view_t nnz_view_t; + typedef Kokkos::View float_view_t; + //typedef Kokkos::View> single_view_t; + //typedef Kokkos::View> single_view_host_t; + + typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::Bitset bitset_t; + + typedef Kokkos::RangePolicy range_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; + typedef typename team_policy_t::member_type team_member_t ; + + BalloonClustering(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) + : numRows(numRows_), rowmap(rowmap_), colinds(colinds_), randPool(0xDEADBEEF) + {} + + nnz_lno_t numRows; + lno_row_view_t rowmap; + lno_nnz_view_t colinds; + + typedef Kokkos::Random_XorShift64_Pool RandPool; + RandPool randPool; + + struct InitRootsTag {}; //select roots; set their distances to 0 + struct RandomFillTag {}; //assign non-roots to random clusters, and assign large random distances + struct UpdatePressureTag {}; + struct BalloonTag {}; //run the "balloon" procedure, where each cluster tries to inflate up to clusterSize + + struct BalloonFunctor + { + BalloonFunctor(nnz_view_t& vertClusters_, nnz_view_t& clusterCounts_, nnz_view_t& distances_, lno_row_view_t& row_map_, lno_nnz_view_t& col_inds_, float_view_t pressure_, nnz_lno_t clusterSize_, RandPool& randPool_) + : vertClusters(vertClusters_), clusterCounts(clusterCounts_), distances(distances_), row_map(row_map_), col_inds(col_inds_), pressure(pressure_), clusterSize(clusterSize_), numRows(row_map.extent(0) - 1), vertLocks(numRows), randPool(randPool_) + { + numClusters = (numRows + clusterSize - 1) / clusterSize; + avgClusterSize = (double) numRows / numClusters; + iter = 0; + } + + //Run init version over the number of clusters. 
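
The operator() overloads that follow are selected through Kokkos work tags: the same functor object is launched several times, and the tag type named in the RangePolicy picks which phase runs (root initialization, random fill, pressure update, balloon expansion). A minimal illustration of that dispatch pattern with a hypothetical two-phase functor; the InitRootsTag overload of the real functor resumes right after this sketch:

#include <Kokkos_Core.hpp>

struct PhasedFunctor {
  struct PhaseA {};  // work tags: empty types that select an operator() overload
  struct PhaseB {};
  Kokkos::View<int*> data;

  KOKKOS_INLINE_FUNCTION void operator()(const PhaseA, const int i) const { data(i) = i; }
  KOKKOS_INLINE_FUNCTION void operator()(const PhaseB, const int i) const { data(i) *= 2; }
};

inline void runPhases(Kokkos::View<int*> data) {
  PhasedFunctor f{data};
  const int n = static_cast<int>(data.extent(0));
  // same functor, different phases chosen by the policy's work tag
  Kokkos::parallel_for(Kokkos::RangePolicy<PhasedFunctor::PhaseA>(0, n), f);
  Kokkos::parallel_for(Kokkos::RangePolicy<PhasedFunctor::PhaseB>(0, n), f);
}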
+ KOKKOS_INLINE_FUNCTION void operator()(const InitRootsTag, const nnz_lno_t i) const + { + nnz_lno_t root; + auto state = randPool.get_state(); + do + { + root = state.rand(numRows); + } + while(!Kokkos::atomic_compare_exchange_strong(&vertClusters(root), numClusters, i)); + randPool.free_state(state); + distances(root) = 0; + pressure(root) = 1; + } + + KOKKOS_INLINE_FUNCTION void operator()(const RandomFillTag, const nnz_lno_t i) const + { + if(vertClusters(i) == numClusters) + { + auto state = randPool.get_state(); + nnz_lno_t cluster = state.rand(numClusters); + randPool.free_state(state); + vertClusters(i) = cluster; + Kokkos::atomic_increment(&clusterCounts(cluster)); + distances(i) = numRows; + pressure(i) = 0.1; + } + }; + + KOKKOS_INLINE_FUNCTION void operator()(const UpdatePressureTag, const nnz_lno_t i) const + { + nnz_lno_t cluster = vertClusters(i); + if(cluster == numClusters) + { + //unassigned vertices have 0 pressure + return; + } + //count the number of neighbors in the same cluster + nnz_lno_t sameClusterNeighbors = 0; + for(size_type j = row_map(i); j < row_map(i + 1); j++) + { + nnz_lno_t nei = col_inds(j); + if(nei != i && vertClusters(nei) == cluster) + { + //while we're at it, minimize distance to root as in Djikstra's + if(distances(nei) + 1 < distances(i)) + distances(i) = distances(nei) + 1; + sameClusterNeighbors++; + } + } + nnz_lno_t curSize = clusterCounts(cluster); + //update pressure, if cluster is undersized + nnz_lno_t shortage = clusterSize - curSize; + float pressureChange = (shortage * shortage * (1.0f + 0.2f * sameClusterNeighbors)) / (1.0f + distances(i)); + if(shortage > 0) + pressure(i) += pressureChange; + } + + KOKKOS_INLINE_FUNCTION void operator()(const BalloonTag, const nnz_lno_t i, double& sizeDeviation) const + { + nnz_lno_t cluster = vertClusters(i); + if(cluster == numClusters) + return; + //find the weakest affinity neighbor + nnz_lno_t weakNei = numRows; + float weakestPressure = pressure(i); + nnz_lno_t weakNeiCluster = numClusters; + for(size_type j = row_map(i); j < row_map(i + 1); j++) + { + nnz_lno_t nei = col_inds(j); + //to annex another vertex, it must be a non-root in a different cluster + if(nei != i && vertClusters(nei) != cluster && pressure(nei) < weakestPressure && distances(nei) != 0) + { + weakNei = nei; + weakestPressure = pressure(nei); + weakNeiCluster = vertClusters(nei); + } + } + if(weakNei != numRows && clusterCounts(cluster) < clusterSize) + { + //this cluster will take over weakNei + if(vertLocks.set(i)) + { + if(vertLocks.set(weakNei)) + { + Kokkos::atomic_increment(&clusterCounts(cluster)); + if(weakNeiCluster != numClusters) + Kokkos::atomic_decrement(&clusterCounts(weakNeiCluster)); + vertClusters(weakNei) = cluster; + pressure(i) -= pressure(weakNei); + pressure(weakNei) = pressure(i); + distances(weakNei) = distances(i) + 1; + vertLocks.reset(weakNei); + } + vertLocks.reset(i); + } + } + if(distances(i) == 0) + { + //roots update sizeDeviation on behalf of the cluster + double deviation = clusterCounts(cluster) - avgClusterSize; + sizeDeviation += deviation * deviation; + } + } + + nnz_view_t vertClusters; + nnz_view_t clusterCounts; + nnz_view_t distances; + //row_map/col_inds of input graph (read-only) + lno_row_view_t row_map; + lno_nnz_view_t col_inds; + float_view_t pressure; + //constants + nnz_lno_t clusterSize; + nnz_lno_t numClusters; + nnz_lno_t numRows; + nnz_lno_t iter; + Kokkos::Bitset vertLocks; + RandPool randPool; + double avgClusterSize; + }; + + nnz_view_t run(nnz_lno_t clusterSize) + { + 
+  nnz_view_t run(nnz_lno_t clusterSize)
+  {
+    nnz_view_t vertClusters("Vertex cluster labels", numRows);
+    //For the sake of completeness, handle the clusterSize = 1 case by generating a trivial (identity) clustering.
+    if(clusterSize == 1)
+    {
+      Kokkos::parallel_for(Kokkos::RangePolicy<MyExecSpace>(0, numRows), IotaFunctor(vertClusters));
+      return vertClusters;
+    }
+    nnz_lno_t numClusters = (numRows + clusterSize - 1) / clusterSize;
+    nnz_view_t distances("Root distances", numRows);
+    nnz_view_t clusterCounts("Vertices per cluster", numClusters);
+    float_view_t pressure("Cluster pressure", numRows);
+    Kokkos::deep_copy(clusterCounts, 1);
+    Kokkos::deep_copy(vertClusters, (nnz_lno_t) numClusters);
+    Kokkos::deep_copy(distances, numRows);
+    BalloonFunctor funct(vertClusters, clusterCounts, distances, rowmap, colinds, pressure, clusterSize, randPool);
+    Kokkos::Impl::Timer globalTimer;
+    Kokkos::Impl::Timer timer;
+    timer.reset();
+    Kokkos::parallel_for(Kokkos::RangePolicy<MyExecSpace, InitRootsTag>(0, numClusters), funct);
+#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
+    std::cout << "Creating roots: " << timer.seconds() << '\n';
+    timer.reset();
+#endif
+    double stoppingRMS = sqrt(numClusters * (0.02 * clusterSize) * (0.02 * clusterSize));
+    double deviation = (double) numClusters * (clusterSize - 1) * (clusterSize - 1);
+    int regressions = 0;
+    while(true)
+    {
+      Kokkos::parallel_for(Kokkos::RangePolicy<MyExecSpace, UpdatePressureTag>(0, numRows), funct);
+      double iterDeviation = 0;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<MyExecSpace, BalloonTag>(0, numRows), funct, Kokkos::Sum<double>(iterDeviation));
+      if(iterDeviation <= stoppingRMS || iterDeviation == deviation)
+      {
+        //got within 2% RMS of optimal, or stagnated
+        deviation = iterDeviation;
+        break;
+      }
+      else if(iterDeviation >= deviation)
+      {
+        regressions++;
+        if(regressions == 3)
+        {
+          deviation = iterDeviation;
+          break;
+        }
+      }
+      deviation = iterDeviation;
+      funct.iter++;
+    }
+#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
+    std::cout << "Expanding clusters for " << funct.iter << " iterations: " << timer.seconds() << '\n';
+    timer.reset();
+#endif
+    Kokkos::parallel_for(Kokkos::RangePolicy<MyExecSpace, RandomFillTag>(0, numRows), funct);
+#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+    MyExecSpace().fence();
+    std::cout << "Randomly assigning clusters to remaining: " << timer.seconds() << '\n';
+    std::cout << "Clustering total: " << globalTimer.seconds() << "\n\n";
+#endif
+    return vertClusters;
+  }
+};
+
+}} //KokkosSparse::Impl
+
+#endif
+
diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
index 440e348564..fb55eacd6e 100644
--- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
+++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
@@ -51,10 +51,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include "KokkosSparse_gauss_seidel.hpp"
+#include "KokkosSparse_partitioning_impl.hpp"

 #ifndef kokkos_complex_double
 #define kokkos_complex_double Kokkos::complex<double>

@@ -81,11 +83,9 @@ int run_gauss_seidel_1(
 ){
   typedef typename crsMat_t::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type lno_view_t;
-  typedef typename graph_t::entries_type lno_nnz_view_t;
+  typedef typename graph_t::entries_type lno_nnz_view_t;
   typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-
-  typedef typename lno_view_t::value_type size_type;
   typedef typename lno_nnz_view_t::value_type lno_t;
   typedef typename scalar_view_t::value_type scalar_t;

@@ -248,14 +248,182 @@ void test_gauss_seidel(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_
   //device::execution_space::finalize();
 }

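+//Test cluster Gauss-Seidel: for each clustering algorithm and a range of cluster sizes, run one
+//symmetric G-S sweep on a diagonally dominant system with a zero initial guess, and check that
+//the scaled residual norm ||Ax - b|| / ||b|| is small.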
+template <typename scalar_t, typename lno_t, typename size_type, typename device>
+void test_cluster_sgs(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance)
+{
+  using namespace Test;
+  typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type> crsMat_t;
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
+  typedef typename graph_t::row_map_type lno_view_t;
+  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
+  typedef typename device::execution_space exec_space;
+  typedef KokkosKernelsHandle
+    <size_type, lno_t, scalar_t, exec_space, typename device::memory_space, typename device::memory_space>
+    KernelHandle;
+  crsMat_t A = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix<crsMat_t>(numRows, numRows, nnz, row_size_variance, bandwidth);
+  //create a randomized RHS vector (b)
+  scalar_view_t b("b", numRows);
+  {
+    Kokkos::View<scalar_t*, Kokkos::HostSpace> bHost("b (host)", numRows);
+    for(lno_t i = 0; i < numRows; i++)
+    {
+      bHost(i) = (double) rand() / RAND_MAX;
+    }
+    Kokkos::deep_copy(b, bHost);
+  }
+  const double bnorm = KokkosBlas::nrm2(b);
+  for(int algoCount = 0; algoCount < (int) KokkosSparse::NUM_CLUSTERING_ALGORITHMS; algoCount++)
+  {
+    KokkosSparse::ClusteringAlgorithm algo = (KokkosSparse::ClusteringAlgorithm) algoCount;
+    //try a bunch of powers of 2 for cluster sizes, up to about the
+    //point where there are very few clusters (8), even though this
+    //is not the intended use case.
+    std::vector<lno_t> clusterSizes;
+    for(lno_t i = 1; i <= numRows / 8; i <<= 1)
+      clusterSizes.push_back(i);
+    const int niters = 1;
+    for(size_t test = 0; test < clusterSizes.size(); test++)
+    {
+      auto clusterSize = clusterSizes[test];
+      KernelHandle kh;
+      if(clusterSize == 1)
+        kh.create_gs_handle(KokkosSparse::GS_DEFAULT);
+      else
+        kh.create_gs_handle(algo, clusterSize);
+      //only need to do G-S setup (symbolic/numeric) once
+      Kokkos::Impl::Timer timer;
+      KokkosSparse::Experimental::gauss_seidel_symbolic
+        (&kh, numRows, numRows, A.graph.row_map, A.graph.entries, false);
+      KokkosSparse::Experimental::gauss_seidel_numeric
+        (&kh, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, false);
+      #ifdef CLUSTER_VERBOSE
+      std::cout << "Cluster size " << clusterSize << " setup time: " << timer.seconds() << " s\n";
+      #endif
+      timer.reset();
+      typedef Kokkos::Details::ArithTraits<scalar_t> KAT;
+      scalar_t alpha = KAT::one();
+      scalar_t beta = -KAT::one();
+      //create the solution vector (x); SGS zeroes it out as the initial guess (init_zero_x_vector = true below)
+      scalar_view_t x(Kokkos::ViewAllocateWithoutInitializing("x"), numRows);
+      KokkosSparse::Experimental::symmetric_gauss_seidel_apply
+        (&kh, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, x, b, true, true, 1.0, niters);
+      //compute the residual: res = A*x - b
+      scalar_view_t res("Ax-b", numRows);
+      Kokkos::deep_copy(res, b);
+      KokkosSparse::spmv
+        ("N", alpha, A, x, beta, res, KokkosSparse::RANK_ONE());
+      double scaledNorm = KokkosBlas::nrm2(res) / bnorm;
+      //Sanity check the result. The input matrix is strictly diagonally dominant, so the scaled
+      //residual norm (after one sweep from a zero initial guess) should be positive but small.
+      EXPECT_TRUE(scaledNorm > 0);
+      EXPECT_TRUE(scaledNorm < 0.05);
+      #ifdef CLUSTER_VERBOSE
+      std::cout << "Scaled residual norm: " << scaledNorm << " (algo = " << KokkosSparse::getClusterAlgoName(algo) << ", cluster size = " << clusterSize << ")\n";
+      std::cout << "Cluster size " << clusterSize << " apply time: " << timer.seconds() << " s\n";
+      std::cout << "Cluster size " << clusterSize << " scaled residual norm after " << niters << " sweep(s): " << scaledNorm << '\n';
+      std::cout << "Cluster size " << clusterSize << " proportion of residual eliminated: " << 1.0 - scaledNorm << std::endl;
+      #endif
+      kh.destroy_gs_handle();
+    }
+  }
+}
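+
+//Test RCM: build a random graph, symmetrize it, compute the reverse Cuthill-McKee ordering,
+//and verify that the returned ordering is a permutation of the rows.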
+template <typename scalar_t, typename lno_t, typename size_type, typename device>
+void test_rcm(lno_t numRows, size_type nnzPerRow, lno_t bandwidth)
+{
+  using namespace Test;
+  typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type> crsMat_t;
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type::non_const_type lno_row_view_t;
+  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
+  typedef KokkosKernelsHandle
+    <size_type, lno_t, scalar_t, typename device::execution_space, typename device::memory_space, typename device::memory_space>
+    KernelHandle;
+  srand(245);
+  size_type nnzTotal = nnzPerRow * numRows;
+  lno_t nnzVariance = nnzPerRow / 4;
+  crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(numRows, numRows, nnzTotal, nnzVariance, bandwidth);
+  //RCM operates on a symmetric graph, so symmetrize the random input first
+  lno_row_view_t symRowmap;
+  lno_nnz_view_t symEntries;
+  KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap
+    (numRows, A.graph.row_map, A.graph.entries, symRowmap, symEntries);
+  typedef KokkosSparse::Impl::RCM rcm_t;
+  rcm_t rcm(numRows, symRowmap, symEntries);
+  lno_nnz_view_t rcmOrder = rcm.rcm();
+  //perm(i) = the node with timestamp i
+  //make sure that perm is in fact a permutation (contains each row ID exactly once)
+  Kokkos::View<lno_t*, Kokkos::HostSpace> rcmHost("RCM row ordering", numRows);
+  Kokkos::deep_copy(rcmHost, rcmOrder);
+  std::set<lno_t> rowSet;
+  for(lno_t i = 0; i < numRows; i++)
+    rowSet.insert(rcmHost(i));
+  if((lno_t) rowSet.size() != numRows)
+  {
+    ADD_FAILURE() << "RCM ordering is not a permutation: got only " << rowSet.size() << " unique row IDs out of " << numRows;
+    return;
+  }
+  //TODO: make a new CRS graph based on permuting the rows and columns of A
+}
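+
+//Test balloon clustering: for a range of target cluster sizes, verify the hard constraints of the
+//clustering: every vertex gets a label in [0, numClusters), and every cluster is non-empty.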
+template <typename scalar_t, typename lno_t, typename size_type, typename device>
+void test_balloon_clustering(lno_t numRows, size_type nnzPerRow, lno_t bandwidth)
+{
+  using namespace Test;
+  typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type> crsMat_t;
+  typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  typedef typename graph_t::row_map_type const_lno_row_view_t;
+  typedef typename graph_t::entries_type const_lno_nnz_view_t;
+  typedef typename graph_t::row_map_type::non_const_type lno_row_view_t;
+  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
+  typedef KokkosKernelsHandle
+    <size_type, lno_t, scalar_t, typename device::execution_space, typename device::memory_space, typename device::memory_space>
+    KernelHandle;
+  srand(245);
+  size_type nnzTotal = nnzPerRow * numRows;
+  lno_t nnzVariance = nnzPerRow / 4;
+  crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix<crsMat_t>(numRows, numRows, nnzTotal, nnzVariance, bandwidth);
+  //balloon clustering expects a symmetric graph
+  lno_row_view_t symRowmap;
+  lno_nnz_view_t symEntries;
+  KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap
+    (numRows, A.graph.row_map, A.graph.entries, symRowmap, symEntries);
+  KokkosSparse::Impl::BalloonClustering balloon(numRows, symRowmap, symEntries);
+  for(int clusterSize = 1; clusterSize <= numRows / 16; clusterSize = std::ceil(clusterSize * 1.3))
+  {
+    auto vertClusters = balloon.run(clusterSize);
+    //validate results: make sure cluster labels are in bounds, and that the number of clusters is correct
+    auto vertClustersHost = Kokkos::create_mirror_view(vertClusters);
+    Kokkos::deep_copy(vertClustersHost, vertClusters);
+    lno_t numClusters = (numRows + clusterSize - 1) / clusterSize;
+    //check the hard constraints of the clustering
+    std::set<lno_t> uniqueClusterIDs;
+    for(lno_t i = 0; i < numRows; i++)
+    {
+      EXPECT_TRUE(vertClustersHost(i) >= 0);
+      EXPECT_TRUE(vertClustersHost(i) < numClusters);
+      uniqueClusterIDs.insert(vertClustersHost(i));
+    }
+    EXPECT_TRUE(uniqueClusterIDs.size() == static_cast<size_t>(numClusters));
+  }
+}

 #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
 TEST_F( TestCategory, sparse ## _ ## gauss_seidel ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
   test_gauss_seidel<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 10000 * 30, 200, 10); \
+} \
+TEST_F( TestCategory, sparse ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
+  test_rcm<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 50, 2000); \
+} \
+TEST_F( TestCategory, sparse ## _ ## balloon_clustering ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
+  test_balloon_clustering<SCALAR, ORDINAL, OFFSET, DEVICE>(5000, 100, 2000); \
+} \
+TEST_F( TestCategory, sparse ## _ ## cluster_sgs ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
+  test_cluster_sgs<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 10000 * 30, 200, 10); \
 }
-
 #if (defined (KOKKOSKERNELS_INST_DOUBLE) \
  && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \
  && defined (KOKKOSKERNELS_INST_OFFSET_INT) ) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))

@@ -353,6 +521,3 @@ TEST_F( TestCategory, sparse ## _ ## gauss_seidel ## _ ## SCALAR ## _ ## ORDINAL
 EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)

 #endif
-
-
-