diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index 718922341dac31..37bb9e7986cd8a 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,8 +144,7 @@ struct SparseCompilerOptions
 
   /// Projects out the options for `createSparsificationPass`.
   SparsificationOptions sparsificationOptions() const {
-    return SparsificationOptions(parallelization, enableGPULibgen,
-                                 enableRuntimeLibrary);
+    return SparsificationOptions(parallelization, enableRuntimeLibrary);
   }
 
   /// Projects out the options for `createConvertVectorToLLVMPass`.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
index b124364f8cb1f0..e93e2aefb344fd 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -74,15 +74,11 @@ std::unique_ptr<Pass> createPreSparsificationRewritePass();
 
 /// Options for the Sparsification pass.
 struct SparsificationOptions {
-  SparsificationOptions(SparseParallelizationStrategy p, bool gpuLibgen,
-                        bool enableRT)
-      : parallelizationStrategy(p), enableGPULibgen(gpuLibgen),
-        enableRuntimeLibrary(enableRT) {}
+  SparsificationOptions(SparseParallelizationStrategy p, bool enableRT)
+      : parallelizationStrategy(p), enableRuntimeLibrary(enableRT) {}
   SparsificationOptions()
-      : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
-                              true) {}
+      : SparsificationOptions(SparseParallelizationStrategy::kNone, true) {}
   SparseParallelizationStrategy parallelizationStrategy;
-  bool enableGPULibgen;
   bool enableRuntimeLibrary;
 };
 
@@ -196,7 +192,8 @@ void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
                                      bool enableRT);
 
 std::unique_ptr<Pass> createSparseGPUCodegenPass();
-std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
+std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads,
+                                                 bool enableRT);
 
 //===----------------------------------------------------------------------===//
 // The SparseStorageSpecifierToLLVM pass.
@@ -225,7 +222,7 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass(
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32);
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen);
 
 //===----------------------------------------------------------------------===//
 // Registration.
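Note for downstream users of the C++ API: SparsificationOptions loses its enableGPULibgen flag, so call sites migrate to requesting libgen through the GPU codegen pass instead. A minimal sketch under the new signatures (the pipeline function itself is hypothetical, not part of this patch):

    // Hypothetical downstream pipeline, for illustration only.
    #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
    #include "mlir/Pass/PassManager.h"

    using namespace mlir;

    static void buildMySparsePipeline(OpPassManager &pm) {
      // Before: SparsificationOptions(parallelization, /*gpuLibgen=*/true,
      // /*enableRT=*/true) baked GPU libgen into sparsification itself.
      // After: numThreads == 0 selects the library path, scheduled ahead of
      // the regular sparsification pass.
      pm.addPass(createSparseGPUCodegenPass(/*numThreads=*/0,
                                            /*enableRT=*/true));
      pm.addPass(createSparsificationPass(SparsificationOptions(
          SparseParallelizationStrategy::kNone, /*enableRT=*/true)));
    }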
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
index dde138b4c99afe..f38779ed9ed2b8 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -105,7 +105,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
     "affine::AffineDialect",
     "arith::ArithDialect",
     "bufferization::BufferizationDialect",
-    "gpu::GPUDialect",
     "LLVM::LLVMDialect",
     "linalg::LinalgDialect",
     "memref::MemRefDialect",
@@ -131,9 +130,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
              clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
                         "any-storage-any-loop",
                         "Enable sparse parallelization for any storage and loop."))}]>,
-    Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
-           "false",
-           "Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
     Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
            "true", "Enable runtime library for manipulating sparse tensors">,
   ];
@@ -368,7 +364,9 @@ def SparseVectorization : Pass<"sparse-vectorization", "ModuleOp"> {
 def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
   let summary = "Generates GPU code during sparsification";
   let description = [{
-    Enables the sparsifier to use GPU acceleration.
+    Enables the sparsifier to use GPU acceleration. When the number of GPU
+    threads is set to zero, the pass tries to enable GPU acceleration by
+    means of direct library calls (like cuSPARSE).
   }];
   let constructor = "mlir::createSparseGPUCodegenPass()";
   let dependentDialects = [
@@ -381,7 +379,9 @@ def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
     "sparse_tensor::SparseTensorDialect",
   ];
   let options = [
-    Option<"numThreads", "num_threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"numThreads", "num-threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
+           "true", "Enable runtime library for manipulating sparse tensors">,
   ];
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 3ed8bba2514aaf..6ee48482ad6ef8 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -31,7 +31,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
     OpPassManager &pm, const SparseCompilerOptions &options) {
+  // Rewrite named linalg ops into generic ops.
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizationPass());
+
+  // Sparsification and bufferization mini-pipeline.
   pm.addPass(createSparsificationAndBufferizationPass(
       getBufferizationOptionsForSparsification(
           options.testBufferizationAnalysisOnly),
       options.sparsificationOptions(), options.createSparseDeallocs,
@@ -39,10 +42,14 @@ void mlir::sparse_tensor::buildSparseCompiler(
       options.enableRuntimeLibrary, options.enableBufferInitialization,
       options.vectorLength,
       /*enableVLAVectorization=*/options.armSVE,
-      /*enableSIMDIndex32=*/options.force32BitVectorIndices));
+      /*enableSIMDIndex32=*/options.force32BitVectorIndices,
+      options.enableGPULibgen));
+
+  // Bail early for test setup.
   if (options.testBufferizationAnalysisOnly)
     return;
 
+  // Storage specifier lowering and bufferization wrap-up.
   pm.addPass(createStorageSpecifierToLLVMPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(
@@ -72,8 +79,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
 
   pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
   pm.addPass(createConvertMathToLibmPass());
   pm.addPass(createConvertComplexToLibmPass());
+  // Repeat convert-vector-to-llvm.
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
+
   pm.addPass(createConvertComplexToLLVMPass());
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
   pm.addPass(createConvertFuncToLLVMPass());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index c139fcc8135154..375e10f9068e43 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -82,19 +82,15 @@ struct SparsificationPass
   SparsificationPass(const SparsificationPass &pass) = default;
   SparsificationPass(const SparsificationOptions &options) {
     parallelization = options.parallelizationStrategy;
-    enableGPULibgen = options.enableGPULibgen;
     enableRuntimeLibrary = options.enableRuntimeLibrary;
   }
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     // Translate strategy flags to strategy options.
-    SparsificationOptions options(parallelization, enableGPULibgen,
-                                  enableRuntimeLibrary);
-    // Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
+    SparsificationOptions options(parallelization, enableRuntimeLibrary);
+    // Apply sparsification and cleanup rewriting.
     RewritePatternSet patterns(ctx);
-    if (enableGPULibgen)
-      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
     populateSparsificationPatterns(patterns, options);
     scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
@@ -323,12 +319,18 @@ struct SparseGPUCodegenPass
     : public impl::SparseGPUCodegenBase<SparseGPUCodegenPass> {
   SparseGPUCodegenPass() = default;
   SparseGPUCodegenPass(const SparseGPUCodegenPass &pass) = default;
-  SparseGPUCodegenPass(unsigned nT) { numThreads = nT; }
+  SparseGPUCodegenPass(unsigned nT, bool enableRT) {
+    numThreads = nT;
+    enableRuntimeLibrary = enableRT;
+  }
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
-    populateSparseGPUCodegenPatterns(patterns, numThreads);
+    if (numThreads == 0)
+      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+    else
+      populateSparseGPUCodegenPatterns(patterns, numThreads);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }
 };
@@ -457,8 +459,9 @@ std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass() {
   return std::make_unique<SparseGPUCodegenPass>();
 }
 
-std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads) {
-  return std::make_unique<SparseGPUCodegenPass>(numThreads);
+std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads,
+                                                       bool enableRT) {
+  return std::make_unique<SparseGPUCodegenPass>(numThreads, enableRT);
 }
 
 std::unique_ptr<Pass> mlir::createStorageSpecifierToLLVMPass() {
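Note: with the dispatch above, a single pass now drives both GPU paths. A hedged sketch of selecting between them from C++ (the helper function is illustrative, not part of this patch):

    #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
    #include "mlir/Pass/PassManager.h"

    // Illustrative helper: numThreads == 0 rewrites sparse kernels into
    // library calls (e.g. cuSPARSE), honoring enableRT exactly as the removed
    // enable-gpu-libgen path did; any nonzero value generates actual GPU
    // kernels instead (1024 is the pass default).
    static void addSparseGPU(mlir::OpPassManager &pm, bool useLibgen,
                             bool enableRT) {
      unsigned numThreads = useLibgen ? 0 : 1024;
      pm.addPass(mlir::createSparseGPUCodegenPass(numThreads, enableRT));
    }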
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
index e20b98add19adb..94b25a358e804a 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
@@ -65,7 +65,7 @@ class SparsificationAndBufferizationPass
       const SparsificationOptions &sparsificationOptions,
       bool createSparseDeallocs, bool enableRuntimeLibrary,
       bool enableBufferInitialization, unsigned vectorLength,
-      bool enableVLAVectorization, bool enableSIMDIndex32)
+      bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen)
       : bufferizationOptions(bufferizationOptions),
         sparsificationOptions(sparsificationOptions),
         createSparseDeallocs(createSparseDeallocs),
@@ -73,7 +73,8 @@ class SparsificationAndBufferizationPass
         enableBufferInitialization(enableBufferInitialization),
         vectorLength(vectorLength),
         enableVLAVectorization(enableVLAVectorization),
-        enableSIMDIndex32(enableSIMDIndex32) {}
+        enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) {
+  }
 
   /// Bufferize all dense ops. This assumes that no further analysis is needed
   /// and that all required buffer copies were already inserted by
@@ -139,6 +140,8 @@ class SparsificationAndBufferizationPass
     // of `bufferization.alloc_tensor` ops.
     {
       OpPassManager pm("builtin.module");
+      if (enableGPULibgen)
+        pm.addPass(createSparseGPUCodegenPass(0, enableRuntimeLibrary));
      pm.addPass(createSparseReinterpretMapPass(ReinterpretMapScope::kAll));
       pm.addPass(createSparsificationPass(sparsificationOptions));
       pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
@@ -177,6 +180,7 @@ class SparsificationAndBufferizationPass
   unsigned vectorLength;
   bool enableVLAVectorization;
   bool enableSIMDIndex32;
+  bool enableGPULibgen;
 };
 
 } // namespace sparse_tensor
@@ -210,7 +214,8 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
      /*enableBufferInitialization=*/false,
       /*vectorLength=*/0,
       /*enableVLAVectorization=*/false,
-      /*enableSIMDIndex32=*/false);
+      /*enableSIMDIndex32=*/false,
+      /*enableGPULibgen=*/false);
 }
 
 std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
@@ -218,10 +223,10 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32) {
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) {
   return std::make_unique<
       mlir::sparse_tensor::SparsificationAndBufferizationPass>(
       bufferizationOptions, sparsificationOptions, createSparseDeallocs,
       enableRuntimeLibrary, enableBufferInitialization, vectorLength,
-      enableVLAVectorization, enableSIMDIndex32);
+      enableVLAVectorization, enableSIMDIndex32, enableGPULibgen);
 }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
index 73161bdb135ca4..34189d329cc41e 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
 
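The RUN-line updates here and below are mechanical: --sparsification="enable-gpu-libgen" becomes --sparse-gpu-codegen="num-threads=0" (the option was also renamed from num_threads to num-threads). The same pipeline can be assembled programmatically by parsing the textual form; a sketch, assuming the SparseTensor passes have been registered:

    #include "mlir/Pass/PassManager.h"
    #include "mlir/Pass/PassRegistry.h"

    // Append the new textual spelling to a pass manager; returns failure if
    // the pipeline string does not parse (e.g. passes not registered).
    static mlir::LogicalResult addLibgenTextually(mlir::OpPassManager &pm) {
      return mlir::parsePassPipeline(
          "sparse-gpu-codegen{num-threads=0 enable-runtime-library=true}", pm);
    }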
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
index 9973050d40799d..f584977e96415b 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 // CHECK-LABEL: func.func @matmul(
 // CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf16>,
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
index 50ff81cb6ecd0a..bd0bf6927b0da4 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
index 221bda47291ebf..ce7af53bb34627 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #trait_sampled_dense_dense = {
   indexing_maps = [
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
index 6afb626625cfe2..dd79a9017f7f4c 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #BSR = #sparse_tensor.encoding<{
   map = (i, j) -> (
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
index 027c9fda5da90e..7ac37c1c4950c0 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
old mode 100755
new mode 100644
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
old mode 100755
new mode 100644
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
index 6c3d67e2ea78dc..735dc8cb4bb361 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
@@ -85,32 +85,30 @@ module {
   // A kernel that computes a BSR sampled dense matrix matrix multiplication
   // using a "spy" function and in-place update of the sampling sparse matrix.
   //
-  // TODO: re-enable the following test.
-  //
-  // func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
-  //                        %arga: tensor<?x?xf32>,
-  //                        %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
-  //   %result = linalg.generic #trait_SDDMM
-  //     ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
-  //     outs(%args: tensor<?x?xf32, #BSR>) {
-  //       ^bb(%a: f32, %b: f32, %s: f32):
-  //         %f0 = arith.constant 0.0 : f32
-  //         %u = sparse_tensor.unary %s : f32 to f32
-  //           present={
-  //             ^bb0(%p: f32):
-  //               %mul = arith.mulf %a, %b : f32
-  //               sparse_tensor.yield %mul : f32
-  //           }
-  //           absent={}
-  //         %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
-  //           ^bb0(%p: f32, %q: f32):
-  //             %add = arith.addf %p, %q : f32
-  //             sparse_tensor.yield %add : f32
-  //         }
-  //         linalg.yield %r : f32
-  //     } -> tensor<?x?xf32, #BSR>
-  //   return %result : tensor<?x?xf32, #BSR>
-  // }
+  func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
+                         %arga: tensor<?x?xf32>,
+                         %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
+    %result = linalg.generic #trait_SDDMM
+      ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%args: tensor<?x?xf32, #BSR>) {
+        ^bb(%a: f32, %b: f32, %s: f32):
+          %f0 = arith.constant 0.0 : f32
+          %u = sparse_tensor.unary %s : f32 to f32
+            present={
+              ^bb0(%p: f32):
+                %mul = arith.mulf %a, %b : f32
+                sparse_tensor.yield %mul : f32
+            }
+            absent={}
+          %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
+            ^bb0(%p: f32, %q: f32):
+              %add = arith.addf %p, %q : f32
+              sparse_tensor.yield %add : f32
+          }
+          linalg.yield %r : f32
+      } -> tensor<?x?xf32, #BSR>
+    return %result : tensor<?x?xf32, #BSR>
+  }
 
   func.func private @getTensorFilename(index) -> (!Filename)
 
@@ -153,15 +151,15 @@
     //
     %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
     %m_csr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
-    // %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
+    %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
 
     // Call the kernel.
     %0 = call @SDDMM(%m_csr, %a, %b)
       : (tensor<?x?xf32, #CSR>,
         tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
-    // %1 = call @SDDMM_block(%m_bsr, %a, %b)
-    //   : (tensor<?x?xf32, #BSR>,
-    //     tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
+    %1 = call @SDDMM_block(%m_bsr, %a, %b)
+      : (tensor<?x?xf32, #BSR>,
+        tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
 
    //
    // Print the result for verification. Note that the "spy" determines what
@@ -170,18 +168,18 @@
    // in the original zero positions).
    //
    // CHECK: ( 5, 10, 24, 19, 53, 42, 55, 56 )
-   // C_HECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
+   // CHECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
    //
    %v0 = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
    %vv0 = vector.transfer_read %v0[%c0], %d0 : memref<?xf32>, vector<8xf32>
    vector.print %vv0 : vector<8xf32>
-   // %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
-   // %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
-   // vector.print %vv1 : vector<12xf32>
+   %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
+   %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
+   vector.print %vv1 : vector<12xf32>
 
    // Release the resources.
    bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
-   // bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
+   bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
 
    llvm.call @mgpuDestroySparseEnv() : () -> ()
    return
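For users of the full sparse compiler pipeline nothing changes at the surface: SparseCompilerOptions still exposes the libgen and runtime-library flags, which now flow into the sparsification-and-bufferization mini-pipeline shown above. A sketch of end-to-end construction (the driver function is hypothetical):

    #include "mlir/Dialect/SparseTensor/Pipelines/Passes.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical driver: enable GPU libgen at the pipeline level; the
    // option is consumed by SparsificationAndBufferizationPass, which runs
    // sparse-gpu-codegen with num-threads=0 before sparsification.
    static void buildWithLibgen(mlir::OpPassManager &pm) {
      mlir::sparse_tensor::SparseCompilerOptions options;
      options.enableGPULibgen = true;
      options.enableRuntimeLibrary = true;
      mlir::sparse_tensor::buildSparseCompiler(pm, options);
    }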