Skip to content

Commit

Permalink
[TEST] Use LABEL scopes in lit tests (triton-lang#3293)
Browse files Browse the repository at this point in the history
It's quite easy to unexpectedly use a match from a different CHECK-LABEL
in a lit test, which is likely not what the author intended.

Prevent this from happening by invoking FileCheck with variable scopes
enabled (and fix the failures detected this way). This also makes the
FileCheck usage in Triton consistent with upstream MLIR.
  • Loading branch information
gflegar authored and htyu committed Mar 20, 2024
1 parent 6232c9f commit e06a990
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 12 deletions.
6 changes: 3 additions & 3 deletions test/Conversion/triton_ops.mlir
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: triton-opt %s | FileCheck %s

// CHECK: #[[WMMA:.*]] = #triton_gpu.amd_wmma
// CHECK: #[[$WMMA:.*]] = #triton_gpu.amd_wmma

tt.func @cast_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_f32: f32, %scalar_i64: i64) {
// scalar -> scalar
Expand Down Expand Up @@ -241,14 +241,14 @@ module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ct
// CHECK-LABEL: wmma_layout
tt.func @wmma_layout(%0: tensor<16x16xf16, #blocked>) {
%1 = triton_gpu.convert_layout %0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #triton_gpu.amd_wmma<{warpsPerCTA = [1, 1]}>>
// CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #{{.+}}> -> tensor<16x16xf16, #[[WMMA]]>
// CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #{{.+}}> -> tensor<16x16xf16, #[[$WMMA]]>
tt.return
}

// CHECK-LABEL: wmma_dot_op_layout
tt.func @wmma_dot_op_layout(%0: tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>>) {
%1 = triton_gpu.convert_layout %0 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #triton_gpu.amd_wmma<{warpsPerCTA = [1, 1]}>}>>
// CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #{{.+}}}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[WMMA]]}>>
// CHECK: %{{.+}} = triton_gpu.convert_layout %{{.+}} : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #{{.+}}}>> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[$WMMA]]}>>
tt.return
}
}
14 changes: 7 additions & 7 deletions test/TritonGPU/accelerate-matmul.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-c

// -----

// CHECK-80: #[[MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 8], instrShape = [16, 8]}>
// CHECK-80: #[[$MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 8], instrShape = [16, 8]}>
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
Expand All @@ -60,12 +60,12 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
%arg2: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>) -> tensor<64x128xf32, #blocked1> {
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
%cst_1 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1>
// CHECK-80: tt.dot {{.*}} -> tensor<64x64xf32, #[[MMA]]>
// CHECK-80: tt.dot {{.*}} -> tensor<64x64xf32, #[[$MMA]]>
%d = tt.dot %arg0, %arg1, %cst_0 {allowTF32 = true, maxNumImpreciseAcc = 0 : i32} :
tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<64x64xf32, #blocked>
%t = arith.truncf %d : tensor<64x64xf32, #blocked> to tensor<64x64xf16, #blocked>
%c = triton_gpu.convert_layout %t : tensor<64x64xf16, #blocked> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
// CHECK-80: tt.dot {{.*}} -> tensor<64x128xf32, #[[MMA]]>
// CHECK-80: tt.dot {{.*}} -> tensor<64x128xf32, #[[$MMA]]>
%r = tt.dot %c, %arg2, %cst_1 {allowTF32 = true, maxNumImpreciseAcc = 0 : i32} :
tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<64x128xf32, #blocked1>
tt.return %r : tensor<64x128xf32, #blocked1>
Expand All @@ -74,7 +74,7 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c

// -----

// CHECK-80: #[[MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2], instrShape = [16, 8]}>
// CHECK-80: #[[$MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2], instrShape = [16, 8]}>
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
Expand All @@ -85,9 +85,9 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
%arg1: tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>>,
%arg2: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>) -> tensor<64x64xf32, #blocked> {
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
// CHECK-80: tt.fp_to_fp {{.*}} : tensor<64x128xf8E4M3B11FNUZ, #triton_gpu.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 4}>> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 4}>>
// CHECK-80: tt.fp_to_fp {{.*}} : tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 4}>> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 4}>>
// CHECK-80: tt.dot {{.*}} -> tensor<64x64xf32, #[[MMA]]>
// CHECK-80: tt.fp_to_fp {{.*}} : tensor<64x128xf8E4M3B11FNUZ, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$MMA]], kWidth = 4}>> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$MMA]], kWidth = 4}>>
// CHECK-80: tt.fp_to_fp {{.*}} : tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #[[$MMA]], kWidth = 4}>> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[$MMA]], kWidth = 4}>>
// CHECK-80: tt.dot {{.*}} -> tensor<64x64xf32, #[[$MMA]]>
%d = tt.dot %arg0, %arg1, %cst_0 {allowTF32 = true, maxNumImpreciseAcc = 0 : i32} :
tensor<64x128xf8E4M3B11FNUZ, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<64x64xf32, #blocked>
tt.return %d : tensor<64x64xf32, #blocked>
Expand Down
4 changes: 2 additions & 2 deletions test/TritonGPU/loop-pipeline.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -877,12 +877,12 @@ module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-c
} // end module

// -----
// CHECK: #[[SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
// CHECK: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
// CHECK-LABEL: tt.func @indirect_load_shared_layout
// CHECK: scf.for
// CHECK: %[[NEXT_BUFFER_1:.*]] = tt.addptr %{{.*}}, {{.*}}
// CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_1]]
// CHECK: %[[IND_BUFFER_0:.*]] = triton_gpu.memdesc_subview {{.*}} : !tt.memdesc<1x16xi64, #[[SHARED_LAYOUT]], mutable> -> !tt.memdesc<16xi64, #[[SHARED_LAYOUT]], mutable>
// CHECK: %[[IND_BUFFER_0:.*]] = triton_gpu.memdesc_subview {{.*}} : !tt.memdesc<1x16xi64, #[[$SHARED_LAYOUT]], mutable> -> !tt.memdesc<16xi64, #[[$SHARED_LAYOUT]], mutable>
// CHECK: %[[IND_BUFFER_1:.*]] = triton_gpu.local_load %[[IND_BUFFER_0]]
// CHECK: %[[IND_BUFFER_2:.*]] = tt.expand_dims %[[IND_BUFFER_1]] {axis = 1 : i32}
// CHECK: %[[IND_BUFFER_3:.*]] = tt.broadcast %[[IND_BUFFER_2]]
Expand Down
6 changes: 6 additions & 0 deletions test/lit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@
config.test_exec_root = os.path.join(config.triton_obj_root, 'test')
config.triton_tools_dir = os.path.join(config.triton_obj_root, 'bin')
config.filecheck_dir = os.path.join(config.triton_obj_root, 'bin', 'FileCheck')

# FileCheck's -enable-var-scope is enabled by default in MLIR tests.
# This option prevents accidentally reusing a variable across -LABEL matches;
# reuse can be explicitly opted into by prefixing the variable name with $.
config.environment["FILECHECK_OPTS"] = "--enable-var-scope"

tool_dirs = [config.triton_tools_dir, config.llvm_tools_dir, config.filecheck_dir]

# Tweak the PATH to include the tools dir.
Expand Down

0 comments on commit e06a990

Please sign in to comment.