taichi-dev · turbo0628 · May 19, 2022 · May 6, 2022 · May 6, 2022 · May 6, 2022
diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py
@@ -97,9 +97,11 @@ def shfl_xor_i32(mask, val, offset):
                               with_runtime_context=False)
 
 
-def match_any():
-    # TODO
-    pass
+def match_any(mask, value):
+    return impl.call_internal("cuda_match_any_sync_i32",
+                              mask,
+                              value,
+                              with_runtime_context=False)
 
 
 def match_all():

diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
@@ -1084,11 +1084,11 @@ int32 cuda_ballot_sync_i32(u32 mask, int32 predicate) {
   return cuda_ballot_sync(mask, (bool)predicate);
 }
 
-i32 cuda_match_any_sync_i32(i32 mask, i32 value) {
+uint32 cuda_match_any_sync_i32(u32 mask, i32 value) {
   return 0;
 }
 
-i32 cuda_match_any_sync_i64(i32 mask, i64 value) {
+uint32 cuda_match_any_sync_i64(u32 mask, i64 value) {
 #if ARCH_cuda
   u32 ret;
   asm volatile("match.any.sync.b64  %0, %1, %2;"

diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py
@@ -269,8 +269,25 @@ def foo():
 
 @test_utils.test(arch=ti.cuda)
 def test_match_any():
-    # TODO
-    pass
+    a = ti.field(dtype=ti.i32, shape=32)
+    b = ti.field(dtype=ti.u32, shape=32)
+
+    @ti.kernel
+    def foo():
+        ti.loop_config(block_dim=32)
+        for i in range(16):
+            a[i] = 0
+            a[i + 16] = 1
+
+        for i in range(32):
+            b[i] = ti.simt.warp.match_any(ti.u32(0xFFFFFFFF), a[i])
+
+    foo()
+
+    for i in range(16):
+        assert b[i] == 65535
+    for i in range(16):
+        assert b[i + 16] == (2**32 - 2**16)
 
 
 @test_utils.test(arch=ti.cuda)