VectorCamp · markos · Jan 21, 2022 · Jan 25, 2021 · Sep 23, 2021 · Sep 23, 2021
diff --git a/CMakeLists.txt b/CMakeLists.txt
diff --git a/Jenkinsfile b/Jenkinsfile
diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp
@@ -191,6 +191,34 @@ int main(){
             );
         }
 
+        for (size_t i = 0; i < std::size(sizes); i++) {
+            MicroBenchmark bench("Vermicelli", sizes[i]);
+            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+                [&](MicroBenchmark &b) {
+                    b.chars.set('a');
+                    ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    memset(b.buf.data(), 'b', b.size);
+                },
+                [&](MicroBenchmark &b) {
+                    return vermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
+                }
+            );
+        }
+
+        for (size_t i = 0; i < std::size(sizes); i++) {
+            MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
+            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+                [&](MicroBenchmark &b) {
+                    b.chars.set('a');
+                    ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    memset(b.buf.data(), 'b', b.size);
+                },
+                [&](MicroBenchmark &b) {
+                    return rvermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
+                }
+            );
+        }
+
         for (size_t i = 0; i < std::size(sizes); i++) {
             //we imitate the noodle unit tests
             std::string str;

diff --git a/benchmarks/benchmarks.hpp b/benchmarks/benchmarks.hpp
@@ -30,6 +30,7 @@
 #include "nfa/shufticompile.h"
 #include "nfa/truffle.h"
 #include "nfa/trufflecompile.h"
+#include "nfa/vermicelli.hpp"
 #include "hwlm/noodle_build.h"
 #include "hwlm/noodle_engine.h"
 #include "hwlm/noodle_internal.h"

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
@@ -9,6 +9,9 @@ elseif (HAVE_C_INTRIN_H)
 elseif (HAVE_C_ARM_NEON_H)
     set (INTRIN_INC_H "arm_neon.h")
     set (FAT_RUNTIME OFF)
+elseif (HAVE_C_PPC64EL_ALTIVEC_H)
+    set (INTRIN_INC_H "altivec.h")
+    set (FAT_RUNTIME OFF)
 else()
     message (FATAL_ERROR "No intrinsics header found")
 endif ()
@@ -85,7 +88,7 @@ if (FAT_RUNTIME)
             set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
         endif (BUILD_AVX512VBMI)
     elseif (BUILD_AVX2)
-        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx")
+        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx2")
     elseif ()
         set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-i7 -mssse3")
     endif ()
@@ -95,12 +98,12 @@ else (NOT FAT_RUNTIME)
 endif ()
 
 if (ARCH_IA32 OR ARCH_X86_64)
-    # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
+    # ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 int main() {
     __m128i a = _mm_set1_epi8(1);
     (void)_mm_shuffle_epi8(a, a);
-}" HAVE_SSSE3)
+}" HAVE_SSE42)
 
     # now look for AVX2
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
@@ -136,13 +139,26 @@ int main(){
     (void)_mm512_permutexvar_epi8(idx, a);
 }" HAVE_AVX512VBMI)
 
-elseif (!ARCH_ARM32 AND !ARCH_AARCH64)
+
+elseif (ARCH_ARM32 OR ARCH_AARCH64)
+    CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+int main() {
+    int32x4_t a = vdupq_n_s32(1);
+    (void)a;
+}" HAVE_NEON)
+elseif (ARCH_PPC64EL)
+    CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+int main() {
+    vector int a = vec_splat_s32(1);
+    (void)a;
+}" HAVE_VSX)
+else ()
     message (FATAL_ERROR "Unsupported architecture")
 endif ()
 
 if (FAT_RUNTIME)
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "SSSE3 support required to build fat runtime")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "SSE4.2 support required to build fat runtime")
     endif ()
     if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX2 AND NOT HAVE_AVX2)
         message(FATAL_ERROR "AVX2 support required to build fat runtime")
@@ -163,12 +179,16 @@ else (NOT FAT_RUNTIME)
     if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI)
         message(STATUS "Building without AVX512VBMI support")
     endif ()
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
     endif ()
     if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
         message(FATAL_ERROR "NEON support required for ARM support")
     endif ()
+    if (ARCH_PPC64EL AND NOT HAVE_VSX)
+        message(FATAL_ERROR "VSX support required for Power support")
+    endif ()
+
 endif ()
 
 unset (PREV_FLAGS)

diff --git a/cmake/config.h.in b/cmake/config.h.in
@@ -21,6 +21,9 @@
 /* "Define if building for AARCH64" */
 #cmakedefine ARCH_AARCH64
 
+/* "Define if building for PPC64EL" */
+#cmakedefine ARCH_PPC64EL
+
 /* "Define if cross compiling for AARCH64" */
 #cmakedefine CROSS_COMPILE_AARCH64
 
@@ -75,6 +78,9 @@
 /* C compiler has arm_sve.h */
 #cmakedefine HAVE_C_ARM_SVE_H
 
+/* C compiler has altivec.h */
+#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H
+
 /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
    0 if you don't. */
 #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP

diff --git a/cmake/platform.cmake b/cmake/platform.cmake
@@ -1,3 +1,8 @@
+# determine compiler
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(CMAKE_COMPILER_IS_CLANG TRUE)
+endif()
+
 # determine the target arch
 
 if (CROSS_COMPILE_AARCH64)
@@ -7,15 +12,13 @@ if (CROSS_COMPILE_AARCH64)
 else()
   # really only interested in the preprocessor here
   CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
-
   CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
-
   CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
   CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
-
-  if (ARCH_X86_64 OR ARCH_AARCH64)
+  CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
+  if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
     set(ARCH_64_BIT TRUE)
   else()
     set(ARCH_32_BIT TRUE)
   endif()
-endif()
+endif()
diff --git a/examples/patbench.cc b/examples/patbench.cc
@@ -112,6 +112,7 @@
  *
  */
 
+#include <random>
 #include <algorithm>
 #include <cstring>
 #include <chrono>
@@ -151,6 +152,8 @@ using std::set;
 using std::min;
 using std::max;
 using std::copy;
+using std::random_device;
+using std::mt19937;
 
 enum Criterion {
     CRITERION_THROUGHPUT,
@@ -731,7 +734,9 @@ int main(int argc, char **argv) {
             count++;
             cout << "." << std::flush;
             vector<unsigned> sv(s.begin(), s.end());
-            random_shuffle(sv.begin(), sv.end());
+            random_device rng;
+            mt19937 urng(rng());
+            shuffle(sv.begin(), sv.end(), urng);
             unsigned groups = factor_max + 1;
             for (unsigned current_group = 0; current_group < groups;
                  current_group++) {

diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c
@@ -893,10 +893,10 @@ do {                                                                          \
 #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
 do {                                                                        \
     if (unlikely(diff128(var, ones128()))) {                                \
-        u64a __attribute__((aligned(16))) vector[2];                        \
-        store128(vector, var);                                              \
-        u64a lo = vector[0];                                                \
-        u64a hi = vector[1];                                                \
+        u64a __attribute__((aligned(16))) vec[2];                           \
+        store128(vec, var);                                                 \
+        u64a lo = vec[0];                                                   \
+        u64a hi = vec[1];                                                   \
         CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn);                 \
         CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn);             \
     }                                                                       \

diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c
@@ -44,5 +44,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) {
     }
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     return HS_SUCCESS;
+#elif defined(ARCH_PPC64EL)
+    return HS_SUCCESS;    
 #endif
 }
diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c
@@ -39,7 +39,7 @@
 #include "nfa/accel.h"
 #include "nfa/shufti.h"
 #include "nfa/truffle.h"
-#include "nfa/vermicelli.h"
+#include "nfa/vermicelli.hpp"
 #include <string.h>
 
 #define MIN_ACCEL_LEN_BLOCK  16

diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp
@@ -30,26 +30,7 @@
 /* SIMD engine agnostic noodle scan parts */
 
 #include "util/supervector/supervector.hpp"
-
-static u8 CASEMASK[] = { 0xff, 0xdf };
-
-static really_inline
-u8 caseClear8(u8 x, bool noCase)
-{
-    return static_cast<u8>(x & CASEMASK[(u8)noCase]);
-}
-
-template<uint16_t S>
-static really_inline SuperVector<S> getMask(u8 c, bool noCase) {
-    u8 k = caseClear8(c, noCase);
-    return SuperVector<S>(k);
-}
-
-template<uint16_t S>
-static really_inline SuperVector<S> getCaseMask(void) {
-    return SuperVector<S>(CASEMASK[1]);
-}
-
+#include "util/supervector/casemask.hpp"
 
 static really_really_inline
 hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,

diff --git a/src/hwlm/noodle_engine_sve.hpp b/src/hwlm/noodle_engine_sve.hpp
@@ -170,7 +170,7 @@ hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf,
     svbool_t pg = svwhilelt_b8_s64(0, e - d);
     svbool_t pg_rot = svwhilelt_b8_s64(0, e - d + 1);
     svbool_t matched, matched_rot;
-    svbool_t any = doubleMatched(chars, d, pg, pg_rot, &matched, &matched_rot);
+    svbool_t any = doubleMatched(svreinterpret_u16(chars), d, pg, pg_rot, &matched, &matched_rot);
     return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
 }
 
@@ -187,7 +187,7 @@ hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf,
     for (size_t i = 0; i < loops; i++, d += svcntb()) {
         DEBUG_PRINTF("d %p \n", d);
         svbool_t matched, matched_rot;
-        svbool_t any = doubleMatched(chars, d, svptrue_b8(), svptrue_b8(),
+        svbool_t any = doubleMatched(svreinterpret_u16(chars), d, svptrue_b8(), svptrue_b8(),
                                      &matched, &matched_rot);
         hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
                                              matched, matched_rot, any);
@@ -220,7 +220,7 @@ hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
     }
     ++d;
 
-    svuint16_t chars = getCharMaskDouble(n->key0, n->key1, noCase);
+    svuint8_t chars = svreinterpret_u8(getCharMaskDouble(n->key0, n->key1, noCase));
 
     if (scan_len <= svcntb()) {
         return scanDoubleOnce(n, buf, len, cbi, chars, d, e);
@@ -234,4 +234,4 @@ hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
         RETURN_IF_TERMINATED(rv);
     }
     return scanDoubleLoop(n, buf, len, cbi, chars, d1, e);
-}
+}
diff --git a/src/nfa/accel.c b/src/nfa/accel.c
@@ -30,7 +30,7 @@
 #include "accel.h"
 #include "shufti.h"
 #include "truffle.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "ue2common.h"
 
 const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {

diff --git a/src/nfa/arm/shufti.hpp b/src/nfa/arm/shufti.hpp
@@ -1,7 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
  * Copyright (c) 2020-2021, VectorCamp PC
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,7 +29,6 @@
 
 /** \file
  * \brief Shufti: character class acceleration.
- *
  */
 
 template <uint16_t S>
@@ -73,4 +71,4 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
     t.print8("t");
 
     return !t.eq(SuperVector<S>::Ones());
-}
+}