From b46a7281fe50a95ead216025ef184c3593d80c6b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 17 Feb 2024 14:05:20 -0800
Subject: [PATCH] Add some initial optimizations for RISC-V

---
 .github/workflows/ci.yml     | 19 +++++--
 CMakeLists.txt               |  1 +
 common_defs.h                |  4 ++
 lib/matchfinder_common.h     | 31 ++++++++++--
 lib/riscv/matchfinder_impl.h | 97 ++++++++++++++++++++++++++++++++++++
 5 files changed, 145 insertions(+), 7 deletions(-)
 create mode 100644 lib/riscv/matchfinder_impl.h

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f26572e2..f758e97a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,18 +22,29 @@ jobs:
       run: $CC -O2 -Wall -Werror lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip
 
   other-arch-build-and-test:
-    name: Build and test (${{ matrix.arch }}, Debian Bullseye, ${{ matrix.compiler }})
+    name: Build and test (${{ matrix.arch }}, ${{ matrix.distro }}, ${{ matrix.compiler }})
     strategy:
       matrix:
-        arch: [armv6, armv7, aarch64, s390x, ppc64le]
-        compiler: [gcc, clang]
+        include:
+        - { arch: armv6, distro: bullseye, compiler: gcc }
+        - { arch: armv6, distro: bullseye, compiler: clang }
+        - { arch: armv7, distro: bullseye, compiler: gcc }
+        - { arch: armv7, distro: bullseye, compiler: clang }
+        - { arch: aarch64, distro: bullseye, compiler: gcc }
+        - { arch: aarch64, distro: bullseye, compiler: clang }
+        - { arch: s390x, distro: bullseye, compiler: gcc }
+        - { arch: s390x, distro: bullseye, compiler: clang }
+        - { arch: ppc64le, distro: bullseye, compiler: gcc }
+        - { arch: ppc64le, distro: bullseye, compiler: clang }
+        - { arch: riscv64, distro: ubuntu_latest, compiler: gcc }
+        - { arch: riscv64, distro: ubuntu_latest, compiler: clang }
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: uraimo/run-on-arch-action@v2.5.0
         with:
           arch: ${{ matrix.arch }}
-          distro: bullseye
+          distro: ${{ matrix.distro }}
           githubToken: ${{ github.token }}
           install: |
             apt-get update
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eaf559a9..917fefb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,6 +114,7 @@ if(LIBDEFLATE_COMPRESSION_SUPPORT)
          lib/hc_matchfinder.h
          lib/ht_matchfinder.h
          lib/matchfinder_common.h
+         lib/riscv/matchfinder_impl.h
          lib/x86/matchfinder_impl.h
     )
 endif()
diff --git a/common_defs.h b/common_defs.h
index e1bc3fe0..0a155371 100644
--- a/common_defs.h
+++ b/common_defs.h
@@ -65,6 +65,7 @@
 #undef ARCH_X86_32
 #undef ARCH_ARM64
 #undef ARCH_ARM32
+#undef ARCH_RISCV
 #ifdef _MSC_VER
 #  if defined(_M_X64)
 #    define ARCH_X86_64
@@ -84,6 +85,8 @@
 #    define ARCH_ARM64
 #  elif defined(__arm__)
 #    define ARCH_ARM32
+#  elif defined(__riscv)
+#    define ARCH_RISCV
 #  endif
 #endif
 
@@ -374,6 +377,7 @@ static forceinline u64 bswap64(u64 v)
 #if (defined(__GNUC__) || defined(__clang__)) && \
 	(defined(ARCH_X86_64) || defined(ARCH_X86_32) || \
 	 defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \
+	 defined(__riscv_misaligned_fast) || \
 	 /*
 	  * For all compilation purposes, WebAssembly behaves like any other CPU
 	  * instruction set. Even though WebAssembly engine might be running on
diff --git a/lib/matchfinder_common.h b/lib/matchfinder_common.h
index 48a243e1..dbae9960 100644
--- a/lib/matchfinder_common.h
+++ b/lib/matchfinder_common.h
@@ -51,11 +51,34 @@ typedef s16 mf_pos_t;
 #define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
 
 /*
- * Required alignment of the matchfinder buffer pointer and size.  The values
- * here come from the AVX-2 implementation, which is the worst case.
+ * This is the memory address alignment, in bytes, required for the matchfinder
+ * buffers by the architecture-specific implementations of matchfinder_init()
+ * and matchfinder_rebase().  "Matchfinder buffer" means an entire struct
+ * hc_matchfinder, bt_matchfinder, or ht_matchfinder; the next_tab field of
+ * struct hc_matchfinder; or the child_tab field of struct bt_matchfinder.
+ *
+ * This affects how the entire 'struct deflate_compressor' is allocated, since
+ * the matchfinder structures are embedded inside it.
+ *
+ * Currently the maximum memory address alignment required is 32 bytes, needed
+ * by the AVX-2 matchfinder functions.
  */
 #define MATCHFINDER_MEM_ALIGNMENT	32
-#define MATCHFINDER_SIZE_ALIGNMENT	128
+
+/*
+ * This declares a size, in bytes, that is guaranteed to divide the sizes of the
+ * matchfinder buffers (where "matchfinder buffers" is as defined for
+ * MATCHFINDER_MEM_ALIGNMENT).  The architecture-specific implementations of
+ * matchfinder_init() and matchfinder_rebase() take advantage of this value.
+ *
+ * Currently the maximum size alignment required is 256 bytes, needed by
+ * the AVX-2 matchfinder functions.  However, the RISC-V Vector Extension
+ * matchfinder functions can, in principle, take advantage of a larger size
+ * alignment.  Therefore, we set this to 1024, which still easily divides the
+ * actual sizes that result from the current matchfinder struct definitions.
+ * This value can safely be changed to any power of two that is >= 256.
+ */
+#define MATCHFINDER_SIZE_ALIGNMENT	1024
 
 #undef matchfinder_init
 #undef matchfinder_rebase
@@ -63,6 +86,8 @@ typedef s16 mf_pos_t;
 #  define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT)
 #  if defined(ARCH_ARM32) || defined(ARCH_ARM64)
 #    include "arm/matchfinder_impl.h"
+#  elif defined(ARCH_RISCV)
+#    include "riscv/matchfinder_impl.h"
 #  elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
 #    include "x86/matchfinder_impl.h"
 #  endif
diff --git a/lib/riscv/matchfinder_impl.h b/lib/riscv/matchfinder_impl.h
new file mode 100644
index 00000000..6d8bf793
--- /dev/null
+++ b/lib/riscv/matchfinder_impl.h
@@ -0,0 +1,97 @@
+/*
+ * riscv/matchfinder_impl.h - RISC-V implementations of matchfinder functions
+ *
+ * Copyright 2024 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_RISCV_MATCHFINDER_IMPL_H
+#define LIB_RISCV_MATCHFINDER_IMPL_H
+
+#if defined(ARCH_RISCV) && defined(__riscv_vector)
+#include <riscv_vector.h>
+
+/*
+ * Return the maximum number of 16-bit (mf_pos_t) elements that fit in 8 RISC-V
+ * vector registers and also evenly divide the sizes of the matchfinder buffers.
+ */
+static forceinline size_t
+riscv_matchfinder_vl(void)
+{
+	const size_t vl = __riscv_vsetvlmax_e16m8();
+
+	STATIC_ASSERT(sizeof(mf_pos_t) == sizeof(s16));
+	/*
+	 * MATCHFINDER_SIZE_ALIGNMENT is a power of 2, as is 'vl' because the
+	 * RISC-V Vector Extension requires that the vector register length
+	 * (VLEN) be a power of 2.  Thus, a simple MIN() gives the correct
+	 * answer here; rounding to a power of 2 is not required.
+	 */
+	STATIC_ASSERT((MATCHFINDER_SIZE_ALIGNMENT &
+		       (MATCHFINDER_SIZE_ALIGNMENT - 1)) == 0);
+	ASSERT((vl & (vl - 1)) == 0);
+	return MIN(vl, MATCHFINDER_SIZE_ALIGNMENT / sizeof(mf_pos_t));
+}
+
+/* matchfinder_init() optimized using the RISC-V Vector Extension */
+static forceinline void
+matchfinder_init_rvv(mf_pos_t *p, size_t size)
+{
+	const size_t vl = riscv_matchfinder_vl();
+	const vint16m8_t v = __riscv_vmv_v_x_i16m8(MATCHFINDER_INITVAL, vl);
+
+	ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0);
+	do {
+		__riscv_vse16_v_i16m8(p, v, vl);
+		p += vl;
+		size -= vl * sizeof(p[0]);
+	} while (size != 0);
+}
+#define matchfinder_init matchfinder_init_rvv
+
+/* matchfinder_rebase() optimized using the RISC-V Vector Extension */
+static forceinline void
+matchfinder_rebase_rvv(mf_pos_t *p, size_t size)
+{
+	const size_t vl = riscv_matchfinder_vl();
+
+	ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0);
+	do {
+		vint16m8_t v = __riscv_vle16_v_i16m8(p, vl);
+
+		/*
+		 * This should generate the vsadd.vx instruction
+		 * (Vector Saturating Add, integer vector-scalar)
+		 */
+		v = __riscv_vsadd_vx_i16m8(v, (s16)-MATCHFINDER_WINDOW_SIZE,
+					   vl);
+		__riscv_vse16_v_i16m8(p, v, vl);
+		p += vl;
+		size -= vl * sizeof(p[0]);
+	} while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_rvv
+
+#endif /* ARCH_RISCV && __riscv_vector */
+
+#endif /* LIB_RISCV_MATCHFINDER_IMPL_H */