From b46a7281fe50a95ead216025ef184c3593d80c6b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 17 Feb 2024 14:05:20 -0800 Subject: [PATCH] Add some initial optimizations for RISC-V --- .github/workflows/ci.yml | 19 +++++-- CMakeLists.txt | 1 + common_defs.h | 4 ++ lib/matchfinder_common.h | 31 ++++++++++-- lib/riscv/matchfinder_impl.h | 97 ++++++++++++++++++++++++++++++++++++ 5 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 lib/riscv/matchfinder_impl.h diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f26572e2..f758e97a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,18 +22,29 @@ jobs: run: $CC -O2 -Wall -Werror lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip other-arch-build-and-test: - name: Build and test (${{ matrix.arch }}, Debian Bullseye, ${{ matrix.compiler }}) + name: Build and test (${{ matrix.arch }}, ${{ matrix.distro }}, ${{ matrix.compiler }}) strategy: matrix: - arch: [armv6, armv7, aarch64, s390x, ppc64le] - compiler: [gcc, clang] + include: + - { arch: armv6, distro: bullseye, compiler: gcc } + - { arch: armv6, distro: bullseye, compiler: clang } + - { arch: armv7, distro: bullseye, compiler: gcc } + - { arch: armv7, distro: bullseye, compiler: clang } + - { arch: aarch64, distro: bullseye, compiler: gcc } + - { arch: aarch64, distro: bullseye, compiler: clang } + - { arch: s390x, distro: bullseye, compiler: gcc } + - { arch: s390x, distro: bullseye, compiler: clang } + - { arch: ppc64le, distro: bullseye, compiler: gcc } + - { arch: ppc64le, distro: bullseye, compiler: clang } + - { arch: riscv64, distro: ubuntu_latest, compiler: gcc } + - { arch: riscv64, distro: ubuntu_latest, compiler: clang } runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: uraimo/run-on-arch-action@v2.5.0 with: arch: ${{ matrix.arch }} - distro: bullseye + distro: ${{ matrix.distro }} githubToken: ${{ github.token }} install: | apt-get update diff --git a/CMakeLists.txt b/CMakeLists.txt index eaf559a9..917fefb0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,7 @@ if(LIBDEFLATE_COMPRESSION_SUPPORT) lib/hc_matchfinder.h lib/ht_matchfinder.h lib/matchfinder_common.h + lib/riscv/matchfinder_impl.h lib/x86/matchfinder_impl.h ) endif() diff --git a/common_defs.h b/common_defs.h index e1bc3fe0..0a155371 100644 --- a/common_defs.h +++ b/common_defs.h @@ -65,6 +65,7 @@ #undef ARCH_X86_32 #undef ARCH_ARM64 #undef ARCH_ARM32 +#undef ARCH_RISCV #ifdef _MSC_VER # if defined(_M_X64) # define ARCH_X86_64 @@ -84,6 +85,8 @@ # define ARCH_ARM64 # elif defined(__arm__) # define ARCH_ARM32 +# elif defined(__riscv) +# define ARCH_RISCV # endif #endif @@ -374,6 +377,7 @@ static forceinline u64 bswap64(u64 v) #if (defined(__GNUC__) || defined(__clang__)) && \ (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ + defined(__riscv_misaligned_fast) || \ /* * For all compilation purposes, WebAssembly behaves like any other CPU * instruction set. Even though WebAssembly engine might be running on diff --git a/lib/matchfinder_common.h b/lib/matchfinder_common.h index 48a243e1..dbae9960 100644 --- a/lib/matchfinder_common.h +++ b/lib/matchfinder_common.h @@ -51,11 +51,34 @@ typedef s16 mf_pos_t; #define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) /* - * Required alignment of the matchfinder buffer pointer and size. The values - * here come from the AVX-2 implementation, which is the worst case. + * This is the memory address alignment, in bytes, required for the matchfinder + * buffers by the architecture-specific implementations of matchfinder_init() + * and matchfinder_rebase(). "Matchfinder buffer" means an entire struct + * hc_matchfinder, bt_matchfinder, or ht_matchfinder; the next_tab field of + * struct hc_matchfinder; or the child_tab field of struct bt_matchfinder. + * + * This affects how the entire 'struct deflate_compressor' is allocated, since + * the matchfinder structures are embedded inside it. + * + * Currently the maximum memory address alignment required is 32 bytes, needed + * by the AVX-2 matchfinder functions. */ #define MATCHFINDER_MEM_ALIGNMENT 32 -#define MATCHFINDER_SIZE_ALIGNMENT 128 + +/* + * This declares a size, in bytes, that is guaranteed to divide the sizes of the + * matchfinder buffers (where "matchfinder buffers" is as defined for + * MATCHFINDER_MEM_ALIGNMENT). The architecture-specific implementations of + * matchfinder_init() and matchfinder_rebase() take advantage of this value. + * + * Currently the maximum size alignment required is 256 bytes, needed by + * the AVX-2 matchfinder functions. However, the RISC-V Vector Extension + * matchfinder functions can, in principle, take advantage of a larger size + * alignment. Therefore, we set this to 1024, which still easily divides the + * actual sizes that result from the current matchfinder struct definitions. + * This value can safely be changed to any power of two that is >= 256. + */ +#define MATCHFINDER_SIZE_ALIGNMENT 1024 #undef matchfinder_init #undef matchfinder_rebase @@ -63,6 +86,8 @@ typedef s16 mf_pos_t; # define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) # if defined(ARCH_ARM32) || defined(ARCH_ARM64) # include "arm/matchfinder_impl.h" +# elif defined(ARCH_RISCV) +# include "riscv/matchfinder_impl.h" # elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/matchfinder_impl.h" # endif diff --git a/lib/riscv/matchfinder_impl.h b/lib/riscv/matchfinder_impl.h new file mode 100644 index 00000000..6d8bf793 --- /dev/null +++ b/lib/riscv/matchfinder_impl.h @@ -0,0 +1,97 @@ +/* + * riscv/matchfinder_impl.h - RISC-V implementations of matchfinder functions + * + * Copyright 2024 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_RISCV_MATCHFINDER_IMPL_H +#define LIB_RISCV_MATCHFINDER_IMPL_H + +#if defined(ARCH_RISCV) && defined(__riscv_vector) +#include + +/* + * Return the maximum number of 16-bit (mf_pos_t) elements that fit in 8 RISC-V + * vector registers and also evenly divide the sizes of the matchfinder buffers. + */ +static forceinline size_t +riscv_matchfinder_vl(void) +{ + const size_t vl = __riscv_vsetvlmax_e16m8(); + + STATIC_ASSERT(sizeof(mf_pos_t) == sizeof(s16)); + /* + * MATCHFINDER_SIZE_ALIGNMENT is a power of 2, as is 'vl' because the + * RISC-V Vector Extension requires that the vector register length + * (VLEN) be a power of 2. Thus, a simple MIN() gives the correct + * answer here; rounding to a power of 2 is not required. + */ + STATIC_ASSERT((MATCHFINDER_SIZE_ALIGNMENT & + (MATCHFINDER_SIZE_ALIGNMENT - 1)) == 0); + ASSERT((vl & (vl - 1)) == 0); + return MIN(vl, MATCHFINDER_SIZE_ALIGNMENT / sizeof(mf_pos_t)); +} + +/* matchfinder_init() optimized using the RISC-V Vector Extension */ +static forceinline void +matchfinder_init_rvv(mf_pos_t *p, size_t size) +{ + const size_t vl = riscv_matchfinder_vl(); + const vint16m8_t v = __riscv_vmv_v_x_i16m8(MATCHFINDER_INITVAL, vl); + + ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); + do { + __riscv_vse16_v_i16m8(p, v, vl); + p += vl; + size -= vl * sizeof(p[0]); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_rvv + +/* matchfinder_rebase() optimized using the RISC-V Vector Extension */ +static forceinline void +matchfinder_rebase_rvv(mf_pos_t *p, size_t size) +{ + const size_t vl = riscv_matchfinder_vl(); + + ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); + do { + vint16m8_t v = __riscv_vle16_v_i16m8(p, vl); + + /* + * This should generate the vsadd.vx instruction + * (Vector Saturating Add, integer vector-scalar) + */ + v = __riscv_vsadd_vx_i16m8(v, (s16)-MATCHFINDER_WINDOW_SIZE, + vl); + __riscv_vse16_v_i16m8(p, v, vl); + p += vl; + size -= vl * sizeof(p[0]); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_rvv + +#endif /* ARCH_RISCV && __riscv_vector */ + +#endif /* LIB_RISCV_MATCHFINDER_IMPL_H */