Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add some initial optimizations for RISC-V #340

Merged
merged 1 commit into from
Feb 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,29 @@ jobs:
run: $CC -O2 -Wall -Werror lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip

other-arch-build-and-test:
name: Build and test (${{ matrix.arch }}, Debian Bullseye, ${{ matrix.compiler }})
name: Build and test (${{ matrix.arch }}, ${{ matrix.distro }}, ${{ matrix.compiler }})
strategy:
matrix:
arch: [armv6, armv7, aarch64, s390x, ppc64le]
compiler: [gcc, clang]
include:
- { arch: armv6, distro: bullseye, compiler: gcc }
- { arch: armv6, distro: bullseye, compiler: clang }
- { arch: armv7, distro: bullseye, compiler: gcc }
- { arch: armv7, distro: bullseye, compiler: clang }
- { arch: aarch64, distro: bullseye, compiler: gcc }
- { arch: aarch64, distro: bullseye, compiler: clang }
- { arch: s390x, distro: bullseye, compiler: gcc }
- { arch: s390x, distro: bullseye, compiler: clang }
- { arch: ppc64le, distro: bullseye, compiler: gcc }
- { arch: ppc64le, distro: bullseye, compiler: clang }
- { arch: riscv64, distro: ubuntu_latest, compiler: gcc }
- { arch: riscv64, distro: ubuntu_latest, compiler: clang }
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: uraimo/run-on-arch-action@v2.5.0
with:
arch: ${{ matrix.arch }}
distro: bullseye
distro: ${{ matrix.distro }}
githubToken: ${{ github.token }}
install: |
apt-get update
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ if(LIBDEFLATE_COMPRESSION_SUPPORT)
lib/hc_matchfinder.h
lib/ht_matchfinder.h
lib/matchfinder_common.h
lib/riscv/matchfinder_impl.h
lib/x86/matchfinder_impl.h
)
endif()
Expand Down
4 changes: 4 additions & 0 deletions common_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
#undef ARCH_X86_32
#undef ARCH_ARM64
#undef ARCH_ARM32
#undef ARCH_RISCV
#ifdef _MSC_VER
# if defined(_M_X64)
# define ARCH_X86_64
Expand All @@ -84,6 +85,8 @@
# define ARCH_ARM64
# elif defined(__arm__)
# define ARCH_ARM32
# elif defined(__riscv)
# define ARCH_RISCV
# endif
#endif

Expand Down Expand Up @@ -374,6 +377,7 @@ static forceinline u64 bswap64(u64 v)
#if (defined(__GNUC__) || defined(__clang__)) && \
(defined(ARCH_X86_64) || defined(ARCH_X86_32) || \
defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \
defined(__riscv_misaligned_fast) || \
/*
* For all compilation purposes, WebAssembly behaves like any other CPU
* instruction set. Even though WebAssembly engine might be running on
Expand Down
31 changes: 28 additions & 3 deletions lib/matchfinder_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,43 @@ typedef s16 mf_pos_t;
#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)

/*
* Required alignment of the matchfinder buffer pointer and size. The values
* here come from the AVX-2 implementation, which is the worst case.
* This is the memory address alignment, in bytes, required for the matchfinder
* buffers by the architecture-specific implementations of matchfinder_init()
* and matchfinder_rebase(). "Matchfinder buffer" means an entire struct
* hc_matchfinder, bt_matchfinder, or ht_matchfinder; the next_tab field of
* struct hc_matchfinder; or the child_tab field of struct bt_matchfinder.
*
* This affects how the entire 'struct deflate_compressor' is allocated, since
* the matchfinder structures are embedded inside it.
*
* Currently the maximum memory address alignment required is 32 bytes, needed
* by the AVX-2 matchfinder functions.
*/
#define MATCHFINDER_MEM_ALIGNMENT 32
#define MATCHFINDER_SIZE_ALIGNMENT 128

/*
* This declares a size, in bytes, that is guaranteed to divide the sizes of the
* matchfinder buffers (where "matchfinder buffers" is as defined for
* MATCHFINDER_MEM_ALIGNMENT). The architecture-specific implementations of
* matchfinder_init() and matchfinder_rebase() take advantage of this value.
*
* Currently the maximum size alignment required is 256 bytes, needed by
* the AVX-2 matchfinder functions. However, the RISC-V Vector Extension
* matchfinder functions can, in principle, take advantage of a larger size
* alignment. Therefore, we set this to 1024, which still easily divides the
* actual sizes that result from the current matchfinder struct definitions.
* This value can safely be changed to any power of two that is >= 256.
*/
#define MATCHFINDER_SIZE_ALIGNMENT 1024

#undef matchfinder_init
#undef matchfinder_rebase
#ifdef _aligned_attribute
# define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT)
# if defined(ARCH_ARM32) || defined(ARCH_ARM64)
# include "arm/matchfinder_impl.h"
# elif defined(ARCH_RISCV)
# include "riscv/matchfinder_impl.h"
# elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
# include "x86/matchfinder_impl.h"
# endif
Expand Down
97 changes: 97 additions & 0 deletions lib/riscv/matchfinder_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* riscv/matchfinder_impl.h - RISC-V implementations of matchfinder functions
*
* Copyright 2024 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef LIB_RISCV_MATCHFINDER_IMPL_H
#define LIB_RISCV_MATCHFINDER_IMPL_H

#if defined(ARCH_RISCV) && defined(__riscv_vector)
#include <riscv_vector.h>

/*
* Return the maximum number of 16-bit (mf_pos_t) elements that fit in 8 RISC-V
* vector registers and also evenly divide the sizes of the matchfinder buffers.
*/
static forceinline size_t
riscv_matchfinder_vl(void)
{
const size_t vl = __riscv_vsetvlmax_e16m8();

STATIC_ASSERT(sizeof(mf_pos_t) == sizeof(s16));
/*
* MATCHFINDER_SIZE_ALIGNMENT is a power of 2, as is 'vl' because the
* RISC-V Vector Extension requires that the vector register length
* (VLEN) be a power of 2. Thus, a simple MIN() gives the correct
* answer here; rounding to a power of 2 is not required.
*/
STATIC_ASSERT((MATCHFINDER_SIZE_ALIGNMENT &
(MATCHFINDER_SIZE_ALIGNMENT - 1)) == 0);
ASSERT((vl & (vl - 1)) == 0);
return MIN(vl, MATCHFINDER_SIZE_ALIGNMENT / sizeof(mf_pos_t));
}

/* matchfinder_init() optimized using the RISC-V Vector Extension */
static forceinline void
matchfinder_init_rvv(mf_pos_t *p, size_t size)
{
const size_t vl = riscv_matchfinder_vl();
const vint16m8_t v = __riscv_vmv_v_x_i16m8(MATCHFINDER_INITVAL, vl);

ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0);
do {
__riscv_vse16_v_i16m8(p, v, vl);
p += vl;
size -= vl * sizeof(p[0]);
} while (size != 0);
}
#define matchfinder_init matchfinder_init_rvv

/* matchfinder_rebase() optimized using the RISC-V Vector Extension */
static forceinline void
matchfinder_rebase_rvv(mf_pos_t *p, size_t size)
{
const size_t vl = riscv_matchfinder_vl();

ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0);
do {
vint16m8_t v = __riscv_vle16_v_i16m8(p, vl);

/*
* This should generate the vsadd.vx instruction
* (Vector Saturating Add, integer vector-scalar)
*/
v = __riscv_vsadd_vx_i16m8(v, (s16)-MATCHFINDER_WINDOW_SIZE,
vl);
__riscv_vse16_v_i16m8(p, v, vl);
p += vl;
size -= vl * sizeof(p[0]);
} while (size != 0);
}
#define matchfinder_rebase matchfinder_rebase_rvv

#endif /* ARCH_RISCV && __riscv_vector */

#endif /* LIB_RISCV_MATCHFINDER_IMPL_H */
Loading