Skip to content

Commit

Permalink
lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32
Browse files Browse the repository at this point in the history
This improves CRC-32 performance on some of the latest x86 CPUs.  Three
implementations are added: VPCLMULQDQ/AVX2 and VPCLMULQDQ/AVX512VL which
use 256-bit vectors, and VPCLMULQDQ/AVX512F/AVX512VL which uses 512-bit
vectors.  To reduce downclocking effects, the implementation with
512-bit vectors isn't used on Intel CPUs 10th generation and older.
  • Loading branch information
ebiggers committed Feb 19, 2024
1 parent b46a728 commit 5f2a0b4
Show file tree
Hide file tree
Showing 11 changed files with 637 additions and 258 deletions.
8 changes: 4 additions & 4 deletions lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -474,12 +474,12 @@ static u32 ATTRIBUTES MAYBE_UNUSED
crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{
static const u64 _aligned_attribute(16) mults[3][2] = {
CRC32_1VECS_MULTS,
CRC32_4VECS_MULTS,
CRC32_2VECS_MULTS,
{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
};
static const u64 _aligned_attribute(16) final_mults[3][2] = {
{ CRC32_FINAL_MULT, 0 },
{ CRC32_X63_MODG, 0 },
{ CRC32_BARRETT_CONSTANT_1, 0 },
{ CRC32_BARRETT_CONSTANT_2, 0 },
};
Expand Down
10 changes: 7 additions & 3 deletions lib/arm/crc32_pmull_wide.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)

if (len < 3 * 192) {
static const u64 _aligned_attribute(16) mults[3][2] = {
CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS,
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
};
poly64x2_t multipliers_4, multipliers_2, multipliers_1;

Expand Down Expand Up @@ -97,8 +99,10 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
v0 = fold_vec(v0, v1, multipliers_1);
} else {
static const u64 _aligned_attribute(16) mults[4][2] = {
CRC32_12VECS_MULTS, CRC32_6VECS_MULTS,
CRC32_3VECS_MULTS, CRC32_1VECS_MULTS,
{ CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
{ CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */
{ CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */
{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
};
const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
Expand Down
2 changes: 1 addition & 1 deletion lib/crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@
* intermediate remainder (which we never actually store explicitly) is 96 bits.
*
* On CPUs that support fast carryless multiplication, CRCs can be computed even
* more quickly via "folding". See e.g. the x86 PCLMUL implementation.
* more quickly via "folding". See e.g. the x86 PCLMUL implementations.
*/

#include "lib_common.h"
Expand Down
122 changes: 85 additions & 37 deletions lib/crc32_multipliers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,55 +4,103 @@
* THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.
*/

#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */
#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */
#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 }
#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */
#define CRC32_X95_MODG 0xccaa009e /* x^95 mod G(x) */

#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */
#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */
#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 }
#define CRC32_X287_MODG 0xf1da05aa /* x^287 mod G(x) */
#define CRC32_X223_MODG 0x81256527 /* x^223 mod G(x) */

#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */
#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */
#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 }
#define CRC32_X415_MODG 0x3db1ecdc /* x^415 mod G(x) */
#define CRC32_X351_MODG 0xaf449247 /* x^351 mod G(x) */

#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */
#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */
#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 }
#define CRC32_X543_MODG 0x8f352d95 /* x^543 mod G(x) */
#define CRC32_X479_MODG 0x1d9513d7 /* x^479 mod G(x) */

#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */
#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */
#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 }
#define CRC32_X671_MODG 0x1c279815 /* x^671 mod G(x) */
#define CRC32_X607_MODG 0xae0b5394 /* x^607 mod G(x) */

#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */
#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */
#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 }
#define CRC32_X799_MODG 0xdf068dc2 /* x^799 mod G(x) */
#define CRC32_X735_MODG 0x57c54819 /* x^735 mod G(x) */

#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */
#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */
#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 }
#define CRC32_X927_MODG 0x31f8303f /* x^927 mod G(x) */
#define CRC32_X863_MODG 0x0cbec0ed /* x^863 mod G(x) */

#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */
#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */
#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 }
#define CRC32_X1055_MODG 0x33fff533 /* x^1055 mod G(x) */
#define CRC32_X991_MODG 0x910eeec1 /* x^991 mod G(x) */

#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */
#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */
#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 }
#define CRC32_X1183_MODG 0x26b70c3d /* x^1183 mod G(x) */
#define CRC32_X1119_MODG 0x3f41287a /* x^1119 mod G(x) */

#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */
#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */
#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 }
#define CRC32_X1311_MODG 0xe3543be0 /* x^1311 mod G(x) */
#define CRC32_X1247_MODG 0x9026d5b1 /* x^1247 mod G(x) */

#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */
#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */
#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 }
#define CRC32_X1439_MODG 0x5a1bb05d /* x^1439 mod G(x) */
#define CRC32_X1375_MODG 0xd1df2327 /* x^1375 mod G(x) */

#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */
#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */
#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 }
#define CRC32_X1567_MODG 0x596c8d81 /* x^1567 mod G(x) */
#define CRC32_X1503_MODG 0xf5e48c85 /* x^1503 mod G(x) */

#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */
#define CRC32_X1695_MODG 0x682bdd4f /* x^1695 mod G(x) */
#define CRC32_X1631_MODG 0x3c656ced /* x^1631 mod G(x) */

#define CRC32_X1823_MODG 0x4a28bd43 /* x^1823 mod G(x) */
#define CRC32_X1759_MODG 0xfe807bbd /* x^1759 mod G(x) */

#define CRC32_X1951_MODG 0x0077f00d /* x^1951 mod G(x) */
#define CRC32_X1887_MODG 0x1f0c2cdd /* x^1887 mod G(x) */

#define CRC32_X2079_MODG 0xce3371cb /* x^2079 mod G(x) */
#define CRC32_X2015_MODG 0xe95c1271 /* x^2015 mod G(x) */

#define CRC32_X2207_MODG 0xa749e894 /* x^2207 mod G(x) */
#define CRC32_X2143_MODG 0xb918a347 /* x^2143 mod G(x) */

#define CRC32_X2335_MODG 0x2c538639 /* x^2335 mod G(x) */
#define CRC32_X2271_MODG 0x71d54a59 /* x^2271 mod G(x) */

#define CRC32_X2463_MODG 0x32b0733c /* x^2463 mod G(x) */
#define CRC32_X2399_MODG 0xff6f2fc2 /* x^2399 mod G(x) */

#define CRC32_X2591_MODG 0x0e9bd5cc /* x^2591 mod G(x) */
#define CRC32_X2527_MODG 0xcec97417 /* x^2527 mod G(x) */

#define CRC32_X2719_MODG 0x76278617 /* x^2719 mod G(x) */
#define CRC32_X2655_MODG 0x1c63267b /* x^2655 mod G(x) */

#define CRC32_X2847_MODG 0xc51b93e3 /* x^2847 mod G(x) */
#define CRC32_X2783_MODG 0xf183c71b /* x^2783 mod G(x) */

#define CRC32_X2975_MODG 0x7eaed122 /* x^2975 mod G(x) */
#define CRC32_X2911_MODG 0x9b9bdbd0 /* x^2911 mod G(x) */

#define CRC32_X3103_MODG 0x2ce423f1 /* x^3103 mod G(x) */
#define CRC32_X3039_MODG 0xd31343ea /* x^3039 mod G(x) */

#define CRC32_X3231_MODG 0x8b8d8645 /* x^3231 mod G(x) */
#define CRC32_X3167_MODG 0x4470ac44 /* x^3167 mod G(x) */

#define CRC32_X3359_MODG 0x4b700aa8 /* x^3359 mod G(x) */
#define CRC32_X3295_MODG 0xeea395c4 /* x^3295 mod G(x) */

#define CRC32_X3487_MODG 0xeff5e99d /* x^3487 mod G(x) */
#define CRC32_X3423_MODG 0xf9d9c7ee /* x^3423 mod G(x) */

#define CRC32_X3615_MODG 0xad0d2bb2 /* x^3615 mod G(x) */
#define CRC32_X3551_MODG 0xcd669a40 /* x^3551 mod G(x) */

#define CRC32_X3743_MODG 0x9fb66bd3 /* x^3743 mod G(x) */
#define CRC32_X3679_MODG 0x6d40f445 /* x^3679 mod G(x) */

#define CRC32_X3871_MODG 0xc2dcc467 /* x^3871 mod G(x) */
#define CRC32_X3807_MODG 0x9ee62949 /* x^3807 mod G(x) */

#define CRC32_X3999_MODG 0x398e2ff2 /* x^3999 mod G(x) */
#define CRC32_X3935_MODG 0x145575d5 /* x^3935 mod G(x) */

#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */

#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
Expand Down
54 changes: 50 additions & 4 deletions lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,32 +86,71 @@ read_xcr(u32 index)

static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_SSE2, "sse2"},
{X86_CPU_FEATURE_PCLMUL, "pclmul"},
{X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"},
{X86_CPU_FEATURE_AVX, "avx"},
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
};

volatile u32 libdeflate_x86_cpu_features = 0;

/*
 * Report whether code that uses 512-bit vectors may be selected on this CPU.
 *
 * The answer is 'false' only for the Intel family 6 models enumerated below
 * (Skylake Server through Tiger Lake), which are excluded due to the
 * downclocking penalty.  Every other CPU is allowed.  Builds with
 * TEST_SUPPORT__DO_NOT_USE defined always allow 512-bit vectors.
 *
 * @manufacturer: the manufacturer ID, compared as 12 raw bytes against
 *		  "GenuineIntel"
 * @family, @model: the family and model numbers to check
 */
static inline bool
allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
{
#ifdef TEST_SUPPORT__DO_NOT_USE
	return true;
#endif
	/* The restriction applies only to Intel family 6 processors. */
	if (family == 6 && memcmp(manufacturer, "GenuineIntel", 12) == 0) {
		if (model == 85 ||  /* Skylake (Server), Cascade Lake, Cooper Lake */
		    model == 106 || /* Ice Lake (Server) */
		    model == 108 || /* Ice Lake (Server) */
		    model == 126 || /* Ice Lake (Client) */
		    model == 140 || /* Tiger Lake */
		    model == 141)   /* Tiger Lake */
			return false;
	}
	return true;
}

/* Initialize libdeflate_x86_cpu_features. */
void libdeflate_init_x86_cpu_features(void)
{
u32 max_leaf, a, b, c, d;
u32 max_leaf;
u32 manufacturer[3];
u32 family, model;
u32 a, b, c, d;
u64 xcr0 = 0;
u32 features = 0;

/* EAX=0: Highest Function Parameter and Manufacturer ID */
cpuid(0, 0, &max_leaf, &b, &c, &d);
cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2],
&manufacturer[1]);
if (max_leaf < 1)
goto out;

/* EAX=1: Processor Info and Feature Bits */
cpuid(1, 0, &a, &b, &c, &d);
family = (a >> 8) & 0xf;
model = (a >> 4) & 0xf;
if (family == 6 || family == 0xf)
model += (a >> 12) & 0xf0;
if (family == 0xf)
family += (a >> 20) & 0xff;
if (d & (1 << 26))
features |= X86_CPU_FEATURE_SSE2;
if (c & (1 << 1))
features |= X86_CPU_FEATURE_PCLMUL;
features |= X86_CPU_FEATURE_PCLMULQDQ;
if (c & (1 << 27))
xcr0 = read_xcr(0);
if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6))
Expand All @@ -126,6 +165,13 @@ void libdeflate_init_x86_cpu_features(void)
features |= X86_CPU_FEATURE_AVX2;
if (b & (1 << 8))
features |= X86_CPU_FEATURE_BMI2;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) &&
allow_512bit_vectors(manufacturer, family, model))
features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
features |= X86_CPU_FEATURE_AVX512VL;
if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
features |= X86_CPU_FEATURE_VPCLMULQDQ;

out:
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
Expand Down
65 changes: 55 additions & 10 deletions lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,22 @@
#endif

#define X86_CPU_FEATURE_SSE2 0x00000001
#define X86_CPU_FEATURE_PCLMUL 0x00000002
#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512F 0x00000020
#define X86_CPU_FEATURE_AVX512VL 0x00000040
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000080

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F))
#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
Expand Down Expand Up @@ -90,18 +96,18 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
#endif
#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS)

/* PCLMUL */
/* PCLMULQDQ */
#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
# define HAVE_PCLMUL_NATIVE 1
# define HAVE_PCLMULQDQ_NATIVE 1
#else
# define HAVE_PCLMUL_NATIVE 0
# define HAVE_PCLMULQDQ_NATIVE 0
#endif
#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \
(GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \
defined(_MSC_VER)))
# define HAVE_PCLMUL_INTRIN 1
#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \
(GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \
defined(_MSC_VER)))
# define HAVE_PCLMULQDQ_INTRIN 1
#else
# define HAVE_PCLMUL_INTRIN 0
# define HAVE_PCLMULQDQ_INTRIN 0
#endif

/* AVX */
Expand Down Expand Up @@ -156,6 +162,45 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_BMI2_INTRIN 0
#endif

/*
 * AVX-512F
 *
 * HAVE_AVX512F_NATIVE is 1 when the compiler predefines __AVX512F__, else 0.
 * HAVE_AVX512F_INTRIN is 1 when AVX-512F is native, when _MSC_VER is defined,
 * or when GCC_PREREQ(5, 1) or CLANG_PREREQ(3, 8, 0) holds (these are defined
 * elsewhere; presumably compiler version checks); otherwise it is 0.
 */
#if defined(__AVX512F__)
#  define HAVE_AVX512F_NATIVE	1
#else
#  define HAVE_AVX512F_NATIVE	0
#endif
#if HAVE_AVX512F_NATIVE || defined(_MSC_VER) || \
	GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0)
#  define HAVE_AVX512F_INTRIN	1
#else
#  define HAVE_AVX512F_INTRIN	0
#endif

/*
 * AVX-512VL
 *
 * HAVE_AVX512VL_NATIVE is 1 when the compiler predefines __AVX512VL__, else 0.
 * HAVE_AVX512VL_INTRIN is 1 when AVX-512VL is native, when _MSC_VER is
 * defined, or when GCC_PREREQ(5, 1) or CLANG_PREREQ(3, 8, 0) holds (these are
 * defined elsewhere; presumably compiler version checks); otherwise it is 0.
 */
#if defined(__AVX512VL__)
#  define HAVE_AVX512VL_NATIVE	1
#else
#  define HAVE_AVX512VL_NATIVE	0
#endif
#if HAVE_AVX512VL_NATIVE || defined(_MSC_VER) || \
	GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0)
#  define HAVE_AVX512VL_INTRIN	1
#else
#  define HAVE_AVX512VL_INTRIN	0
#endif

/*
 * VPCLMULQDQ
 *
 * HAVE_VPCLMULQDQ_NATIVE is 1 when the compiler predefines __VPCLMULQDQ__,
 * else 0.  HAVE_VPCLMULQDQ_INTRIN is 1 when VPCLMULQDQ is native, when
 * _MSC_VER is defined, or when GCC_PREREQ(8, 1) or CLANG_PREREQ(6, 0, 0) holds
 * (these are defined elsewhere; presumably compiler version checks);
 * otherwise it is 0.
 */
#if defined(__VPCLMULQDQ__)
#  define HAVE_VPCLMULQDQ_NATIVE	1
#else
#  define HAVE_VPCLMULQDQ_NATIVE	0
#endif
#if HAVE_VPCLMULQDQ_NATIVE || defined(_MSC_VER) || \
	GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0)
#  define HAVE_VPCLMULQDQ_INTRIN	1
#else
#  define HAVE_VPCLMULQDQ_INTRIN	0
#endif

#endif /* ARCH_X86_32 || ARCH_X86_64 */

#endif /* LIB_X86_CPU_FEATURES_H */
Loading

0 comments on commit 5f2a0b4

Please sign in to comment.