More efficient final reduction for CRC32 #399

Merged · 1 commit · Nov 18, 2024
37 changes: 13 additions & 24 deletions lib/arm/crc32_impl.h
@@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
};
- static const u64 _aligned_attribute(16) final_mults[3][2] = {
- { CRC32_X63_MODG, 0 },
- { CRC32_BARRETT_CONSTANT_1, 0 },
- { CRC32_BARRETT_CONSTANT_2, 0 },
+ static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
+ { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
+ { CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
};
+ static const u32 _aligned_attribute(16) mask32[4] = {
+ 0, 0, 0xffffffff, 0
+ };
- const uint8x16_t zeroes = vdupq_n_u8(0);
- const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
uint8x16_t v0, v1, v2, v3;

@@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
if (len)
v0 = fold_partial_vec(v0, p, len, multipliers_1);

- /*
- * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
- * which is equivalent to multiplying by x^32. This is needed because
- * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
- */
-
- v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
- clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
- /* Fold 96 => 64 bits. */
- v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
- clmul_low(vandq_u8(v0, mask32),
- load_multipliers(final_mults[0])));
-
- /* Reduce 64 => 32 bits using Barrett reduction. */
- v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
- v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
- return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+ /* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+ v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
+ v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
+ v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
+ v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
+ v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+ return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
}
#undef SUFFIX
#undef ATTRIBUTES
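
For readers tracing the reductions above by hand: the NEON clmul_low()/clmul_high() helpers (built on the 64-bit polynomial-multiply instruction) and pclmulqdq both compute a 64 x 64 -> 128-bit carryless, i.e. GF(2)-polynomial, product. The following is a minimal scalar model of that operation; it is not part of this PR, and the names are illustrative only.

/*
 * Sketch (not from this PR): scalar model of the 64 x 64 -> 128-bit
 * carryless multiply that the SIMD code relies on.
 */
#include <stdint.h>
#include <stdio.h>

struct clmul128 { uint64_t lo, hi; };

static struct clmul128 clmul64(uint64_t a, uint64_t b)
{
	struct clmul128 p = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			p.lo ^= a << i;
			if (i != 0)
				p.hi ^= a >> (64 - i);
		}
	}
	return p;
}

int main(void)
{
	/* (x^63 + 1) * (x + 1) = x^64 + x^63 + x + 1 */
	struct clmul128 p = clmul64(0x8000000000000001ULL, 0x3);

	printf("hi=%016llx lo=%016llx\n",
	       (unsigned long long)p.hi, (unsigned long long)p.lo);
	return 0;
}

For the example inputs this prints hi=0000000000000001 lo=8000000000000003, i.e. x^64 + x^63 + x + 1, as expected for a carryless product.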
4 changes: 1 addition & 3 deletions lib/crc32_multipliers.h
@@ -100,10 +100,8 @@
#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */

- #define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
- #define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+ #define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
- #define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }

#define CRC32_NUM_CHUNKS 4
#define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
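
As a quick sanity check on the constants above: CRC32_BARRETT_CONSTANT_2 encodes the 33-bit gzip generator polynomial G(x) in the LSB-first (bit-reflected) convention, with the coefficient of x^k stored at bit 32 - k. The stand-alone check below is a sketch, not part of the PR; g_exp is an illustrative name.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Exponents of the gzip generator polynomial G(x). */
	static const int g_exp[] = {
		32, 26, 23, 22, 16, 12, 11, 10, 8, 7, 5, 4, 2, 1, 0
	};
	uint64_t g = 0;

	/* Coefficient of x^k is stored at bit (32 - k). */
	for (size_t i = 0; i < sizeof(g_exp) / sizeof(g_exp[0]); i++)
		g |= (uint64_t)1 << (32 - g_exp[i]);

	assert(g == 0x00000001db710641ULL); /* CRC32_BARRETT_CONSTANT_2 */
	printf("G(x) = 0x%016llx\n", (unsigned long long)g);
	return 0;
}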
111 changes: 54 additions & 57 deletions lib/x86/crc32_pclmul_template.h
@@ -51,7 +51,10 @@
* instructions. Note that the x86 crc32 instruction cannot be used, as it is
* for a different polynomial, not the gzip one. For an explanation of CRC
* folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen_crc32_multipliers.c and the following blog posts and papers:
*
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -193,10 +196,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
const vec_t mults_2v = MULTS_2V;
const vec_t mults_1v = MULTS_1V;
const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
- const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
- const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
const __m128i barrett_reduction_constants =
_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+ const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
vec_t v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x0 = _mm_cvtsi32_si128(crc);
__m128i x1;
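
Note that _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1) puts CONSTANT_1 in the low qword and CONSTANT_2 in the high qword, so the imm8 values 0x00, 0x01 and 0x10 used in the reduction below choose which halves get multiplied. A tiny stand-alone demo of that selection rule follows; it is a sketch, not part of the PR, and must be compiled with PCLMUL support (e.g. -mpclmul).

/*
 * Sketch (not from this PR): the imm8 of _mm_clmulepi64_si128 selects
 * operand halves.  Bit 0 picks the qword of the first operand, bit 4
 * picks the qword of the second (0 = low, 1 = high).
 */
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* a = { lo: x, hi: x^2 }, b = { lo: x^3, hi: x^4 } */
	const __m128i a = _mm_set_epi64x(0x4, 0x2);
	const __m128i b = _mm_set_epi64x(0x10, 0x8);

	uint64_t p00 = _mm_cvtsi128_si64(_mm_clmulepi64_si128(a, b, 0x00));
	uint64_t p01 = _mm_cvtsi128_si64(_mm_clmulepi64_si128(a, b, 0x01));
	uint64_t p10 = _mm_cvtsi128_si64(_mm_clmulepi64_si128(a, b, 0x10));

	/* Expect x*x^3 = x^4 (0x10), x^2*x^3 = x^5 (0x20), x*x^4 = x^5 (0x20). */
	printf("0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 "\n", p00, p01, p10);
	return 0;
}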
@@ -389,68 +391,63 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
#if USE_AVX512
reduce_x0:
#endif

- /*
- * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
- * which is equivalent to multiplying by x^32. This is needed because
- * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
- */
- x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
- _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
- /* Fold 96 => 64 bits. */
- x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
- _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
- final_mult, 0x00));
-
- /*
- * Reduce 64 => 32 bits using Barrett reduction.
- *
- * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
- * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
- *
- * R(x) = (A(x)*x^32 + B(x)) mod G(x)
- *      = (A(x)*x^32) mod G(x) + B(x)
- *
- * Then, by the Division Algorithm there exists a unique q(x) such that:
- *
- * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
- *
- * Since the left-hand side is of maximum degree 31, the right-hand side
- * must be too. This implies that we can apply 'mod x^32' to the
- * right-hand side without changing its value:
- *
- * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
- *
- * Note that '+' is equivalent to '-' in polynomials over GF(2).
- *
- * We also know that:
- *
- *              / A(x)*x^32 \
- * q(x) = floor ( ---------  )
- *              \    G(x)   /
- *
- * To compute this efficiently, we can multiply the top and bottom by
- * x^32 and move the division by G(x) to the top:
- *
- *              / A(x) * floor(x^64 / G(x)) \
- * q(x) = floor ( -------------------------  )
- *              \           x^32            /
- *
- * Note that floor(x^64 / G(x)) is a constant.
- *
- * So finally we have:
- *
- *                          / A(x) * floor(x^64 / G(x)) \
- * R(x) = B(x) + G(x)*floor ( -------------------------  )
- *                          \           x^32            /
- */
- x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
- barrett_reduction_constants, 0x00);
- x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
- barrett_reduction_constants, 0x10);
- x0 = _mm_xor_si128(x0, x1);
- return _mm_extract_epi32(x0, 1);
+ /*
+ * Generate the final n-bit CRC from the 128-bit x0 = A as follows:
+ *
+ * crc = x^n * A mod G
+ *     = x^n * (x^64*A_H + A_L) mod G
+ *     = x^n * (x^(64-n)*(x^n*A_H mod G) + A_L) mod G
+ *
+ * I.e.:
+ * crc := 0
+ * crc := x^n * (x^(64-n)*crc + A_H) mod G
+ * crc := x^n * (x^(64-n)*crc + A_L) mod G
+ *
+ * A_H and A_L denote the high and low 64 polynomial coefficients in A.
+ *
+ * Using Barrett reduction to do the 'mod G', this becomes:
+ *
+ * crc := floor((A_H * floor(x^(m+n) / G)) / x^m) * G mod x^n
+ * A_L := x^(64-n)*crc + A_L
+ * crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
+ *
+ * For the gzip crc, n = 32 and the bit order is LSB (least significant
+ * bit) first. 'm' must be an integer >= 63 (the max degree of A_L and
+ * A_H) for sufficient precision to be carried through the calculation.
+ * As the gzip crc is LSB-first we use m == 63, which results in
+ * floor(x^(m+n) / G) being 64-bit which is the most pclmulqdq can
+ * accept. The multiplication with floor(x^(63+n) / G) then produces a
+ * 127-bit product, and the floored division by x^63 just takes the
+ * first qword.
+ */
+
+ /* tmp := floor((A_H * floor(x^(63+n) / G)) / x^63) */
+ x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+ /* tmp is in bits [0:64) of x1. */
+
+ /* crc := tmp * G mod x^n */
+ x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
+ /* crc is in bits [64:64+n) of x1. */
+
+ /*
+ * A_L := x^(64-n)*crc + A_L
+ * crc is already aligned to add (XOR) it directly to A_L, after
+ * selecting it using a mask.
+ */
+ #if USE_AVX512
+ x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
+ #else
+ x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
+ #endif
+
+ /*
+ * crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
+ * Same as previous but uses the low-order 64 coefficients of A.
+ */
+ x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
+ x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
+
+ /* Extract the CRC from bits [64:64+n) of x0. */
+ return _mm_extract_epi32(x0, 2);
}

#undef vec_t
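
The identity stated in the new comment, crc := floor((A * floor(x^(m+n) / G)) / x^m) * G mod x^n with m = 63 and n = 32, can be checked independently of the SIMD code. The sketch below is not part of the PR: it works in plain MSB-first polynomial order rather than the reflected order the real code uses, assumes a compiler providing unsigned __int128 (GCC/Clang), and all names are illustrative. It verifies the Barrett result against straightforward long division.

/*
 * Sketch (not from this PR): check the Barrett identity in MSB-first
 * polynomial order, with n = 32 and m = 63.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* The gzip CRC generator polynomial G(x), bit i <-> coefficient of x^i. */
#define G_POLY ((u128)0x104C11DB7)

/* Carryless (GF(2)) product of two 64-bit polynomials. */
static u128 clmul64(uint64_t a, uint64_t b)
{
	u128 prod = 0;

	for (int i = 0; i < 64; i++)
		if ((b >> i) & 1)
			prod ^= (u128)a << i;
	return prod;
}

/* Degree of a nonzero polynomial: index of its highest set bit. */
static int poly_degree(u128 p)
{
	int d = 0;

	while (p >>= 1)
		d++;
	return d;
}

/* p mod G(x), by long division. */
static uint32_t poly_mod_G(u128 p)
{
	while (p >> 32)
		p ^= G_POLY << (poly_degree(p) - 32);
	return (uint32_t)p;
}

int main(void)
{
	/* K = floor(x^95 / G(x)), a degree-63 (64-bit) polynomial. */
	u128 rem = (u128)1 << 95;
	uint64_t K = 0;

	for (int i = 95; i >= 32; i--) {
		if ((rem >> i) & 1) {
			K |= (uint64_t)1 << (i - 32);
			rem ^= G_POLY << (i - 32);
		}
	}

	/* An arbitrary 64-bit message polynomial A (degree <= 63 = m). */
	const uint64_t A = 0x0123456789abcdefULL;

	/* q := floor((A * floor(x^95 / G)) / x^63) */
	const uint64_t q = (uint64_t)(clmul64(A, K) >> 63);
	/* crc := q * G mod x^32 */
	const uint32_t crc = (uint32_t)clmul64(q, (uint64_t)G_POLY);
	/* Reference: x^32 * A mod G, by plain long division. */
	const uint32_t ref = poly_mod_G((u128)A << 32);

	assert(crc == ref);
	printf("floor(x^95/G) = 0x%016llx, crc = 0x%08x\n",
	       (unsigned long long)K, crc);
	return 0;
}

The quotient K computed here should correspond, up to the bit order, to CRC32_BARRETT_CONSTANT_1 in the LSB-first convention of the actual implementation; note also that, per the diff, the LSB-first layout is why A_H sits in the low qword of x0 (imm 0x00 first) and the final CRC is extracted from lane 2.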
40 changes: 21 additions & 19 deletions scripts/gen_crc32_multipliers.c
@@ -74,17 +74,28 @@ compute_xD_modG(size_t D)
return remainder;
}

- /* Compute floor(x^64 / G(x)) */
+ /* Compute floor(x^95 / G(x)) */
static u64
- compute_x64_div_G(void)
+ compute_x95_div_G(void)
{
+ /* The quotient, max order 95 - 32 = 63. */
u64 quotient = 0;
- u64 dividend = 0x1;

- for (int i = 0; i < 64 - 32 + 1; i++) {
- if ((dividend >> i) & 1) {
- quotient |= (u64)1 << i;
- dividend ^= CRCPOLY_FULL << i;
+ /*
+ * The x^32 through x^95 terms of the remainder. This starts at x^95
+ * and is updated through long division. At the end only the
+ * x^0 through x^31 terms will be nonzero, but those are unneeded.
+ */
+ u64 remainder = 0x1;
+
+ for (int i = 0; i < 64; i++) {
+ /*
+ * If the x^(95-i) term of remainder is nonzero, add
+ * x^(63-i) * G(x) to cancel it out. (G(x) has order 32.)
+ */
+ if (remainder & (1ULL << i)) {
+ quotient |= 1ULL << i;
+ remainder ^= (u64)CRCPOLY_FULL << i;
}
}

@@ -123,20 +134,11 @@
printf("\n");
}

- /* Multiplier for final 96 => 64 bit fold */
- printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
- compute_xD_modG(63));
-
- /*
- * Constants for final 64 => 32 bit reduction. These constants are the
- * odd ones out, as this final reduction step can't use the regular CRC
- * folding described above. It uses Barrett reduction instead.
- */
- printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
- compute_x64_div_G());
+ /* Constants for the final 128 => 32 bit reduction */
+ printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n",
+ compute_x95_div_G());
printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
CRCPOLY_FULL);
- printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
}

/* Multipliers for combining the CRCs of separate chunks */
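
With these generator changes, the Barrett section of the regenerated header should match the lib/crc32_multipliers.h hunk above, i.e.:

#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */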