Skip to content

Commit

Permalink
More efficient final reduction for CRC32
Browse files Browse the repository at this point in the history
  • Loading branch information
ebiggers committed Nov 18, 2024
1 parent 48d0c47 commit dad2871
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 103 deletions.
37 changes: 13 additions & 24 deletions lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
};
static const u64 _aligned_attribute(16) final_mults[3][2] = {
{ CRC32_X63_MODG, 0 },
{ CRC32_BARRETT_CONSTANT_1, 0 },
{ CRC32_BARRETT_CONSTANT_2, 0 },
static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
{ CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
{ CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
};
static const u32 _aligned_attribute(16) mask32[4] = {
0, 0, 0xffffffff, 0
};
const uint8x16_t zeroes = vdupq_n_u8(0);
const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
uint8x16_t v0, v1, v2, v3;

Expand Down Expand Up @@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
if (len)
v0 = fold_partial_vec(v0, p, len, multipliers_1);

/*
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
* which is equivalent to multiplying by x^32. This is needed because
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
*/

v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));

/* Fold 96 => 64 bits. */
v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
clmul_low(vandq_u8(v0, mask32),
load_multipliers(final_mults[0])));

/* Reduce 64 => 32 bits using Barrett reduction. */
v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
}
#undef SUFFIX
#undef ATTRIBUTES
Expand Down
4 changes: 1 addition & 3 deletions lib/crc32_multipliers.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,8 @@
#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */

#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }

#define CRC32_NUM_CHUNKS 4
#define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
Expand Down
111 changes: 54 additions & 57 deletions lib/x86/crc32_pclmul_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@
* instructions. Note that the x86 crc32 instruction cannot be used, as it is
* for a different polynomial, not the gzip one. For an explanation of CRC
* folding with carryless multiplication instructions, see
* scripts/gen_crc32_multipliers.c and the following paper:
* scripts/gen_crc32_multipliers.c and the following blog posts and papers:
*
* "An alternative exposition of crc32_4k_pclmulqdq"
* https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
Expand Down Expand Up @@ -193,10 +196,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
const vec_t mults_2v = MULTS_2V;
const vec_t mults_1v = MULTS_1V;
const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
const __m128i barrett_reduction_constants =
_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
vec_t v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x0 = _mm_cvtsi32_si128(crc);
__m128i x1;
Expand Down Expand Up @@ -389,68 +391,63 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
#if USE_AVX512
reduce_x0:
#endif

/*
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
* which is equivalent to multiplying by x^32. This is needed because
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
*/
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
_mm_clmulepi64_si128(x0, mults_128b, 0x10));

/* Fold 96 => 64 bits. */
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
_mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
final_mult, 0x00));

/*
* Reduce 64 => 32 bits using Barrett reduction.
*
* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
*
* R(x) = (A(x)*x^32 + B(x)) mod G(x)
* = (A(x)*x^32) mod G(x) + B(x)
*
* Then, by the Division Algorithm there exists a unique q(x) such that:
*
* A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
*
* Since the left-hand side is of maximum degree 31, the right-hand side
* must be too. This implies that we can apply 'mod x^32' to the
* right-hand side without changing its value:
*
* (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
*
* Note that '+' is equivalent to '-' in polynomials over GF(2).
* Generate the final n-bit CRC from the 128-bit x0 = A as follows:
*
* We also know that:
* crc = x^n * A mod G
* = x^n * (x^64*A_H + A_L) mod G
* = x^n * (x^(64-n)*(x^n*A_H mod G) + A_L) mod G
*
* / A(x)*x^32 \
* q(x) = floor ( --------- )
* \ G(x) /
* I.e.:
* crc := 0
* crc := x^n * (x^(64-n)*crc + A_H) mod G
* crc := x^n * (x^(64-n)*crc + A_L) mod G
*
* To compute this efficiently, we can multiply the top and bottom by
* x^32 and move the division by G(x) to the top:
* A_H and A_L denote the high and low 64 polynomial coefficients in A.
*
* / A(x) * floor(x^64 / G(x)) \
* q(x) = floor ( ------------------------- )
* \ x^32 /
* Using Barrett reduction to do the 'mod G', this becomes:
*
* Note that floor(x^64 / G(x)) is a constant.
* crc := floor((A_H * floor(x^(m+n) / G)) / x^m) * G mod x^n
* A_L := x^(64-n)*crc + A_L
* crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
*
* So finally we have:
*
* / A(x) * floor(x^64 / G(x)) \
* R(x) = B(x) + G(x)*floor ( ------------------------- )
* \ x^32 /
* For the gzip crc, n = 32 and the bit order is LSB (least significant
* bit) first. 'm' must be an integer >= 63 (the max degree of A_L and
* A_H) for sufficient precision to be carried through the calculation.
* As the gzip crc is LSB-first we use m == 63, which results in
* floor(x^(m+n) / G) being 64-bit which is the most pclmulqdq can
* accept. The multiplication with floor(x^(63+n) / G) then produces a
* 127-bit product, and the floored division by x^63 just takes the
* first qword.
*/

/* tmp := floor((A_H * floor(x^(63+n) / G)) / x^63) */
x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
/* tmp is in bits [0:64) of x1. */

/* crc := tmp * G mod x^n */
x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
/* crc is in bits [64:64+n) of x1. */

/*
* A_L := x^(64-n)*crc + A_L
* crc is already aligned to add (XOR) it directly to A_L, after
* selecting it using a mask.
*/
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
barrett_reduction_constants, 0x00);
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
barrett_reduction_constants, 0x10);
x0 = _mm_xor_si128(x0, x1);
return _mm_extract_epi32(x0, 1);
#if USE_AVX512
x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
#else
x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
#endif
/*
* crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
* Same as previous but uses the low-order 64 coefficients of A.
*/
x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);

/* Extract the CRC from bits [64:64+n) of x0. */
return _mm_extract_epi32(x0, 2);
}

#undef vec_t
Expand Down
40 changes: 21 additions & 19 deletions scripts/gen_crc32_multipliers.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,17 +74,28 @@ compute_xD_modG(size_t D)
return remainder;
}

/* Compute floor(x^64 / G(x)) */
/* Compute floor(x^95 / G(x)) */
static u64
compute_x64_div_G(void)
compute_x95_div_G(void)
{
/* The quotient, max order 95 - 32 = 63. */
u64 quotient = 0;
u64 dividend = 0x1;

for (int i = 0; i < 64 - 32 + 1; i++) {
if ((dividend >> i) & 1) {
quotient |= (u64)1 << i;
dividend ^= CRCPOLY_FULL << i;
/*
* The x^32 through x^95 terms of the remainder. This starts at x^95
* and is updated through long division. At the end only the
* x^0 through x^31 terms will be nonzero, but those are unneeded.
*/
u64 remainder = 0x1;

for (int i = 0; i < 64; i++) {
/*
* If the x^(95-i) term of remainder is nonzero, add
* x^(63-i) * G(x) to cancel it out. (G(x) has order 32.)
*/
if (remainder & (1ULL << i)) {
quotient |= 1ULL << i;
remainder ^= (u64)CRCPOLY_FULL << i;
}
}

Expand Down Expand Up @@ -123,20 +134,11 @@ gen_vec_folding_constants(void)
printf("\n");
}

/* Multiplier for final 96 => 64 bit fold */
printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
compute_xD_modG(63));

/*
* Constants for final 64 => 32 bit reduction. These constants are the
* odd ones out, as this final reduction step can't use the regular CRC
* folding described above. It uses Barrett reduction instead.
*/
printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
compute_x64_div_G());
/* Constants for the final 128 => 32 bit reduction */
printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n",
compute_x95_div_G());
printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
CRCPOLY_FULL);
printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
}

/* Multipliers for combining the CRCs of separate chunks */
Expand Down

0 comments on commit dad2871

Please sign in to comment.