From 614e9bf53602d9dc671fbb03f9432e2f6148669c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 26 Nov 2024 20:52:53 -0800
Subject: [PATCH] lib/x86/crc32: fix undefined behavior in VPCLMULQDQ optimized
 functions

The specifications for _mm256_castsi128_si256() and
_mm512_castsi128_si512() are bugged, as they leave the high bits
undefined instead of zeroed as would be expected.  Separate intrinsics
_mm256_zextsi128_si256() and _mm512_zextsi128_si512() were later added
to allow working around this defect.  Use them.

This fixes incorrect CRC checksums produced by
crc32_x86_vpclmulqdq_avx512_vl512() when built with 'clang -O0'.
Other cases are not known to have been affected.

Resolves https://github.com/ebiggers/libdeflate/issues/403
Fixes: 5f2a0b4beca9 ("lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32")
---
 lib/x86/crc32_impl.h            | 10 ++++++++--
 lib/x86/crc32_pclmul_template.h |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
index eda7b4cb..c8909cde 100644
--- a/lib/x86/crc32_impl.h
+++ b/lib/x86/crc32_impl.h
@@ -78,8 +78,10 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  *
  * gcc 8.1 and 8.2 had a similar bug where they assumed that
  * _mm256_clmulepi64_epi128() always needed AVX512.  It's fixed in gcc 8.3.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
-#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
 	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 #  define crc32_x86_vpclmulqdq_avx2	crc32_x86_vpclmulqdq_avx2
 #  define SUFFIX				 _vpclmulqdq_avx2
@@ -89,7 +91,7 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 #  include "crc32_pclmul_template.h"
 #endif
 
-#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
 	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 /*
  * VPCLMULQDQ/AVX512 implementation using 256-bit vectors.  This is very similar
@@ -97,6 +99,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  * instruction and more registers.  This is used on CPUs that support AVX-512
  * but where using 512-bit vectors causes downclocking.  This should also be the
  * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl256  crc32_x86_vpclmulqdq_avx512_vl256
 #  define SUFFIX				      _vpclmulqdq_avx512_vl256
@@ -109,6 +113,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  * VPCLMULQDQ/AVX512 implementation using 512-bit vectors.  This is used on CPUs
  * that have a good AVX-512 implementation including VPCLMULQDQ.  This should
  * also be the optimal implementation on CPUs that support AVX10/512.
+ *
+ * _mm512_zextsi128_si512() requires gcc 10.
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl512  crc32_x86_vpclmulqdq_avx512_vl512
 #  define SUFFIX				      _vpclmulqdq_avx512_vl512
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index 09abb515..df804a29 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -80,7 +80,7 @@
 #  define fold_vec		fold_vec256
 #  define VLOADU(p)		_mm256_loadu_si256((const void *)(p))
 #  define VXOR(a, b)		_mm256_xor_si256((a), (b))
-#  define M128I_TO_VEC(a)	_mm256_castsi128_si256(a)
+#  define M128I_TO_VEC(a)	_mm256_zextsi128_si256(a)
 #  define MULTS(a, b)		_mm256_set_epi64x(a, b, a, b)
 #  define MULTS_8V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
 #  define MULTS_4V		MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
@@ -91,7 +91,7 @@
 #  define fold_vec		fold_vec512
 #  define VLOADU(p)		_mm512_loadu_si512((const void *)(p))
 #  define VXOR(a, b)		_mm512_xor_si512((a), (b))
-#  define M128I_TO_VEC(a)	_mm512_castsi128_si512(a)
+#  define M128I_TO_VEC(a)	_mm512_zextsi128_si512(a)
 #  define MULTS(a, b)		_mm512_set_epi64(a, b, a, b, a, b, a, b)
 #  define MULTS_8V		MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
 #  define MULTS_4V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)