Skip to content

Commit

Permalink
Improve large buffer crc32c performance on Arm [2/3]
Browse files Browse the repository at this point in the history
Summary:
Apply Folly changes to 3rd-party imported code and prepare for integration.

Conditionalizs build on appropriate compiler environment; remove implicit inversion; change function signature to match existing Folly conventions.

Add a helper function, compiled with basic ISA support, to detect at runtime whether this code is safe to use.

Reviewed By: skrueger

Differential Revision: D59321461

fbshipit-source-id: 8ae9226d769810a7366e2f81c857fcfcb08fafed
  • Loading branch information
Michael van der Westhuizen authored and facebook-github-bot committed Sep 29, 2024
1 parent 7b48184 commit e7499ed
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 15 deletions.
38 changes: 38 additions & 0 deletions folly/external/fast-crc32/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,41 @@ cpp_library(
"//folly:portability",
],
)

cpp_library(
name = "neon_crc32c_v3s4x2e_v2",
srcs = [
"neon_crc32c_v3s4x2e_v2.cpp",
],
headers = [
"neon_crc32c_v3s4x2e_v2.h",
],
arch_preprocessor_flags = {
"aarch64": [
"-march=armv8-a+crypto+crc",
],
},
deps = [
"//folly:portability",
"//folly/system:aux_vector",
],
)

cpp_library(
name = "neon_eor3_crc32c_v8s2x4_s3",
srcs = [
"neon_eor3_crc32c_v8s2x4_s3.cpp",
],
headers = [
"neon_eor3_crc32c_v8s2x4_s3.h",
],
arch_preprocessor_flags = {
"aarch64": [
"-march=armv8-a+crypto+crc+sha3",
],
},
deps = [
"//folly:portability",
"//folly/system:aux_vector",
],
)
42 changes: 34 additions & 8 deletions folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,27 @@
/* ./generate -i neon -p crc32c -a v3s4x2e_v2 */
/* MIT licensed */

#if 0
#include "folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.h"
#include <folly/system/AuxVector.h> // @manual
#include <folly/Portability.h>

#include <stddef.h>
#include <stdint.h>

#define CRC_EXPORT extern

#if !(FOLLY_AARCH64 && FOLLY_NEON && FOLLY_ARM_FEATURE_CRYPTO && FOLLY_ARM_FEATURE_CRC32)
#include <stdlib.h>
namespace folly::detail {
CRC_EXPORT uint32_t neon_crc32c_v3s4x2e_v2(const uint8_t*, size_t, uint32_t) {
abort(); // not implemented on this platform
}

CRC_EXPORT bool has_neon_crc32c_v3s4x2e_v2() {
return false;
}
}
#else
#include <arm_acle.h>
#include <arm_neon.h>

Expand All @@ -15,8 +33,8 @@
#define CRC_AINLINE static __inline __attribute__((always_inline))
#define CRC_ALIGN(n) __attribute__((aligned(n)))
#endif
#define CRC_EXPORT extern

namespace folly::detail {
CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
uint64x2_t r;
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b));
Expand Down Expand Up @@ -58,8 +76,15 @@ CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
}

CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
crc0 = ~crc0;
FOLLY_TARGET_ATTRIBUTE("crc")
CRC_EXPORT bool has_neon_crc32c_v3s4x2e_v2() {
static ElfHwCaps caps;

return caps.aarch64_fp() && caps.aarch64_asimd() && caps.aarch64_pmull() &&
caps.aarch64_crc32();
}

CRC_EXPORT uint32_t neon_crc32c_v3s4x2e_v2(const uint8_t* buf, size_t len, uint32_t crc0) {
for (; len && ((uintptr_t)buf & 7); --len) {
crc0 = __crc32cb(crc0, *buf++);
}
Expand All @@ -69,11 +94,11 @@ CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
len -= 8;
}
if (len >= 112) {
const char* end = buf + len;
const uint8_t* end = buf + len;
size_t blk = (len - 0) / 112;
size_t klen = blk * 16;
const char* buf2 = buf + klen * 4;
const char* limit = buf + klen - 32;
const uint8_t* buf2 = buf + klen * 4;
const uint8_t* limit = buf + klen - 32;
uint32_t crc1 = 0;
uint32_t crc2 = 0;
uint32_t crc3 = 0;
Expand Down Expand Up @@ -159,6 +184,7 @@ CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
for (; len; --len) {
crc0 = __crc32cb(crc0, *buf++);
}
return ~crc0;
return crc0;
}
} // namespace folly::detail
#endif
8 changes: 8 additions & 0 deletions folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#pragma once
#include <cstddef>
#include <cstdint>

namespace folly::detail {
uint32_t neon_crc32c_v3s4x2e_v2(const uint8_t* buf, size_t len, uint32_t crc0);
bool has_neon_crc32c_v3s4x2e_v2();
}
40 changes: 33 additions & 7 deletions folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,27 @@
/* ./generate -i neon_eor3 -p crc32c -a v8s2x4_s3 */
/* MIT licensed */

#if 0
#include <stddef.h>
#include "folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.h"
#include <folly/system/AuxVector.h> // @manual
#include <folly/Portability.h>

#include <stdint.h>
#include <stddef.h>

#define CRC_EXPORT extern

#if !(FOLLY_AARCH64 && FOLLY_NEON && FOLLY_ARM_FEATURE_CRYPTO && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_SHA3)
#include <stdlib.h>
namespace folly::detail {
CRC_EXPORT uint32_t neon_eor3_crc32c_v8s2x4_s3(const uint8_t*, size_t, uint32_t) {
abort(); // not implemented on this platform
}

CRC_EXPORT bool has_neon_eor3_crc32c_v8s2x4_s3() {
return false;
}
}
#else
#include <arm_acle.h>
#include <arm_neon.h>

Expand All @@ -15,8 +33,8 @@
#define CRC_AINLINE static __inline __attribute__((always_inline))
#define CRC_ALIGN(n) __attribute__((aligned(n)))
#endif
#define CRC_EXPORT extern

namespace folly::detail {
CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
uint64x2_t r;
__asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b));
Expand Down Expand Up @@ -58,8 +76,15 @@ CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
}

CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
crc0 = ~crc0;
FOLLY_TARGET_ATTRIBUTE("crc")
CRC_EXPORT bool has_neon_eor3_crc32c_v8s2x4_s3() {
static ElfHwCaps caps;

return caps.aarch64_fp() && caps.aarch64_asimd() && caps.aarch64_pmull() &&
caps.aarch64_crc32() && caps.aarch64_sha3();
}

CRC_EXPORT uint32_t neon_eor3_crc32c_v8s2x4_s3(const uint8_t* buf, size_t len, uint32_t crc0) {
for (; len && ((uintptr_t)buf & 7); --len) {
crc0 = __crc32cb(crc0, *buf++);
}
Expand All @@ -71,7 +96,7 @@ CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
if (len >= 192) {
size_t blk = (len - 0) / 192;
size_t klen = blk * 32;
const char* buf2 = buf + klen * 2;
const uint8_t* buf2 = buf + klen * 2;
uint32_t crc1 = 0;
uint64x2_t vc0;
uint64x2_t vc1;
Expand Down Expand Up @@ -184,6 +209,7 @@ CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
for (; len; --len) {
crc0 = __crc32cb(crc0, *buf++);
}
return ~crc0;
return crc0;
}
} // namespace folly::detail
#endif
8 changes: 8 additions & 0 deletions folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#pragma once
#include <cstddef>
#include <cstdint>

namespace folly::detail {
uint32_t neon_eor3_crc32c_v8s2x4_s3(const uint8_t* buf, size_t len, uint32_t crc0);
bool has_neon_eor3_crc32c_v8s2x4_s3();
}

0 comments on commit e7499ed

Please sign in to comment.