Added x86 SIMD optimizations to crypto datatypes.
- The v128 operations are optimized for SSE2/SSSE3/SSE4.1.
- srtp_octet_string_is_eq is optimized for SSE2. When SSE2 is not
  available, the bulk of the comparison uses a pair of 32-bit
  accumulators to exploit the instruction-level parallelism most
  modern CPUs provide (see the first sketch after this list).
- In srtp_cleanse, use memset and keep it from being optimized away
  with a dummy asm statement that, as far as the compiler knows, may
  consume the contents of the memory (second sketch below).
- Endian conversion functions use gcc-style intrinsics when possible.
- In base64_block_to_octet_triple, prefer memchr to strchr, since it
  takes an explicit string length, which here is known at compile
  time (third sketch below).
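
A minimal sketch of the non-SSE2 fallback idea for the comparison (the
function name and loop structure are illustrative, not the commit's exact
code):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: OR the XOR-differences into two independent 32-bit
       accumulators so the CPU can overlap the two dependency chains,
       then fold them at the end. Returns 1 if equal, 0 otherwise. */
    static int bytes_equal_sketch(const uint8_t *a, const uint8_t *b,
                                  size_t len)
    {
        uint32_t acc1 = 0, acc2 = 0;
        size_t i = 0;
        for (; i + 8 <= len; i += 8) { /* bulk: 8 bytes per iteration */
            uint32_t a1, a2, b1, b2;
            memcpy(&a1, a + i, 4);
            memcpy(&b1, b + i, 4);
            memcpy(&a2, a + i + 4, 4);
            memcpy(&b2, b + i + 4, 4);
            acc1 |= a1 ^ b1;
            acc2 |= a2 ^ b2;
        }
        for (; i < len; ++i) /* tail: one byte at a time */
            acc1 |= (uint32_t)(a[i] ^ b[i]);
        return (acc1 | acc2) == 0;
    }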
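
The cleanse technique, roughly (a sketch assuming a gcc/clang compiler; not
the commit's exact code):

    #include <string.h>

    /* Sketch: zero the buffer, then pass the pointer to an empty asm
       statement with a "memory" clobber. As far as the compiler knows,
       the asm may consume the zeroed bytes, so the memset cannot be
       removed as a dead store. */
    static void cleanse_sketch(void *s, size_t len)
    {
        memset(s, 0, len);
    #if defined(__GNUC__)
        __asm__ __volatile__("" : : "r"(s) : "memory");
    #endif
    }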
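
And the memchr point, illustrated (the table name and helper are
hypothetical):

    #include <string.h>

    static const char b64chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        "0123456789+/";

    /* Sketch: memchr receives the table length as a compile-time
       constant, whereas strchr would rescan for the terminating NUL
       on every call. Returns the 6-bit value, or -1 if invalid. */
    static int b64_value_sketch(char c)
    {
        const char *p = memchr(b64chars, c, sizeof(b64chars) - 1);
        return p ? (int)(p - b64chars) : -1;
    }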

The SIMD code uses intrinsics, which are available in all modern compilers.
For MSVC, config_in_cmake.h is modified to define the gcc/clang-style SSE
macros based on MSVC's predefined macros: every SSE level is enabled when
MSVC indicates that AVX is enabled, and SSE2 is always enabled for x86-64,
or for x86 when SSE2 FP math is enabled.
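
For context, a tiny, hypothetical check of how these macros end up selecting
a code path like the ones in datatypes.h (illustrative only):

    #include <stdio.h>

    /* After the config header has mapped the compiler's flags to the
       gcc/clang-style macros, dispatch follows the usual pattern. */
    int main(void)
    {
    #if defined(__SSE4_1__)
        puts("SSE4.1 path");
    #elif defined(__SSE2__)
        puts("SSE2 path");
    #else
        puts("portable path");
    #endif
        return 0;
    }
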
Lastique committed Sep 21, 2020
1 parent 7d351de commit f7f31d5
Showing 3 changed files with 316 additions and 12 deletions.
11 changes: 11 additions & 0 deletions config_in_cmake.h
@@ -113,3 +113,14 @@
#define inline
#endif
#endif

/* Define gcc/clang-style SSE macros on compilers that don't define them (primarily MSVC). */
#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#define __SSE2__
#endif
#if !defined(__SSSE3__) && defined(__AVX__)
#define __SSSE3__
#endif
#if !defined(__SSE4_1__) && defined(__AVX__)
#define __SSE4_1__
#endif
72 changes: 67 additions & 5 deletions crypto/include/datatypes.h
@@ -62,6 +62,10 @@
#error "Platform not recognized"
#endif

#if defined(__SSE2__)
#include <smmintrin.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif
@@ -137,6 +141,62 @@ void v128_right_shift(v128_t *x, int shift_index);
* (and the compiler provides better warnings).
*/

#if defined(__SSE2__)

#define _v128_set_to_zero(x) \
(_mm_storeu_si128((__m128i *)(x), _mm_setzero_si128()))

#define _v128_copy(x, y) \
(_mm_storeu_si128((__m128i *)(x), _mm_loadu_si128((const __m128i *)(y))))

#define _v128_xor(z, x, y) \
(_mm_storeu_si128((__m128i *)(z), \
_mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
_mm_loadu_si128((const __m128i *)(y)))))

#define _v128_and(z, x, y) \
(_mm_storeu_si128((__m128i *)(z), \
_mm_and_si128(_mm_loadu_si128((const __m128i *)(x)), \
_mm_loadu_si128((const __m128i *)(y)))))

#define _v128_or(z, x, y) \
(_mm_storeu_si128((__m128i *)(z), \
_mm_or_si128(_mm_loadu_si128((const __m128i *)(x)), \
_mm_loadu_si128((const __m128i *)(y)))))

#define _v128_complement(x) \
({ \
__m128i _mm = _mm_undefined_si128(); \
_mm_storeu_si128((__m128i *)(x), \
_mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
_mm_cmpeq_epi32(_mm, _mm))); \
})

#if defined(__SSE4_1__)

#define _v128_is_eq(x, y) \
({ \
__m128i _mm = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
_mm_loadu_si128((const __m128i *)(y))); \
_mm_testz_si128(_mm, _mm); \
})

#else /* defined(__SSE4_1__) */

#define _v128_is_eq(x, y) \
(_mm_movemask_epi8(_mm_cmpeq_epi32( \
_mm_loadu_si128((const __m128i *)(x)), \
_mm_loadu_si128((const __m128i *)(y)))) == 0x0000ffff)

#endif /* defined(__SSE4_1__) */

#define _v128_xor_eq(z, x) \
(_mm_storeu_si128((__m128i *)(z), \
_mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
_mm_loadu_si128((const __m128i *)(z)))))

#else /* defined(__SSE2__) */

#define _v128_set_to_zero(x) \
((x)->v32[0] = 0, (x)->v32[1] = 0, (x)->v32[2] = 0, (x)->v32[3] = 0)

@@ -179,6 +239,8 @@ void v128_right_shift(v128_t *x, int shift_index);
((z)->v64[0] ^= (x)->v64[0], (z)->v64[1] ^= (x)->v64[1])
#endif

#endif /* defined(__SSE2__) */

/* NOTE! This assumes an odd ordering! */
/* This will not be compatible directly with math on some processors */
/* bit 0 is first 32-bit word, low order bit. in little-endian, that's
@@ -278,13 +340,11 @@ void octet_string_set_to_zero(void *s, size_t len);
#define be64_to_cpu(x) bswap_64((x))
#else /* WORDS_BIGENDIAN */

-#if defined(__GNUC__) && defined(HAVE_X86)
+#if defined(__GNUC__)
/* Fall back. */
static inline uint32_t be32_to_cpu(uint32_t v)
{
-    /* optimized for x86. */
-    asm("bswap %0" : "=r"(v) : "0"(v));
-    return v;
+    return __builtin_bswap32(v);
}
#else /* HAVE_X86 */
#ifdef HAVE_NETINET_IN_H
@@ -297,7 +357,9 @@ static inline uint32_t be32_to_cpu(uint32_t v)

static inline uint64_t be64_to_cpu(uint64_t v)
{
-#ifdef NO_64BIT_MATH
+#if defined(__GNUC__)
+    v = __builtin_bswap64(v);
+#elif defined(NO_64BIT_MATH)
/* use the make64 functions to do 64-bit math */
v = make64(htonl(low32(v)), htonl(high32(v)));
#else /* NO_64BIT_MATH */