From 32f6c7bdf552553f4aba971cf75dc6919859daef Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 5 May 2020 22:14:41 +0900 Subject: [PATCH 01/60] it works --- include/picotls/fusion.h | 50 ++++ lib/fusion.c | 426 ++++++++++++++++++++++++++++++ picotls.xcodeproj/project.pbxproj | 137 +++++++++- t/fusion.c | 113 ++++++++ 4 files changed, 724 insertions(+), 2 deletions(-) create mode 100644 include/picotls/fusion.h create mode 100644 lib/fusion.c create mode 100644 t/fusion.c diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h new file mode 100644 index 000000000..d82782ff2 --- /dev/null +++ b/include/picotls/fusion.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Fastly, Kazuho Oku + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifndef picotls_fusion_h +#define picotls_fusion_h + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../picotls.h" + +#define PTLS_FUSION_AESGCM_ROUNDS 10 /* TODO support AES256 */ + +typedef struct ptls_fusion_aesgcm_context_t { + __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; + struct { + __m128i H; + __m128i r; + } ghash[6]; +} ptls_fusion_aesgcm_context_t; + +void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *key); +void ptls_fusion_aesgcm_dispose(ptls_fusion_aesgcm_context_t *ctx); + +extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/fusion.c b/lib/fusion.c new file mode 100644 index 000000000..d82a85a68 --- /dev/null +++ b/lib/fusion.c @@ -0,0 +1,426 @@ +/* + * This source file is licensed under the Apache License 2.0 *and* the MIT + * License. Please agree to *both* of the licensing terms! + * + * + * `transformH` function is a derivative work of OpenSSL. The original work + * is covered by the following license: + * + * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + * + * + * All other work, including modifications to the `transformH` function is + * covered by the following MIT license: + * + * Copyright (c) 2020 Fastly, Kazuho Oku + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include +#include +#include +#include "picotls.h" +#include "picotls/fusion.h" + +static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000}; +#define poly (*(__m128i *)poly_) +static const uint8_t bswap8_[16] __attribute__((aligned(16))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; +#define bswap8 (*(__m128i *)bswap8_) + +// This function is covered by the Apache License and the MIT License. See Above. 
+static __m128i transformH(__m128i H) +{ + // # <<1 twist + // pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + __m128i t2 = _mm_shuffle_epi32(H, 0xff); + // movdqa $Hkey,$T1 + __m128i t1 = H; + // psllq \$1,$Hkey + H = _mm_slli_epi64(H, 1); + // pxor $T3,$T3 # + __m128i t3 = _mm_setzero_si128(); + // psrlq \$63,$T1 + t1 = _mm_srli_epi64(t1, 63); + // pcmpgtd $T2,$T3 # broadcast carry bit + t3 = _mm_cmplt_epi32(t2, t3); + // pslldq \$8,$T1 + t1 = _mm_slli_si128(t1, 8); + // por $T1,$Hkey # H<<=1 + H = _mm_or_si128(t1, H); + + // # magic reduction + // pand .L0x1c2_polynomial(%rip),$T3 + t3 = _mm_and_si128(t3, poly); + // pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + H = _mm_xor_si128(t3, H); + + return H; +} +// end of Apache License code + +static __m128i gfmul(__m128i x, __m128i y) +{ + __m128i lo = _mm_clmulepi64_si128(x, y, 0x00); + __m128i hi = _mm_clmulepi64_si128(x, y, 0x11); + + __m128i a = _mm_shuffle_epi32(x, 78); + __m128i b = _mm_shuffle_epi32(y, 78); + a = _mm_xor_si128(a, x); + b = _mm_xor_si128(b, y); + + a = _mm_clmulepi64_si128(a, b, 0x00); + a = _mm_xor_si128(a, lo); + a = _mm_xor_si128(a, hi); + + b = _mm_slli_si128(a, 8); + a = _mm_srli_si128(a, 8); + + lo = _mm_xor_si128(lo, b); + hi = _mm_xor_si128(hi, a); + + // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf + __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, t); + t = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, t); + + return _mm_xor_si128(hi, lo); +} + +#define AESECB6(b1, b2, b3, b4, b5, b6, b7, b8, b9) \ + do { \ + __m128i aesk = ctx->keys[0]; \ + __m128i aes1 = _mm_xor_si128(data[0], aesk); \ + __m128i aes2 = _mm_xor_si128(data[1], aesk); \ + __m128i aes3 = _mm_xor_si128(data[2], aesk); \ + __m128i aes4 = _mm_xor_si128(data[3], aesk); \ + __m128i aes5 = _mm_xor_si128(data[4], aesk); \ + __m128i aes6 = _mm_xor_si128(data[5], aesk); \ + aesk = 
ctx->keys[1]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b1} aesk = ctx->keys[2]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b2} aesk = ctx->keys[3]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b3} aesk = ctx->keys[4]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b4} aesk = ctx->keys[5]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b5} aesk = ctx->keys[6]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b6} aesk = ctx->keys[7]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b7} aesk = ctx->keys[8]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = 
_mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b8} aesk = ctx->keys[9]; \ + aes1 = _mm_aesenc_si128(aes1, aesk); \ + aes2 = _mm_aesenc_si128(aes2, aesk); \ + aes3 = _mm_aesenc_si128(aes3, aesk); \ + aes4 = _mm_aesenc_si128(aes4, aesk); \ + aes5 = _mm_aesenc_si128(aes5, aesk); \ + aes6 = _mm_aesenc_si128(aes6, aesk); \ + {b9} aesk = ctx->keys[10]; \ + data[0] = _mm_aesenclast_si128(aes1, aesk); \ + data[1] = _mm_aesenclast_si128(aes2, aesk); \ + data[2] = _mm_aesenclast_si128(aes3, aesk); \ + data[3] = _mm_aesenclast_si128(aes4, aesk); \ + data[4] = _mm_aesenclast_si128(aes5, aesk); \ + data[5] = _mm_aesenclast_si128(aes6, aesk); \ + } while (0) + +static inline void aesecb6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data) +{ + AESECB6({}, {}, {}, {}, {}, {}, {}, {}, {}); +} + +#define GHASH6(FUNC) \ + do { \ + __m128i X, lo, hi, mid, r, t; \ + FUNC( \ + { \ + X = _mm_loadu_si128(gdata + 5); \ + X = _mm_shuffle_epi8(X, bswap8); \ + lo = _mm_clmulepi64_si128(ctx->ghash[0].H, X, 0x00); \ + hi = _mm_clmulepi64_si128(ctx->ghash[0].H, X, 0x11); \ + mid = _mm_shuffle_epi32(X, 78); \ + mid = _mm_xor_si128(mid, X); \ + mid = _mm_clmulepi64_si128(ctx->ghash[0].r, mid, 0x00); \ + }, \ + { \ + X = _mm_loadu_si128(gdata + 4); \ + X = _mm_shuffle_epi8(X, bswap8); \ + t = _mm_clmulepi64_si128(ctx->ghash[1].H, X, 0x00); \ + lo = _mm_xor_si128(lo, t); \ + t = _mm_clmulepi64_si128(ctx->ghash[1].H, X, 0x11); \ + hi = _mm_xor_si128(hi, t); \ + t = _mm_shuffle_epi32(X, 78); \ + t = _mm_xor_si128(t, X); \ + t = _mm_clmulepi64_si128(ctx->ghash[1].r, t, 0x00); \ + mid = _mm_xor_si128(mid, t); \ + }, \ + { \ + X = _mm_loadu_si128(gdata + 3); \ + X = _mm_shuffle_epi8(X, bswap8); \ + t = _mm_clmulepi64_si128(ctx->ghash[2].H, X, 0x00); \ + lo = _mm_xor_si128(lo, t); \ + t = _mm_clmulepi64_si128(ctx->ghash[2].H, X, 0x11); \ + hi = 
_mm_xor_si128(hi, t); \ + t = _mm_shuffle_epi32(X, 78); \ + t = _mm_xor_si128(t, X); \ + t = _mm_clmulepi64_si128(ctx->ghash[2].r, t, 0x00); \ + mid = _mm_xor_si128(mid, t); \ + }, \ + { \ + X = _mm_loadu_si128(gdata + 2); \ + X = _mm_shuffle_epi8(X, bswap8); \ + t = _mm_clmulepi64_si128(ctx->ghash[3].H, X, 0x00); \ + lo = _mm_xor_si128(lo, t); \ + t = _mm_clmulepi64_si128(ctx->ghash[3].H, X, 0x11); \ + hi = _mm_xor_si128(hi, t); \ + t = _mm_shuffle_epi32(X, 78); \ + t = _mm_xor_si128(t, X); \ + t = _mm_clmulepi64_si128(ctx->ghash[3].r, t, 0x00); \ + mid = _mm_xor_si128(mid, t); \ + }, \ + { \ + X = _mm_loadu_si128(gdata + 1); \ + X = _mm_shuffle_epi8(X, bswap8); \ + t = _mm_clmulepi64_si128(ctx->ghash[4].H, X, 0x00); \ + lo = _mm_xor_si128(lo, t); \ + t = _mm_clmulepi64_si128(ctx->ghash[4].H, X, 0x11); \ + hi = _mm_xor_si128(hi, t); \ + t = _mm_shuffle_epi32(X, 78); \ + t = _mm_xor_si128(t, X); \ + t = _mm_clmulepi64_si128(ctx->ghash[4].r, t, 0x00); \ + mid = _mm_xor_si128(mid, t); \ + }, \ + { \ + X = _mm_loadu_si128(gdata + 0); \ + X = _mm_shuffle_epi8(X, bswap8); \ + X = _mm_xor_si128(X, ghash); \ + t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x00); \ + lo = _mm_xor_si128(lo, t); \ + t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x11); \ + }, \ + { \ + hi = _mm_xor_si128(hi, t); \ + t = _mm_shuffle_epi32(X, 78); \ + t = _mm_xor_si128(t, X); \ + t = _mm_clmulepi64_si128(ctx->ghash[5].r, t, 0x00); \ + mid = _mm_xor_si128(mid, t); \ + }, \ + { \ + mid = _mm_xor_si128(mid, hi); \ + mid = _mm_xor_si128(mid, lo); \ + lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); \ + hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); \ + \ + /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ \ + r = _mm_clmulepi64_si128(lo, poly, 0x10); \ + }, \ + { \ + lo = _mm_shuffle_epi32(lo, 78); \ + lo = _mm_xor_si128(lo, r); \ + r = _mm_clmulepi64_si128(lo, poly, 0x10); \ + lo = _mm_shuffle_epi32(lo, 78); \ + lo = _mm_xor_si128(lo, r); \ + ghash = _mm_xor_si128(hi, lo); \ + 
}); \ + return ghash; \ + } while (0) + +static inline __m128i ghash6(ptls_fusion_aesgcm_context_t *ctx, const __m128i *gdata, __m128i ghash) +{ +#define FUNC(b1, b2, b3, b4, b5, b6, b7, b8, b9) {b1} {b2} {b3} {b4} {b5} {b6} {b7} {b8} {b9} + GHASH6(FUNC); +#undef FUNC +} + +static inline __m128i aesecb6ghash6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data, const __m128i *gdata, __m128i ghash) +{ + GHASH6(AESECB6); +} + +static __m128i expand_key(__m128i key, __m128i t) +{ + t = _mm_shuffle_epi32(t, _MM_SHUFFLE(3, 3, 3, 3)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + return _mm_xor_si128(key, t); +} + +void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *_userkey) +{ + __m128i userkey = _mm_loadu_si128((__m128i *)_userkey); + size_t i = 0; + + ctx->keys[i++] = userkey; +#define EXPAND(R) \ + do { \ + ctx->keys[i] = expand_key(ctx->keys[i - 1], _mm_aeskeygenassist_si128(ctx->keys[i - 1], R)); \ + ++i; \ + } while (0) + EXPAND(0x1); + EXPAND(0x2); + EXPAND(0x4); + EXPAND(0x8); + EXPAND(0x10); + EXPAND(0x20); + EXPAND(0x40); + EXPAND(0x80); + EXPAND(0x1b); + EXPAND(0x36); +#undef EXPAND + + ctx->ghash[0].H = ctx->keys[0]; + for (i = 1; i < PTLS_FUSION_AESGCM_ROUNDS; ++i) + ctx->ghash[0].H = _mm_aesenc_si128(ctx->ghash[0].H, ctx->keys[i]); + ctx->ghash[0].H = _mm_aesenclast_si128(ctx->ghash[0].H, ctx->keys[PTLS_FUSION_AESGCM_ROUNDS]); + ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8); + + ctx->ghash[0].H = transformH(ctx->ghash[0].H); + for (int i = 1; i < 6; ++i) + ctx->ghash[i].H = gfmul(ctx->ghash[i - 1].H, ctx->ghash[0].H); + for (int i = 0; i < 6; ++i) { + __m128i r = _mm_shuffle_epi32(ctx->ghash[i].H, 78); + r = _mm_xor_si128(r, ctx->ghash[i].H); + ctx->ghash[i].r = r; + } +} + +void ptls_fusion_aesgcm_dispose(ptls_fusion_aesgcm_context_t *ctx) +{ + ptls_clear_memory(ctx, sizeof(*ctx)); +} + +struct aesgcm_context 
{ + ptls_aead_context_t super; + ptls_fusion_aesgcm_context_t aesgcm; +}; + +static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) +{ + struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; + + ptls_fusion_aesgcm_dispose(&ctx->aesgcm); +} + +static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, const void *iv, const void *aad, size_t aadlen) +{ + assert(!"FIXME"); +} + +static size_t aead_do_encrypt_update(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen) +{ + assert(!"FIXME"); + return SIZE_MAX; +} + +static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) +{ + assert(!"FIXME"); + return SIZE_MAX; +} + +static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen) +{ + assert(!"FIXME"); + return SIZE_MAX; +} + +static int aes128gcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key) +{ + struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; + + ctx->super.dispose_crypto = aesgcm_dispose_crypto; + if (is_enc) { + ctx->super.do_encrypt_init = aead_do_encrypt_init; + ctx->super.do_encrypt_update = aead_do_encrypt_update; + ctx->super.do_encrypt_final = aead_do_encrypt_final; + ctx->super.do_decrypt = NULL; + } else { + ctx->super.do_encrypt_init = NULL; + ctx->super.do_encrypt_update = NULL; + ctx->super.do_encrypt_final = NULL; + ctx->super.do_decrypt = aead_do_decrypt; + } + + assert(is_enc); + ptls_fusion_aesgcm_init(&ctx->aesgcm, key); + + return 0; +} + +ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", + NULL, // &ptls_fusion_aes128ctr, + NULL, // &ptls_fusion_aes128ecb, + PTLS_AES128_KEY_SIZE, + PTLS_AESGCM_IV_SIZE, + PTLS_AESGCM_TAG_SIZE, + sizeof(struct aesgcm_context), + aes128gcm_setup_crypto}; diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index 4788870e1..62b303107 100644 --- a/picotls.xcodeproj/project.pbxproj +++ 
b/picotls.xcodeproj/project.pbxproj @@ -10,7 +10,6 @@ 105900431DC8D57000FB4085 /* picotls.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E91D9B7C13005B2C60 /* picotls.c */; }; 105900441DC8D57000FB4085 /* picotest.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E31D9B4021005B2C60 /* picotest.c */; }; 1059004C1DC8D5B700FB4085 /* openssl.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530C51D9B1A98005B2C60 /* openssl.c */; }; - 1059004E1DC8D61800FB4085 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; 105900501DC8D64E00FB4085 /* minicrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 1059004F1DC8D64E00FB4085 /* minicrypto.h */; }; 105900611DC8DF8C00FB4085 /* sha256.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059005F1DC8DE4400FB4085 /* sha256.c */; }; 105900641DC8DFA700FB4085 /* curve25519.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900391DC8D46A00FB4085 /* curve25519.c */; }; @@ -112,6 +111,34 @@ E99B75E31F5CE54D00CF503E /* asn1.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DE1F5CDDB500CF503E /* asn1.c */; }; E99B75E41F5CE64E00CF503E /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; E99B75E51F5CE64E00CF503E /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; + E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */ = {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; + E9B43DC324619D5100824E51 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900241DC8D37500FB4085 /* aes.c */; }; + E9B43DC424619D5100824E51 /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; + E9B43DC524619D5100824E51 /* ffx.c in Sources */ = {isa = PBXBuildFile; fileRef = E97577022212405D00D1EF74 /* ffx.c */; }; + E9B43DC624619D5100824E51 /* picotls.c in Sources */ = {isa = 
PBXBuildFile; fileRef = 106530E91D9B7C13005B2C60 /* picotls.c */; }; + E9B43DC724619D5100824E51 /* uECC.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900BC1DC96A3500FB4085 /* uECC.c */; }; + E9B43DC824619D5100824E51 /* picotest.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E31D9B4021005B2C60 /* picotest.c */; }; + E9B43DC924619D5100824E51 /* sha256.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059005F1DC8DE4400FB4085 /* sha256.c */; }; + E9B43DCA24619D5100824E51 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE422E34B340018D260 /* chacha20.c */; }; + E9B43DCB24619D5100824E51 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; + E9B43DCD24619D5100824E51 /* asn1.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DE1F5CDDB500CF503E /* asn1.c */; }; + E9B43DCE24619D5100824E51 /* sha512.c in Sources */ = {isa = PBXBuildFile; fileRef = E9E865E9203BD45600E2FFCD /* sha512.c */; }; + E9B43DCF24619D5100824E51 /* aes256.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE022E34B340018D260 /* aes256.c */; }; + E9B43DD024619D5100824E51 /* random.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BF922E34C110018D260 /* random.c */; }; + E9B43DD124619D5100824E51 /* aes128.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE222E34B340018D260 /* aes128.c */; }; + E9B43DD224619D5100824E51 /* drbg.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900511DC8D79300FB4085 /* drbg.c */; }; + E9B43DD324619D5100824E51 /* x25519.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE122E34B340018D260 /* x25519.c */; }; + E9B43DD424619D5100824E51 /* poly1305.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76D61EF3C1C200EB7A09 /* poly1305.c */; }; + E9B43DD524619D5100824E51 /* hmac.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900651DC8DFD300FB4085 /* hmac.c */; }; + E9B43DD624619D5100824E51 /* chash.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059006B1DC8E00400FB4085 
/* chash.c */; }; + E9B43DD724619D5100824E51 /* gf128.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900AA1DC941D700FB4085 /* gf128.c */; }; + E9B43DD824619D5100824E51 /* curve25519.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900391DC8D46A00FB4085 /* curve25519.c */; }; + E9B43DD924619D5100824E51 /* gcm.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900A91DC941D700FB4085 /* gcm.c */; }; + E9B43DDA24619D5100824E51 /* modes.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900AF1DC9438200FB4085 /* modes.c */; }; + E9B43DDB24619D5100824E51 /* blockwise.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900291DC8D39800FB4085 /* blockwise.c */; }; + E9B43DE324619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; + E9B43DE424619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; + E9B43DE524619E1600824E51 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; E9BC76CF1EF3A35E00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D21EF3A36A00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D41EF3A37200EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; @@ -205,6 +232,15 @@ ); runOnlyForDeploymentPostprocessing = 1; }; + E9B43DDD24619D5100824E51 /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = /usr/share/man/man1/; + dstSubfolderSpec = 0; + files = ( + ); + runOnlyForDeploymentPostprocessing = 1; + }; /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ @@ -269,6 +305,10 @@ E992F7A920E99A7C0008154D /* picotls-esni */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex 
= 0; path = "picotls-esni"; sourceTree = BUILT_PRODUCTS_DIR; }; E99B75DE1F5CDDB500CF503E /* asn1.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = asn1.c; sourceTree = ""; }; E99B75DF1F5CDDB500CF503E /* pembase64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = pembase64.c; sourceTree = ""; }; + E9B43DBF24619D1700824E51 /* fusion.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = fusion.c; sourceTree = ""; }; + E9B43DE124619D5100824E51 /* test-fusion */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "test-fusion"; sourceTree = BUILT_PRODUCTS_DIR; }; + E9B43DE224619D7E00824E51 /* fusion.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = fusion.c; sourceTree = ""; }; + E9B43DE62461A06800824E51 /* fusion.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = fusion.h; sourceTree = ""; }; E9BC76C61EF3A2F700EB7A09 /* chacha20.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = chacha20.c; path = src/chacha20.c; sourceTree = ""; }; E9BC76CC1EF3A31000EB7A09 /* salsa20.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = salsa20.h; path = src/salsa20.h; sourceTree = ""; }; E9BC76D61EF3C1C200EB7A09 /* poly1305.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = poly1305.c; path = src/poly1305.c; sourceTree = ""; }; @@ -340,6 +380,13 @@ ); runOnlyForDeploymentPostprocessing = 0; }; + E9B43DDC24619D5100824E51 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ @@ -416,6 +463,7 @@ 1059008C1DC8E1A300FB4085 /* libpicotls-openssl.a */, 10EACB171DCEAF0F00CA0341 /* libpicotls-minicrypto.a 
*/, E992F7A920E99A7C0008154D /* picotls-esni */, + E9B43DE124619D5100824E51 /* test-fusion */, ); name = Products; sourceTree = ""; @@ -434,6 +482,7 @@ children = ( E99B75DE1F5CDDB500CF503E /* asn1.c */, E97577022212405D00D1EF74 /* ffx.c */, + E9B43DBF24619D1700824E51 /* fusion.c */, E99B75DF1F5CDDB500CF503E /* pembase64.c */, E9F20BDF22E34B210018D260 /* cifra */, 1059003F1DC8D53200FB4085 /* cifra.c */, @@ -451,6 +500,7 @@ children = ( 106530FE1DAD8A3C005B2C60 /* cli.c */, E97577072213148800D1EF74 /* e2e.t */, + E9B43DE224619D7E00824E51 /* fusion.c */, 106530E91D9B7C13005B2C60 /* picotls.c */, 1059003D1DC8D4E300FB4085 /* minicrypto.c */, 106530C51D9B1A98005B2C60 /* openssl.c */, @@ -486,6 +536,7 @@ 1059004F1DC8D64E00FB4085 /* minicrypto.h */, 106530ED1D9CEFF7005B2C60 /* openssl.h */, E9E4B1292180514000514B47 /* certificate_compression.h */, + E9B43DE62461A06800824E51 /* fusion.h */, ); path = picotls; sourceTree = ""; @@ -726,6 +777,23 @@ productReference = E992F7A920E99A7C0008154D /* picotls-esni */; productType = "com.apple.product-type.tool"; }; + E9B43DC024619D5100824E51 /* test-fusion */ = { + isa = PBXNativeTarget; + buildConfigurationList = E9B43DDE24619D5100824E51 /* Build configuration list for PBXNativeTarget "test-fusion" */; + buildPhases = ( + E9B43DC124619D5100824E51 /* Sources */, + E9B43DDC24619D5100824E51 /* Frameworks */, + E9B43DDD24619D5100824E51 /* CopyFiles */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "test-fusion"; + productName = "test-crypto-openssl"; + productReference = E9B43DE124619D5100824E51 /* test-fusion */; + productType = "com.apple.product-type.tool"; + }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ @@ -748,6 +816,7 @@ developmentRegion = English; hasScannedForEncodings = 0; knownRegions = ( + English, en, ); mainGroup = 106530A91D9985E0005B2C60; @@ -762,6 +831,7 @@ 106530CB1D9B3D45005B2C60 /* test-openssl */, 105900411DC8D57000FB4085 /* test-minicrypto */, E992F79B20E99A7C0008154D /* 
picotls-esni */, + E9B43DC024619D5100824E51 /* test-fusion */, ); }; /* End PBXProject section */ @@ -778,10 +848,10 @@ 105900431DC8D57000FB4085 /* picotls.c in Sources */, 105900C41DC96B2200FB4085 /* uECC.c in Sources */, 105900441DC8D57000FB4085 /* picotest.c in Sources */, + E9B43DE524619E1600824E51 /* minicrypto.c in Sources */, 105900611DC8DF8C00FB4085 /* sha256.c in Sources */, E9F20BF122E34B480018D260 /* chacha20.c in Sources */, E9BC76D21EF3A36A00EB7A09 /* chacha20.c in Sources */, - 1059004E1DC8D61800FB4085 /* minicrypto.c in Sources */, E99B75E31F5CE54D00CF503E /* asn1.c in Sources */, E9E865ED203BD46700E2FFCD /* sha512.c in Sources */, E9F20BF022E34B480018D260 /* aes256.c in Sources */, @@ -847,6 +917,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + E9B43DE324619D7E00824E51 /* fusion.c in Sources */, E99B75E01F5CDDB500CF503E /* asn1.c in Sources */, E99B75E11F5CDDB500CF503E /* pembase64.c in Sources */, E95E95382290456B00215ACD /* picotls-probes.d in Sources */, @@ -901,6 +972,39 @@ ); runOnlyForDeploymentPostprocessing = 0; }; + E9B43DC124619D5100824E51 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */, + E9B43DC324619D5100824E51 /* aes.c in Sources */, + E9B43DC424619D5100824E51 /* pembase64.c in Sources */, + E9B43DE424619D7E00824E51 /* fusion.c in Sources */, + E9B43DC524619D5100824E51 /* ffx.c in Sources */, + E9B43DC624619D5100824E51 /* picotls.c in Sources */, + E9B43DC724619D5100824E51 /* uECC.c in Sources */, + E9B43DC824619D5100824E51 /* picotest.c in Sources */, + E9B43DC924619D5100824E51 /* sha256.c in Sources */, + E9B43DCA24619D5100824E51 /* chacha20.c in Sources */, + E9B43DCB24619D5100824E51 /* chacha20.c in Sources */, + E9B43DCD24619D5100824E51 /* asn1.c in Sources */, + E9B43DCE24619D5100824E51 /* sha512.c in Sources */, + E9B43DCF24619D5100824E51 /* aes256.c in Sources */, + E9B43DD024619D5100824E51 /* 
random.c in Sources */, + E9B43DD124619D5100824E51 /* aes128.c in Sources */, + E9B43DD224619D5100824E51 /* drbg.c in Sources */, + E9B43DD324619D5100824E51 /* x25519.c in Sources */, + E9B43DD424619D5100824E51 /* poly1305.c in Sources */, + E9B43DD524619D5100824E51 /* hmac.c in Sources */, + E9B43DD624619D5100824E51 /* chash.c in Sources */, + E9B43DD724619D5100824E51 /* gf128.c in Sources */, + E9B43DD824619D5100824E51 /* curve25519.c in Sources */, + E9B43DD924619D5100824E51 /* gcm.c in Sources */, + E9B43DDA24619D5100824E51 /* modes.c in Sources */, + E9B43DDB24619D5100824E51 /* blockwise.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXSourcesBuildPhase section */ /* Begin PBXTargetDependency section */ @@ -1214,6 +1318,26 @@ }; name = Release; }; + E9B43DDF24619D5100824E51 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)"; + OTHER_CFLAGS = "-march=native"; + OTHER_LDFLAGS = ""; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Debug; + }; + E9B43DE024619D5100824E51 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)"; + OTHER_CFLAGS = "-march=native"; + OTHER_LDFLAGS = ""; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Release; + }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ @@ -1289,6 +1413,15 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; + E9B43DDE24619D5100824E51 /* Build configuration list for PBXNativeTarget "test-fusion" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E9B43DDF24619D5100824E51 /* Debug */, + E9B43DE024619D5100824E51 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; /* End XCConfigurationList section */ }; rootObject = 106530AA1D9985E0005B2C60 /* Project object */; diff --git a/t/fusion.c b/t/fusion.c new file mode 100644 index 000000000..d249cf278 --- 
/dev/null +++ b/t/fusion.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2020 Fastly, Kazuho Oku + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include "../deps/picotest/picotest.h" +#include "../lib/fusion.c" + +static void dump(const void *_p, size_t len) +{ + const uint8_t *p = _p; + for (size_t i = 0; i != len; ++i) + printf("%02x", p[i]); + printf("\n"); +} + +int main(int argc, char **argv) +{ + static const uint8_t userkey[16] = {}; + static const uint8_t plaintext[16] = {}; + __m128i ONE = _mm_set_epi32(0, 0, 0, 1), BSWAP64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + ptls_fusion_aesgcm_context_t ctx; + + ptls_fusion_aesgcm_init(&ctx, userkey); + + if (1) { /* test */ + __m128i ecb6[6], ctr = _mm_setzero_si128(); + for (int i = 0; i < 6; ++i) { + ctr = _mm_add_epi64(ctr, ONE); + ecb6[i] = _mm_shuffle_epi8(ctr, bswap8); + } + aesecb6(&ctx, ecb6); + __m128i gdata[6]; + for (int i = 0; i < 5; ++i) + gdata[i] = ecb6[i + 1]; + gdata[5] = _mm_shuffle_epi8(_mm_set_epi32(0, 8 * 16 * 5, 0, 0), BSWAP64); + __m128i dummy[6] = {}, ghash = {}; + ghash = aesecb6ghash6(&ctx, dummy, gdata, ghash); + ghash = _mm_shuffle_epi8(ghash, bswap8); + __m128i tag = _mm_xor_si128(ghash, ecb6[0]); + dump(ecb6 + 1, 16); + dump(ecb6 + 1, 16 * 5); + dump(&tag, 16); + + { + __m128i gx = {}; + gx = ghash6(&ctx, gdata, gx); + gx = _mm_shuffle_epi8(gx, bswap8); + tag = _mm_xor_si128(gx, ecb6[0]); + dump(&tag, 16); + } + } + + { /* benchmark */ + __m128i test[300] = {}, ghash = {}; + __m128i ctr = _mm_setzero_si128(); + for (int i = 0; i < 300; ++i) + memcpy(test + i, plaintext, 16); + for (int i = 0; i < 5000000; ++i) { + for (int j = 0; j < 300;) { + __m128i bits[6]; + ctr = _mm_add_epi64(ctr, ONE); + bits[0] = _mm_shuffle_epi8(ctr, BSWAP64); + ctr = _mm_add_epi64(ctr, ONE); + bits[1] = _mm_shuffle_epi8(ctr, BSWAP64); + ctr = _mm_add_epi64(ctr, ONE); + bits[2] = _mm_shuffle_epi8(ctr, BSWAP64); + ctr = _mm_add_epi64(ctr, ONE); + bits[3] = _mm_shuffle_epi8(ctr, BSWAP64); + ctr = _mm_add_epi64(ctr, ONE); + bits[4] = _mm_shuffle_epi8(ctr, BSWAP64); + ctr = 
_mm_add_epi64(ctr, ONE); + bits[5] = _mm_shuffle_epi8(ctr, BSWAP64); + ghash = aesecb6ghash6(&ctx, bits, j == 0 ? test + 294 : test + j - 6, ghash); + // aesecb4(&ctx, bits); + _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[0])); + ++j; + _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[1])); + ++j; + _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[2])); + ++j; + _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[3])); + ++j; + _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[3])); + ++j; + _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[3])); + ++j; + } + } + dump(&ghash, sizeof(ghash)); + } + + return 0; +} From fa13ede8ba326dbc150d7845f0e741c9de6a277d Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Wed, 6 May 2020 16:19:57 +0900 Subject: [PATCH 02/60] it works --- include/picotls/fusion.h | 4 + lib/fusion.c | 214 +++++++++++++++++++++++++++++++++++++++ t/fusion.c | 51 +++++++++- 3 files changed, 268 insertions(+), 1 deletion(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index d82782ff2..6df03ccc7 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -26,6 +26,8 @@ extern "C" { #endif +#include +#include #include "../picotls.h" #define PTLS_FUSION_AESGCM_ROUNDS 10 /* TODO support AES256 */ @@ -40,6 +42,8 @@ typedef struct ptls_fusion_aesgcm_context_t { void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *key); void ptls_fusion_aesgcm_dispose(ptls_fusion_aesgcm_context_t *ctx); +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, + const void *_src, size_t srclen); extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; diff --git a/lib/fusion.c b/lib/fusion.c index d82a85a68..31d8339d2 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -38,6 +38,7 @@ * IN THE SOFTWARE. 
*/ #include +#include #include #include #include "picotls.h" @@ -299,11 +300,224 @@ static inline __m128i ghash6(ptls_fusion_aesgcm_context_t *ctx, const __m128i *g #undef FUNC } +static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, size_t cnt, __m128i ghash) +{ + __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(); + assert(cnt <= 6); + + for (size_t i = 0; i < cnt; ++i) { + __m128i X = _mm_loadu_si128(src + cnt - 1 - i); + X = _mm_shuffle_epi8(X, bswap8); + if (i == cnt - 1) + X = _mm_xor_si128(X, ghash); + __m128i t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x00); + lo = _mm_xor_si128(lo, t); + t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x11); + hi = _mm_xor_si128(hi, t); + t = _mm_shuffle_epi32(X, 78); + t = _mm_xor_si128(t, X); + t = _mm_clmulepi64_si128(ctx->ghash[i].r, t, 0x00); + mid = _mm_xor_si128(mid, t); + } + + mid = _mm_xor_si128(mid, hi); + mid = _mm_xor_si128(mid, lo); + lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); + hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); + + /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ + __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, r); + r = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, r); + ghash = _mm_xor_si128(hi, lo); + + return ghash; +} + static inline __m128i aesecb6ghash6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data, const __m128i *gdata, __m128i ghash) { GHASH6(AESECB6); } +static inline __m128i loadn(const void *_p, size_t l) +{ + const uint8_t *p = _p; + uint8_t buf[16] = {}; + + for (size_t i = 0; i != l; ++i) + buf[i] = p[i]; + return *(__m128i *)buf; +} + +static inline void storen(void *_p, size_t l, __m128i v) +{ + uint8_t buf[16], *p = _p; + + *(__m128i *)buf = v; + + for (size_t i = 0; i != l; ++i) + p[i] = buf[i]; +} + +static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i 
*dst, const __m128i *dst_ghash, const __m128i *aad, size_t aadlen, + __m128i ghash, __m128i ac, __m128i ek0) +{ + const __m128i *enc = dst_ghash; + size_t enclen = (const uint8_t *)dst - (const uint8_t *)enc; + __m128i gdata[6]; + int gdata_index; + + while (1) { + gdata_index = 0; + if (aadlen != 0) { + while (aadlen >= 16) { + gdata[gdata_index++] = *aad++; + aadlen -= 16; + if (gdata_index == 6) + goto GHASH6; + } + if (aadlen != 0) { + gdata[gdata_index++] = loadn(aad, aadlen); + aadlen = 0; + if (gdata_index == 6) + goto GHASH6; + } + } + if (enclen != 0) { + while (enclen >= 16) { + gdata[gdata_index++] = *enc++; + enclen -= 16; + if (gdata_index == 6) + goto GHASH6; + } + if (enclen != 0) { + gdata[gdata_index++] = loadn(enc, enclen); + enclen = 0; + if (gdata_index == 6) + goto GHASH6; + } + } + __m128i bswap64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + gdata[gdata_index++] = _mm_shuffle_epi8(ac, bswap64); + break; + + GHASH6: + ghash = ghash6(ctx, gdata, ghash); + } + + /* final */ +#if 0 + for (int i = 0; i < gdata_index; ++i) + ghash = gfmul(_mm_xor_si128(ghash, _mm_shuffle_epi8(gdata[i], bswap8)), ctx->ghash[0].H); +#else + ghash = ghashn(ctx, gdata, gdata_index, ghash); +#endif + __m128i tag = _mm_shuffle_epi8(ghash, bswap8); + tag = _mm_xor_si128(tag, ek0); + _mm_storeu_si128(dst, tag); +} + +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, + const void *_src, size_t srclen) +{ + __m128i bswap64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7), one = _mm_set_epi32(0, 1, 0, 0); + __m128i ctr, ek0, bits[6], gdatabuf[6], ghash = _mm_setzero_si128(); + int ek0_encrypted = 0; + __m128i ac = _mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8); + + // src and dst are updated after the chunk is processed + const __m128i *src = _src; + __m128i *dst = _dst; + // aad and src_ghash are updated before the chunk is processed (i.e., when 
the pointers are fed indo the processor) + const __m128i *aad = _aad, *dst_ghash = dst; + + /* build counter */ + ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); + ctr = _mm_shuffle_epi8(ctr, bswap8); + ctr = _mm_add_epi64(ctr, one); + ek0 = _mm_shuffle_epi8(ctr, bswap64); + +/* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */ +#define SETUP_BITS() \ + do { \ + for (int i = 0; i < 5; ++i) { \ + ctr = _mm_add_epi64(ctr, one); \ + bits[i] = _mm_shuffle_epi8(ctr, bswap64); \ + } \ + if (PTLS_LIKELY(srclen > 16 * 5)) { \ + ctr = _mm_add_epi64(ctr, one); \ + bits[5] = _mm_shuffle_epi8(ctr, bswap64); \ + } else { \ + assert(!ek0_encrypted); \ + bits[5] = ek0; \ + ek0_encrypted = 1; \ + } \ + } while (0) + + /* build the first AES bits */ + SETUP_BITS(); + aesecb6(ctx, bits); + + /* the main loop */ + while (PTLS_LIKELY(srclen >= 6 * 16)) { + /* apply the bits */ + for (int i = 0; i < 6; ++i) + _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i])); + srclen -= 6 * 16; + + /* setup bits */ + SETUP_BITS(); + + /* setup gdata */ + const __m128i *gdata; + if (PTLS_UNLIKELY(aadlen != 0)) { + for (int i = 0; i < 6; ++i) { + if (aadlen < 16) { + if (aadlen != 0) { + gdatabuf[i++] = loadn(aad, aadlen); + aadlen = 0; + } + while (i < 6) + gdatabuf[i++] = *dst_ghash++; + break; + } + gdatabuf[i++] = _mm_loadu_si128(aad++); + aadlen -= 16; + } + gdata = gdatabuf; + } else { + gdata = dst_ghash; + dst_ghash += 6; + } + + /* doit */ + ghash = aesecb6ghash6(ctx, bits, gdata, ghash); + } + + /* apply the bit stream to the remainder */ + for (int i = 0; i < 6 && srclen != 0; ++i) { + if (srclen < 16) { + storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits[i])); + dst = (__m128i *)((uint8_t *)dst + srclen); + srclen = 0; + break; + } + _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i])); + srclen -= 16; + } + + if (ek0_encrypted) { + ek0 = bits[5]; + } else { + assert(!"FIXME calculate ek0"); + } + 
+ finish_gcm(ctx, dst, dst_ghash, aad, aadlen, ghash, ac, ek0); +} + static __m128i expand_key(__m128i key, __m128i t) { t = _mm_shuffle_epi32(t, _MM_SHUFFLE(3, 3, 3, 3)); diff --git a/t/fusion.c b/t/fusion.c index d249cf278..c50bdc185 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -28,8 +28,11 @@ static void dump(const void *_p, size_t len) { const uint8_t *p = _p; - for (size_t i = 0; i != len; ++i) + for (size_t i = 0; i != len; ++i) { + if (i % 16 == 0 && i != 0) + printf("-"); printf("%02x", p[i]); + } printf("\n"); } @@ -42,6 +45,13 @@ int main(int argc, char **argv) ptls_fusion_aesgcm_init(&ctx, userkey); + { + static const uint8_t iv[12] = {}; + uint8_t encrypted[sizeof(plaintext) + 16]; + ptls_fusion_aesgcm_encrypt(&ctx, iv, "hello", 5, encrypted, plaintext, sizeof(plaintext)); + dump(encrypted, sizeof(encrypted)); + } + if (1) { /* test */ __m128i ecb6[6], ctr = _mm_setzero_si128(); for (int i = 0; i < 6; ++i) { @@ -61,6 +71,14 @@ int main(int argc, char **argv) dump(ecb6 + 1, 16 * 5); dump(&tag, 16); + { + __m128i gx = {}, input[2] = {ecb6[1], _mm_shuffle_epi8(_mm_set_epi32(0, 8 * 16, 0, 0), BSWAP64)}; + gx = ghashn(&ctx, input, 2, gx); + gx = _mm_shuffle_epi8(gx, bswap8); + tag = _mm_xor_si128(gx, ecb6[0]); + dump(&tag, 16); + } + { __m128i gx = {}; gx = ghash6(&ctx, gdata, gx); @@ -68,8 +86,38 @@ int main(int argc, char **argv) tag = _mm_xor_si128(gx, ecb6[0]); dump(&tag, 16); } + { + __m128i gx = {}; + gx = ghashn(&ctx, gdata, 6, gx); + gx = _mm_shuffle_epi8(gx, bswap8); + tag = _mm_xor_si128(gx, ecb6[0]); + dump(&tag, 16); + } + + { + __m128i gx = {}; + gx = gfmul(ctx.ghash[0].H, _mm_shuffle_epi8(ecb6[0], bswap8)); + gx = gfmul(ctx.ghash[0].H, _mm_xor_si128(gx, _mm_shuffle_epi8(ecb6[1], bswap8))); + dump(&gx, 16); + } + { + __m128i gx = {}; + gx = ghashn(&ctx, ecb6, 2, gx); + dump(&gx, 16); + } } +#if 1 + { /* benchmark */ + static const uint8_t iv[12] = {}, aad[13] = {}, text[16384] = {}; + uint8_t encrypted[sizeof(text) + 16]; + for (int i = 0; i < 
1000000; ++i) { + ptls_fusion_aesgcm_encrypt(&ctx, iv, aad, sizeof(aad), encrypted, text, sizeof(text)); + if (i == 0) + dump(encrypted + sizeof(text), 16); + } + } +#else { /* benchmark */ __m128i test[300] = {}, ghash = {}; __m128i ctr = _mm_setzero_si128(); @@ -108,6 +156,7 @@ int main(int argc, char **argv) } dump(&ghash, sizeof(ghash)); } +#endif return 0; } From 58f04f47234f8a021a37d16961dc0019dc150f4a Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Wed, 6 May 2020 17:18:49 +0900 Subject: [PATCH 03/60] unaligned access --- lib/fusion.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 31d8339d2..24d8eef90 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -374,7 +374,7 @@ static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, c gdata_index = 0; if (aadlen != 0) { while (aadlen >= 16) { - gdata[gdata_index++] = *aad++; + gdata[gdata_index++] = _mm_loadu_si128(aad++); aadlen -= 16; if (gdata_index == 6) goto GHASH6; @@ -388,7 +388,7 @@ static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, c } if (enclen != 0) { while (enclen >= 16) { - gdata[gdata_index++] = *enc++; + gdata[gdata_index++] = _mm_loadu_si128(enc++); enclen -= 16; if (gdata_index == 6) goto GHASH6; @@ -481,7 +481,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i aadlen = 0; } while (i < 6) - gdatabuf[i++] = *dst_ghash++; + gdatabuf[i++] = _mm_loadu_si128(dst_ghash++); break; } gdatabuf[i++] = _mm_loadu_si128(aad++); From 50b356896ca9d7a21c21ffa862611efeddd2d1e2 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Wed, 6 May 2020 20:49:25 +0900 Subject: [PATCH 04/60] clang-format --- lib/fusion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 24d8eef90..07d50e1c8 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -362,8 +362,8 @@ static inline void storen(void *_p, size_t l, __m128i v) p[i] = buf[i]; } -static 
inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, const __m128i *dst_ghash, const __m128i *aad, size_t aadlen, - __m128i ghash, __m128i ac, __m128i ek0) +static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, const __m128i *dst_ghash, const __m128i *aad, + size_t aadlen, __m128i ghash, __m128i ac, __m128i ek0) { const __m128i *enc = dst_ghash; size_t enclen = (const uint8_t *)dst - (const uint8_t *)enc; From ac9f2d0859a97652f76f9051ac36256adfc145fd Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 05:40:52 +0900 Subject: [PATCH 05/60] remove dead code --- lib/fusion.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 07d50e1c8..bc93f1422 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -409,12 +409,7 @@ static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, c } /* final */ -#if 0 - for (int i = 0; i < gdata_index; ++i) - ghash = gfmul(_mm_xor_si128(ghash, _mm_shuffle_epi8(gdata[i], bswap8)), ctx->ghash[0].H); -#else ghash = ghashn(ctx, gdata, gdata_index, ghash); -#endif __m128i tag = _mm_shuffle_epi8(ghash, bswap8); tag = _mm_xor_si128(tag, ek0); _mm_storeu_si128(dst, tag); From 7936cddd953d14777d0b28b6097661620e8dabc6 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 06:36:47 +0900 Subject: [PATCH 06/60] constantify --- lib/fusion.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index bc93f1422..cfa6a71c7 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -48,6 +48,10 @@ static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc20000000000 #define poly (*(__m128i *)poly_) static const uint8_t bswap8_[16] __attribute__((aligned(16))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; #define bswap8 (*(__m128i *)bswap8_) +static const uint8_t bswap64_[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; +#define bswap64 
(*(__m128i *)bswap64_) +static const uint8_t one64_[16] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0, 1}; +#define one64 (*(__m128i *)one64_) // This function is covered by the Apache License and the MIT License. See Above. static __m128i transformH(__m128i H) @@ -400,7 +404,6 @@ static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, c goto GHASH6; } } - __m128i bswap64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); gdata[gdata_index++] = _mm_shuffle_epi8(ac, bswap64); break; @@ -418,7 +421,6 @@ static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, c void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, const void *_src, size_t srclen) { - __m128i bswap64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7), one = _mm_set_epi32(0, 1, 0, 0); __m128i ctr, ek0, bits[6], gdatabuf[6], ghash = _mm_setzero_si128(); int ek0_encrypted = 0; __m128i ac = _mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8); @@ -432,18 +434,18 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* build counter */ ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); ctr = _mm_shuffle_epi8(ctr, bswap8); - ctr = _mm_add_epi64(ctr, one); + ctr = _mm_add_epi64(ctr, one64); ek0 = _mm_shuffle_epi8(ctr, bswap64); /* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */ #define SETUP_BITS() \ do { \ for (int i = 0; i < 5; ++i) { \ - ctr = _mm_add_epi64(ctr, one); \ + ctr = _mm_add_epi64(ctr, one64); \ bits[i] = _mm_shuffle_epi8(ctr, bswap64); \ } \ if (PTLS_LIKELY(srclen > 16 * 5)) { \ - ctr = _mm_add_epi64(ctr, one); \ + ctr = _mm_add_epi64(ctr, one64); \ bits[5] = _mm_shuffle_epi8(ctr, bswap64); \ } else { \ assert(!ek0_encrypted); \ From 2842536ed7efda7d2338b9dd72912dd690e04a44 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 11:23:48 +0900 
Subject: [PATCH 07/60] do ~ 16384 bytes in the test code too --- t/fusion.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/t/fusion.c b/t/fusion.c index c50bdc185..21009808b 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -118,13 +118,13 @@ int main(int argc, char **argv) } } #else - { /* benchmark */ - __m128i test[300] = {}, ghash = {}; + { /* benchmark (using ~16384 bytes block) */ + __m128i test[171 * 6] = {}, ghash = {}; __m128i ctr = _mm_setzero_si128(); - for (int i = 0; i < 300; ++i) + for (int i = 0; i < 171 * 6; ++i) memcpy(test + i, plaintext, 16); - for (int i = 0; i < 5000000; ++i) { - for (int j = 0; j < 300;) { + for (int i = 0; i < 1000000; ++i) { + for (int j = 0; j < 171;) { __m128i bits[6]; ctr = _mm_add_epi64(ctr, ONE); bits[0] = _mm_shuffle_epi8(ctr, BSWAP64); @@ -138,7 +138,7 @@ int main(int argc, char **argv) bits[4] = _mm_shuffle_epi8(ctr, BSWAP64); ctr = _mm_add_epi64(ctr, ONE); bits[5] = _mm_shuffle_epi8(ctr, BSWAP64); - ghash = aesecb6ghash6(&ctx, bits, j == 0 ? test + 294 : test + j - 6, ghash); + ghash = aesecb6ghash6(&ctx, bits, j == 0 ? 
test + 171 * 6 - 6 : test + j - 6, ghash); // aesecb4(&ctx, bits); _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[0])); ++j; From 0a1dc47ff19865d0578b2f65a50f49322a28ae46 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 12:53:58 +0900 Subject: [PATCH 08/60] use loop to optimize for size --- lib/fusion.c | 280 ++++++++++++++++++--------------------------------- t/fusion.c | 7 -- 2 files changed, 97 insertions(+), 190 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index cfa6a71c7..5f5b527eb 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -115,193 +115,48 @@ static __m128i gfmul(__m128i x, __m128i y) return _mm_xor_si128(hi, lo); } -#define AESECB6(b1, b2, b3, b4, b5, b6, b7, b8, b9) \ - do { \ - __m128i aesk = ctx->keys[0]; \ - __m128i aes1 = _mm_xor_si128(data[0], aesk); \ - __m128i aes2 = _mm_xor_si128(data[1], aesk); \ - __m128i aes3 = _mm_xor_si128(data[2], aesk); \ - __m128i aes4 = _mm_xor_si128(data[3], aesk); \ - __m128i aes5 = _mm_xor_si128(data[4], aesk); \ - __m128i aes6 = _mm_xor_si128(data[5], aesk); \ - aesk = ctx->keys[1]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b1} aesk = ctx->keys[2]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b2} aesk = ctx->keys[3]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b3} aesk = ctx->keys[4]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = 
_mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b4} aesk = ctx->keys[5]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b5} aesk = ctx->keys[6]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b6} aesk = ctx->keys[7]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b7} aesk = ctx->keys[8]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b8} aesk = ctx->keys[9]; \ - aes1 = _mm_aesenc_si128(aes1, aesk); \ - aes2 = _mm_aesenc_si128(aes2, aesk); \ - aes3 = _mm_aesenc_si128(aes3, aesk); \ - aes4 = _mm_aesenc_si128(aes4, aesk); \ - aes5 = _mm_aesenc_si128(aes5, aesk); \ - aes6 = _mm_aesenc_si128(aes6, aesk); \ - {b9} aesk = ctx->keys[10]; \ - data[0] = _mm_aesenclast_si128(aes1, aesk); \ - data[1] = _mm_aesenclast_si128(aes2, aesk); \ - data[2] = _mm_aesenclast_si128(aes3, aesk); \ - data[3] = _mm_aesenclast_si128(aes4, aesk); \ - data[4] = _mm_aesenclast_si128(aes5, aesk); \ - data[5] = _mm_aesenclast_si128(aes6, aesk); \ +#define AESECB6_INIT() \ + __m128i aes0, aes1, aes2, aes3, aes4, aes5; \ + do { \ + __m128i k = 
ctx->keys[0]; \ + aes0 = _mm_xor_si128(data[0], k); \ + aes1 = _mm_xor_si128(data[1], k); \ + aes2 = _mm_xor_si128(data[2], k); \ + aes3 = _mm_xor_si128(data[3], k); \ + aes4 = _mm_xor_si128(data[4], k); \ + aes5 = _mm_xor_si128(data[5], k); \ } while (0) -static inline void aesecb6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data) -{ - AESECB6({}, {}, {}, {}, {}, {}, {}, {}, {}); -} +#define AESECB6_UPDATE(i) \ + do { \ + __m128i k = ctx->keys[i]; \ + aes0 = _mm_aesenc_si128(aes0, k); \ + aes1 = _mm_aesenc_si128(aes1, k); \ + aes2 = _mm_aesenc_si128(aes2, k); \ + aes3 = _mm_aesenc_si128(aes3, k); \ + aes4 = _mm_aesenc_si128(aes4, k); \ + aes5 = _mm_aesenc_si128(aes5, k); \ + } while (0) -#define GHASH6(FUNC) \ - do { \ - __m128i X, lo, hi, mid, r, t; \ - FUNC( \ - { \ - X = _mm_loadu_si128(gdata + 5); \ - X = _mm_shuffle_epi8(X, bswap8); \ - lo = _mm_clmulepi64_si128(ctx->ghash[0].H, X, 0x00); \ - hi = _mm_clmulepi64_si128(ctx->ghash[0].H, X, 0x11); \ - mid = _mm_shuffle_epi32(X, 78); \ - mid = _mm_xor_si128(mid, X); \ - mid = _mm_clmulepi64_si128(ctx->ghash[0].r, mid, 0x00); \ - }, \ - { \ - X = _mm_loadu_si128(gdata + 4); \ - X = _mm_shuffle_epi8(X, bswap8); \ - t = _mm_clmulepi64_si128(ctx->ghash[1].H, X, 0x00); \ - lo = _mm_xor_si128(lo, t); \ - t = _mm_clmulepi64_si128(ctx->ghash[1].H, X, 0x11); \ - hi = _mm_xor_si128(hi, t); \ - t = _mm_shuffle_epi32(X, 78); \ - t = _mm_xor_si128(t, X); \ - t = _mm_clmulepi64_si128(ctx->ghash[1].r, t, 0x00); \ - mid = _mm_xor_si128(mid, t); \ - }, \ - { \ - X = _mm_loadu_si128(gdata + 3); \ - X = _mm_shuffle_epi8(X, bswap8); \ - t = _mm_clmulepi64_si128(ctx->ghash[2].H, X, 0x00); \ - lo = _mm_xor_si128(lo, t); \ - t = _mm_clmulepi64_si128(ctx->ghash[2].H, X, 0x11); \ - hi = _mm_xor_si128(hi, t); \ - t = _mm_shuffle_epi32(X, 78); \ - t = _mm_xor_si128(t, X); \ - t = _mm_clmulepi64_si128(ctx->ghash[2].r, t, 0x00); \ - mid = _mm_xor_si128(mid, t); \ - }, \ - { \ - X = _mm_loadu_si128(gdata + 2); \ - X = _mm_shuffle_epi8(X, 
bswap8); \ - t = _mm_clmulepi64_si128(ctx->ghash[3].H, X, 0x00); \ - lo = _mm_xor_si128(lo, t); \ - t = _mm_clmulepi64_si128(ctx->ghash[3].H, X, 0x11); \ - hi = _mm_xor_si128(hi, t); \ - t = _mm_shuffle_epi32(X, 78); \ - t = _mm_xor_si128(t, X); \ - t = _mm_clmulepi64_si128(ctx->ghash[3].r, t, 0x00); \ - mid = _mm_xor_si128(mid, t); \ - }, \ - { \ - X = _mm_loadu_si128(gdata + 1); \ - X = _mm_shuffle_epi8(X, bswap8); \ - t = _mm_clmulepi64_si128(ctx->ghash[4].H, X, 0x00); \ - lo = _mm_xor_si128(lo, t); \ - t = _mm_clmulepi64_si128(ctx->ghash[4].H, X, 0x11); \ - hi = _mm_xor_si128(hi, t); \ - t = _mm_shuffle_epi32(X, 78); \ - t = _mm_xor_si128(t, X); \ - t = _mm_clmulepi64_si128(ctx->ghash[4].r, t, 0x00); \ - mid = _mm_xor_si128(mid, t); \ - }, \ - { \ - X = _mm_loadu_si128(gdata + 0); \ - X = _mm_shuffle_epi8(X, bswap8); \ - X = _mm_xor_si128(X, ghash); \ - t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x00); \ - lo = _mm_xor_si128(lo, t); \ - t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x11); \ - }, \ - { \ - hi = _mm_xor_si128(hi, t); \ - t = _mm_shuffle_epi32(X, 78); \ - t = _mm_xor_si128(t, X); \ - t = _mm_clmulepi64_si128(ctx->ghash[5].r, t, 0x00); \ - mid = _mm_xor_si128(mid, t); \ - }, \ - { \ - mid = _mm_xor_si128(mid, hi); \ - mid = _mm_xor_si128(mid, lo); \ - lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); \ - hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); \ - \ - /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ \ - r = _mm_clmulepi64_si128(lo, poly, 0x10); \ - }, \ - { \ - lo = _mm_shuffle_epi32(lo, 78); \ - lo = _mm_xor_si128(lo, r); \ - r = _mm_clmulepi64_si128(lo, poly, 0x10); \ - lo = _mm_shuffle_epi32(lo, 78); \ - lo = _mm_xor_si128(lo, r); \ - ghash = _mm_xor_si128(hi, lo); \ - }); \ - return ghash; \ +#define AESECB6_FINAL() \ + do { \ + __m128i k = ctx->keys[10]; \ + data[0] = _mm_aesenclast_si128(aes0, k); \ + data[1] = _mm_aesenclast_si128(aes1, k); \ + data[2] = _mm_aesenclast_si128(aes2, k); \ + data[3] = 
_mm_aesenclast_si128(aes3, k); \ + data[4] = _mm_aesenclast_si128(aes4, k); \ + data[5] = _mm_aesenclast_si128(aes5, k); \ } while (0) -static inline __m128i ghash6(ptls_fusion_aesgcm_context_t *ctx, const __m128i *gdata, __m128i ghash) +static inline void aesecb6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data) { -#define FUNC(b1, b2, b3, b4, b5, b6, b7, b8, b9) {b1} {b2} {b3} {b4} {b5} {b6} {b7} {b8} {b9} - GHASH6(FUNC); -#undef FUNC + AESECB6_INIT(); + + for (int i = 1; i < 10; ++i) + AESECB6_UPDATE(i); + + AESECB6_FINAL(); } static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, size_t cnt, __m128i ghash) @@ -343,7 +198,66 @@ static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, siz static inline __m128i aesecb6ghash6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data, const __m128i *gdata, __m128i ghash) { - GHASH6(AESECB6); + __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(), X, r, t; + + AESECB6_INIT(); + + for (size_t i = 0; i < 5; ++i) { + + AESECB6_UPDATE(i + 1); + + X = _mm_loadu_si128(gdata + 5 - i); + X = _mm_shuffle_epi8(X, bswap8); + t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x00); + lo = _mm_xor_si128(lo, t); + t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x11); + hi = _mm_xor_si128(hi, t); + t = _mm_shuffle_epi32(X, 78); + t = _mm_xor_si128(t, X); + t = _mm_clmulepi64_si128(ctx->ghash[i].r, t, 0x00); + mid = _mm_xor_si128(mid, t); + + } + + AESECB6_UPDATE(6); + + X = _mm_loadu_si128(gdata + 0); + X = _mm_shuffle_epi8(X, bswap8); + X = _mm_xor_si128(X, ghash); + t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x00); + lo = _mm_xor_si128(lo, t); + t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x11); + + AESECB6_UPDATE(7); + + hi = _mm_xor_si128(hi, t); + t = _mm_shuffle_epi32(X, 78); + t = _mm_xor_si128(t, X); + t = _mm_clmulepi64_si128(ctx->ghash[5].r, t, 0x00); + mid = _mm_xor_si128(mid, t); + + AESECB6_UPDATE(8); + + mid = _mm_xor_si128(mid, hi); + mid = 
_mm_xor_si128(mid, lo); + lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); + hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); + + /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ + r = _mm_clmulepi64_si128(lo, poly, 0x10); + + AESECB6_UPDATE(9); + + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, r); + r = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, r); + ghash = _mm_xor_si128(hi, lo); + + AESECB6_FINAL(); + + return ghash; } static inline __m128i loadn(const void *_p, size_t l) @@ -408,7 +322,7 @@ static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, c break; GHASH6: - ghash = ghash6(ctx, gdata, ghash); + ghash = ghashn(ctx, gdata, 6, ghash); } /* final */ diff --git a/t/fusion.c b/t/fusion.c index 21009808b..0c0387ae2 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -79,13 +79,6 @@ int main(int argc, char **argv) dump(&tag, 16); } - { - __m128i gx = {}; - gx = ghash6(&ctx, gdata, gx); - gx = _mm_shuffle_epi8(gx, bswap8); - tag = _mm_xor_si128(gx, ecb6[0]); - dump(&tag, 16); - } { __m128i gx = {}; gx = ghashn(&ctx, gdata, 6, gx); From 083f5315cf466af9644e3426c1ea8fa84e242584 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 13:05:10 +0900 Subject: [PATCH 09/60] unroll hot loops --- lib/fusion.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 5f5b527eb..39a64c805 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -353,11 +353,17 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */ #define SETUP_BITS() \ - do { \ - for (int i = 0; i < 5; ++i) { \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[i] = _mm_shuffle_epi8(ctr, bswap64); \ - } \ + do { \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[0] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = 
_mm_add_epi64(ctr, one64); \ + bits[1] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[2] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[3] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[4] = _mm_shuffle_epi8(ctr, bswap64); \ if (PTLS_LIKELY(srclen > 16 * 5)) { \ ctr = _mm_add_epi64(ctr, one64); \ bits[5] = _mm_shuffle_epi8(ctr, bswap64); \ @@ -375,8 +381,16 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* the main loop */ while (PTLS_LIKELY(srclen >= 6 * 16)) { /* apply the bits */ - for (int i = 0; i < 6; ++i) - _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i])); +#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits[i])) + APPLY(0); + APPLY(1); + APPLY(2); + APPLY(3); + APPLY(4); + APPLY(5); +#undef APPLY + dst += 6; + src += 6; srclen -= 6 * 16; /* setup bits */ From 2ef1c0f3b84298dd3c54542e419254f372efe638 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 13:05:46 +0900 Subject: [PATCH 10/60] clang-format --- lib/fusion.c | 79 ++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 39a64c805..c26b20c66 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -115,38 +115,38 @@ static __m128i gfmul(__m128i x, __m128i y) return _mm_xor_si128(hi, lo); } -#define AESECB6_INIT() \ - __m128i aes0, aes1, aes2, aes3, aes4, aes5; \ - do { \ - __m128i k = ctx->keys[0]; \ - aes0 = _mm_xor_si128(data[0], k); \ - aes1 = _mm_xor_si128(data[1], k); \ - aes2 = _mm_xor_si128(data[2], k); \ - aes3 = _mm_xor_si128(data[3], k); \ - aes4 = _mm_xor_si128(data[4], k); \ - aes5 = _mm_xor_si128(data[5], k); \ +#define AESECB6_INIT() \ + __m128i aes0, aes1, aes2, aes3, aes4, aes5; \ + do { \ + __m128i k = ctx->keys[0]; \ + aes0 = _mm_xor_si128(data[0], k); \ + aes1 = _mm_xor_si128(data[1], k); \ + 
aes2 = _mm_xor_si128(data[2], k); \ + aes3 = _mm_xor_si128(data[3], k); \ + aes4 = _mm_xor_si128(data[4], k); \ + aes5 = _mm_xor_si128(data[5], k); \ } while (0) -#define AESECB6_UPDATE(i) \ - do { \ - __m128i k = ctx->keys[i]; \ - aes0 = _mm_aesenc_si128(aes0, k); \ - aes1 = _mm_aesenc_si128(aes1, k); \ - aes2 = _mm_aesenc_si128(aes2, k); \ - aes3 = _mm_aesenc_si128(aes3, k); \ - aes4 = _mm_aesenc_si128(aes4, k); \ - aes5 = _mm_aesenc_si128(aes5, k); \ +#define AESECB6_UPDATE(i) \ + do { \ + __m128i k = ctx->keys[i]; \ + aes0 = _mm_aesenc_si128(aes0, k); \ + aes1 = _mm_aesenc_si128(aes1, k); \ + aes2 = _mm_aesenc_si128(aes2, k); \ + aes3 = _mm_aesenc_si128(aes3, k); \ + aes4 = _mm_aesenc_si128(aes4, k); \ + aes5 = _mm_aesenc_si128(aes5, k); \ } while (0) -#define AESECB6_FINAL() \ - do { \ - __m128i k = ctx->keys[10]; \ - data[0] = _mm_aesenclast_si128(aes0, k); \ - data[1] = _mm_aesenclast_si128(aes1, k); \ - data[2] = _mm_aesenclast_si128(aes2, k); \ - data[3] = _mm_aesenclast_si128(aes3, k); \ - data[4] = _mm_aesenclast_si128(aes4, k); \ - data[5] = _mm_aesenclast_si128(aes5, k); \ +#define AESECB6_FINAL() \ + do { \ + __m128i k = ctx->keys[10]; \ + data[0] = _mm_aesenclast_si128(aes0, k); \ + data[1] = _mm_aesenclast_si128(aes1, k); \ + data[2] = _mm_aesenclast_si128(aes2, k); \ + data[3] = _mm_aesenclast_si128(aes3, k); \ + data[4] = _mm_aesenclast_si128(aes4, k); \ + data[5] = _mm_aesenclast_si128(aes5, k); \ } while (0) static inline void aesecb6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data) @@ -216,7 +216,6 @@ static inline __m128i aesecb6ghash6(ptls_fusion_aesgcm_context_t *ctx, __m128i * t = _mm_xor_si128(t, X); t = _mm_clmulepi64_si128(ctx->ghash[i].r, t, 0x00); mid = _mm_xor_si128(mid, t); - } AESECB6_UPDATE(6); @@ -353,17 +352,17 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */ #define SETUP_BITS() \ - do 
{ \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[0] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[1] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[2] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[3] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[4] = _mm_shuffle_epi8(ctr, bswap64); \ + do { \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[0] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[1] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[2] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[3] = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits[4] = _mm_shuffle_epi8(ctr, bswap64); \ if (PTLS_LIKELY(srclen > 16 * 5)) { \ ctr = _mm_add_epi64(ctr, one64); \ bits[5] = _mm_shuffle_epi8(ctr, bswap64); \ From 274a572cdc4804be8e820c48f443bfc259c8132a Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 7 May 2020 22:56:07 +0900 Subject: [PATCH 11/60] precompute the entire ghash table --- include/picotls/fusion.h | 4 +- lib/fusion.c | 419 ++++++++++++++++++--------------------- t/fusion.c | 51 +---- 3 files changed, 196 insertions(+), 278 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 6df03ccc7..b3ad1ad70 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -34,10 +34,10 @@ extern "C" { typedef struct ptls_fusion_aesgcm_context_t { __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; - struct { + struct ptls_fusion_aesgcm_ghash_precompute { __m128i H; __m128i r; - } ghash[6]; + } ghash[1050]; } ptls_fusion_aesgcm_context_t; void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *key); diff --git a/lib/fusion.c b/lib/fusion.c index c26b20c66..81f8ff14e 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -115,50 +115,6 @@ static __m128i gfmul(__m128i x, __m128i y) 
return _mm_xor_si128(hi, lo); } -#define AESECB6_INIT() \ - __m128i aes0, aes1, aes2, aes3, aes4, aes5; \ - do { \ - __m128i k = ctx->keys[0]; \ - aes0 = _mm_xor_si128(data[0], k); \ - aes1 = _mm_xor_si128(data[1], k); \ - aes2 = _mm_xor_si128(data[2], k); \ - aes3 = _mm_xor_si128(data[3], k); \ - aes4 = _mm_xor_si128(data[4], k); \ - aes5 = _mm_xor_si128(data[5], k); \ - } while (0) - -#define AESECB6_UPDATE(i) \ - do { \ - __m128i k = ctx->keys[i]; \ - aes0 = _mm_aesenc_si128(aes0, k); \ - aes1 = _mm_aesenc_si128(aes1, k); \ - aes2 = _mm_aesenc_si128(aes2, k); \ - aes3 = _mm_aesenc_si128(aes3, k); \ - aes4 = _mm_aesenc_si128(aes4, k); \ - aes5 = _mm_aesenc_si128(aes5, k); \ - } while (0) - -#define AESECB6_FINAL() \ - do { \ - __m128i k = ctx->keys[10]; \ - data[0] = _mm_aesenclast_si128(aes0, k); \ - data[1] = _mm_aesenclast_si128(aes1, k); \ - data[2] = _mm_aesenclast_si128(aes2, k); \ - data[3] = _mm_aesenclast_si128(aes3, k); \ - data[4] = _mm_aesenclast_si128(aes4, k); \ - data[5] = _mm_aesenclast_si128(aes5, k); \ - } while (0) - -static inline void aesecb6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data) -{ - AESECB6_INIT(); - - for (int i = 1; i < 10; ++i) - AESECB6_UPDATE(i); - - AESECB6_FINAL(); -} - static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, size_t cnt, __m128i ghash) { __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(); @@ -196,69 +152,6 @@ static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, siz return ghash; } -static inline __m128i aesecb6ghash6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data, const __m128i *gdata, __m128i ghash) -{ - __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(), X, r, t; - - AESECB6_INIT(); - - for (size_t i = 0; i < 5; ++i) { - - AESECB6_UPDATE(i + 1); - - X = _mm_loadu_si128(gdata + 5 - i); - X = _mm_shuffle_epi8(X, bswap8); - t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x00); - lo = 
_mm_xor_si128(lo, t); - t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x11); - hi = _mm_xor_si128(hi, t); - t = _mm_shuffle_epi32(X, 78); - t = _mm_xor_si128(t, X); - t = _mm_clmulepi64_si128(ctx->ghash[i].r, t, 0x00); - mid = _mm_xor_si128(mid, t); - } - - AESECB6_UPDATE(6); - - X = _mm_loadu_si128(gdata + 0); - X = _mm_shuffle_epi8(X, bswap8); - X = _mm_xor_si128(X, ghash); - t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x00); - lo = _mm_xor_si128(lo, t); - t = _mm_clmulepi64_si128(ctx->ghash[5].H, X, 0x11); - - AESECB6_UPDATE(7); - - hi = _mm_xor_si128(hi, t); - t = _mm_shuffle_epi32(X, 78); - t = _mm_xor_si128(t, X); - t = _mm_clmulepi64_si128(ctx->ghash[5].r, t, 0x00); - mid = _mm_xor_si128(mid, t); - - AESECB6_UPDATE(8); - - mid = _mm_xor_si128(mid, hi); - mid = _mm_xor_si128(mid, lo); - lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); - hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); - - /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ - r = _mm_clmulepi64_si128(lo, poly, 0x10); - - AESECB6_UPDATE(9); - - lo = _mm_shuffle_epi32(lo, 78); - lo = _mm_xor_si128(lo, r); - r = _mm_clmulepi64_si128(lo, poly, 0x10); - lo = _mm_shuffle_epi32(lo, 78); - lo = _mm_xor_si128(lo, r); - ghash = _mm_xor_si128(hi, lo); - - AESECB6_FINAL(); - - return ghash; -} - static inline __m128i loadn(const void *_p, size_t l) { const uint8_t *p = _p; @@ -279,70 +172,80 @@ static inline void storen(void *_p, size_t l, __m128i v) p[i] = buf[i]; } -static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, const __m128i *dst_ghash, const __m128i *aad, - size_t aadlen, __m128i ghash, __m128i ac, __m128i ek0) +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, + const void *_src, size_t srclen) { - const __m128i *enc = dst_ghash; - size_t enclen = (const uint8_t *)dst - (const uint8_t *)enc; - __m128i gdata[6]; - int gdata_index; - - while (1) { - gdata_index = 0; - if 
(aadlen != 0) { - while (aadlen >= 16) { - gdata[gdata_index++] = _mm_loadu_si128(aad++); - aadlen -= 16; - if (gdata_index == 6) - goto GHASH6; - } - if (aadlen != 0) { - gdata[gdata_index++] = loadn(aad, aadlen); - aadlen = 0; - if (gdata_index == 6) - goto GHASH6; - } - } - if (enclen != 0) { - while (enclen >= 16) { - gdata[gdata_index++] = _mm_loadu_si128(enc++); - enclen -= 16; - if (gdata_index == 6) - goto GHASH6; - } - if (enclen != 0) { - gdata[gdata_index++] = loadn(enc, enclen); - enclen = 0; - if (gdata_index == 6) - goto GHASH6; - } - } - gdata[gdata_index++] = _mm_shuffle_epi8(ac, bswap64); - break; +/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ +#define AESECB6_INIT() \ + do { \ + ctr = _mm_add_epi64(ctr, one64); \ + bits0 = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits1 = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits2 = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits3 = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one64); \ + bits4 = _mm_shuffle_epi8(ctr, bswap64); \ + if (PTLS_LIKELY(srclen > 16 * 5)) { \ + ctr = _mm_add_epi64(ctr, one64); \ + bits5 = _mm_shuffle_epi8(ctr, bswap64); \ + } else if ((state & STATE_EK0_BEEN_FED) == 0) { \ + bits5 = ek0; \ + state |= STATE_EK0_BEEN_FED; \ + } \ + __m128i k = ctx->keys[0]; \ + bits0 = _mm_xor_si128(bits0, k); \ + bits1 = _mm_xor_si128(bits1, k); \ + bits2 = _mm_xor_si128(bits2, k); \ + bits3 = _mm_xor_si128(bits3, k); \ + bits4 = _mm_xor_si128(bits4, k); \ + bits5 = _mm_xor_si128(bits5, k); \ + } while (0) - GHASH6: - ghash = ghashn(ctx, gdata, 6, ghash); - } +/* aes block update */ +#define AESECB6_UPDATE(i) \ + do { \ + __m128i k = ctx->keys[i]; \ + bits0 = _mm_aesenc_si128(bits0, k); \ + bits1 = _mm_aesenc_si128(bits1, k); \ + bits2 = _mm_aesenc_si128(bits2, k); \ + bits3 = _mm_aesenc_si128(bits3, k); \ + bits4 = 
_mm_aesenc_si128(bits4, k); \ + bits5 = _mm_aesenc_si128(bits5, k); \ + } while (0) - /* final */ - ghash = ghashn(ctx, gdata, gdata_index, ghash); - __m128i tag = _mm_shuffle_epi8(ghash, bswap8); - tag = _mm_xor_si128(tag, ek0); - _mm_storeu_si128(dst, tag); -} +/* aesenclast */ +#define AESECB6_FINAL() \ + do { \ + __m128i k = ctx->keys[10]; \ + bits0 = _mm_aesenclast_si128(bits0, k); \ + bits1 = _mm_aesenclast_si128(bits1, k); \ + bits2 = _mm_aesenclast_si128(bits2, k); \ + bits3 = _mm_aesenclast_si128(bits3, k); \ + bits4 = _mm_aesenclast_si128(bits4, k); \ + bits5 = _mm_aesenclast_si128(bits5, k); \ + } while (0) -void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, - const void *_src, size_t srclen) -{ - __m128i ctr, ek0, bits[6], gdatabuf[6], ghash = _mm_setzero_si128(); - int ek0_encrypted = 0; - __m128i ac = _mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8); + __m128i ctr, ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); + __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(), gdatabuf[6]; + __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8), bswap64); // src and dst are updated after the chunk is processed const __m128i *src = _src; __m128i *dst = _dst; // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed indo the processor) const __m128i *aad = _aad, *dst_ghash = dst; + size_t dst_ghashlen = srclen; + + struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1; + + int32_t state = 0; +#define STATE_FINAL 0x80000000 /* negates the state when set */ +#define STATE_EK0_BEEN_FED 0x3 +#define STATE_EK0_INCOMPLETE 0x2 +#define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1) /* build counter */ ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); @@ -350,96 +253,160 @@ void 
ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i ctr = _mm_add_epi64(ctr, one64); ek0 = _mm_shuffle_epi8(ctr, bswap64); -/* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */ -#define SETUP_BITS() \ + /* prepare the first bit stream */ + AESECB6_INIT(); + for (size_t i = 1; i < 10; ++i) + AESECB6_UPDATE(i); + AESECB6_FINAL(); + + /* the main loop */ + do { + /* apply the bit stream to src and write to dest */ + if (PTLS_LIKELY(srclen >= 6 * 16)) { +#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i)) + APPLY(0); + APPLY(1); + APPLY(2); + APPLY(3); + APPLY(4); + APPLY(5); +#undef APPLY + dst += 6; + src += 6; + srclen -= 6 * 16; + } else { + if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) { + ek0 = bits5; + state &= ~STATE_EK0_INCOMPLETE; + } + if (srclen != 0) { +#define APPLY(i) \ do { \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[0] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[1] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[2] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[3] = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[4] = _mm_shuffle_epi8(ctr, bswap64); \ - if (PTLS_LIKELY(srclen > 16 * 5)) { \ - ctr = _mm_add_epi64(ctr, one64); \ - bits[5] = _mm_shuffle_epi8(ctr, bswap64); \ + if (srclen >= 16) { \ + _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \ + srclen -= 16; \ } else { \ - assert(!ek0_encrypted); \ - bits[5] = ek0; \ - ek0_encrypted = 1; \ + if (srclen != 0) { \ + storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits##i)); \ + dst = (__m128i *)((uint8_t *)dst + srclen); \ + srclen = 0; \ + } \ + goto ApplyEnd; \ } \ } while (0) - - /* build the first AES bits */ - SETUP_BITS(); - aesecb6(ctx, bits); - - /* the main loop */ - while (PTLS_LIKELY(srclen >= 6 * 
16)) { - /* apply the bits */ -#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits[i])) - APPLY(0); - APPLY(1); - APPLY(2); - APPLY(3); - APPLY(4); - APPLY(5); + APPLY(0); + APPLY(1); + APPLY(2); + APPLY(3); + APPLY(4); + APPLY(5); + ApplyEnd:; #undef APPLY - dst += 6; - src += 6; - srclen -= 6 * 16; + } + } + + /* next block AES starts here */ + AESECB6_INIT(); - /* setup bits */ - SETUP_BITS(); + AESECB6_UPDATE(1); /* setup gdata */ const __m128i *gdata; + size_t gdata_cnt; if (PTLS_UNLIKELY(aadlen != 0)) { - for (int i = 0; i < 6; ++i) { + gdata_cnt = 0; + while (gdata_cnt < 6) { if (aadlen < 16) { if (aadlen != 0) { - gdatabuf[i++] = loadn(aad, aadlen); + gdatabuf[gdata_cnt++] = loadn(aad, aadlen); aadlen = 0; } - while (i < 6) - gdatabuf[i++] = _mm_loadu_si128(dst_ghash++); - break; + goto GdataFillDST; } - gdatabuf[i++] = _mm_loadu_si128(aad++); + gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); aadlen -= 16; } gdata = gdatabuf; - } else { + } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) { gdata = dst_ghash; + gdata_cnt = 6; dst_ghash += 6; + dst_ghashlen -= 96; + } else { + gdata_cnt = 0; + GdataFillDST: + while (gdata_cnt < 6) { + if (dst_ghashlen < 16) { + if (dst_ghashlen != 0) { + gdatabuf[gdata_cnt++] = loadn(dst_ghash, dst_ghashlen); + dst_ghashlen = 0; + } + if (gdata_cnt < 6) { + gdatabuf[gdata_cnt++] = ac; + state |= STATE_FINAL; + } + break; + } + gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++); + dst_ghashlen -= 16; + } + gdata = gdatabuf; } - /* doit */ - ghash = aesecb6ghash6(ctx, bits, gdata, ghash); - } + /* run AES and multiplification in parallel */ + size_t index = 0; + for (; index < gdata_cnt; ++index) { + + AESECB6_UPDATE(index + 2); + + --ghash_precompute; + __m128i X = _mm_loadu_si128(gdata + index); + X = _mm_shuffle_epi8(X, bswap8); + __m128i t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x00); + lo = _mm_xor_si128(lo, t); + t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x11); + hi = 
_mm_xor_si128(hi, t); + t = _mm_shuffle_epi32(X, 78); + t = _mm_xor_si128(t, X); + t = _mm_clmulepi64_si128(ghash_precompute->r, t, 0x00); + mid = _mm_xor_si128(mid, t); + } - /* apply the bit stream to the remainder */ - for (int i = 0; i < 6 && srclen != 0; ++i) { - if (srclen < 16) { - storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits[i])); - dst = (__m128i *)((uint8_t *)dst + srclen); - srclen = 0; - break; + AESECB6_UPDATE(index + 2); + + for (; index + 3 <= 9; ++index) + AESECB6_UPDATE(index + 3); + + /* finish bit stream generation */ + AESECB6_FINAL(); + + } while (state >= 0); + + if (!STATE_EK0_READY()) { + if ((state & STATE_EK0_INCOMPLETE) != 0) { + ek0 = bits5; + } else { + assert(!"FIXME either finish the AES loop to get the "); } - _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i])); - srclen -= 16; } - if (ek0_encrypted) { - ek0 = bits[5]; - } else { - assert(!"FIXME calculate ek0"); - } + /* finish multiplification */ + mid = _mm_xor_si128(mid, hi); + mid = _mm_xor_si128(mid, lo); + lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); + hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); - finish_gcm(ctx, dst, dst_ghash, aad, aadlen, ghash, ac, ek0); + /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ + __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, r); + r = _mm_clmulepi64_si128(lo, poly, 0x10); + lo = _mm_shuffle_epi32(lo, 78); + lo = _mm_xor_si128(lo, r); + __m128i tag = _mm_xor_si128(hi, lo); + tag = _mm_shuffle_epi8(tag, bswap8); + tag = _mm_xor_si128(tag, ek0); + _mm_storeu_si128(dst, tag); } static __m128i expand_key(__m128i key, __m128i t) @@ -481,9 +448,9 @@ void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *_use ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8); ctx->ghash[0].H = transformH(ctx->ghash[0].H); - for (int i = 1; i < 6; ++i) + for (int i = 1; i < 
PTLS_ELEMENTSOF(ctx->ghash); ++i) ctx->ghash[i].H = gfmul(ctx->ghash[i - 1].H, ctx->ghash[0].H); - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < PTLS_ELEMENTSOF(ctx->ghash); ++i) { __m128i r = _mm_shuffle_epi32(ctx->ghash[i].H, 78); r = _mm_xor_si128(r, ctx->ghash[i].H); ctx->ghash[i].r = r; diff --git a/t/fusion.c b/t/fusion.c index 0c0387ae2..fed8c8ef5 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -40,7 +40,6 @@ int main(int argc, char **argv) { static const uint8_t userkey[16] = {}; static const uint8_t plaintext[16] = {}; - __m128i ONE = _mm_set_epi32(0, 0, 0, 1), BSWAP64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); ptls_fusion_aesgcm_context_t ctx; ptls_fusion_aesgcm_init(&ctx, userkey); @@ -52,54 +51,6 @@ int main(int argc, char **argv) dump(encrypted, sizeof(encrypted)); } - if (1) { /* test */ - __m128i ecb6[6], ctr = _mm_setzero_si128(); - for (int i = 0; i < 6; ++i) { - ctr = _mm_add_epi64(ctr, ONE); - ecb6[i] = _mm_shuffle_epi8(ctr, bswap8); - } - aesecb6(&ctx, ecb6); - __m128i gdata[6]; - for (int i = 0; i < 5; ++i) - gdata[i] = ecb6[i + 1]; - gdata[5] = _mm_shuffle_epi8(_mm_set_epi32(0, 8 * 16 * 5, 0, 0), BSWAP64); - __m128i dummy[6] = {}, ghash = {}; - ghash = aesecb6ghash6(&ctx, dummy, gdata, ghash); - ghash = _mm_shuffle_epi8(ghash, bswap8); - __m128i tag = _mm_xor_si128(ghash, ecb6[0]); - dump(ecb6 + 1, 16); - dump(ecb6 + 1, 16 * 5); - dump(&tag, 16); - - { - __m128i gx = {}, input[2] = {ecb6[1], _mm_shuffle_epi8(_mm_set_epi32(0, 8 * 16, 0, 0), BSWAP64)}; - gx = ghashn(&ctx, input, 2, gx); - gx = _mm_shuffle_epi8(gx, bswap8); - tag = _mm_xor_si128(gx, ecb6[0]); - dump(&tag, 16); - } - - { - __m128i gx = {}; - gx = ghashn(&ctx, gdata, 6, gx); - gx = _mm_shuffle_epi8(gx, bswap8); - tag = _mm_xor_si128(gx, ecb6[0]); - dump(&tag, 16); - } - - { - __m128i gx = {}; - gx = gfmul(ctx.ghash[0].H, _mm_shuffle_epi8(ecb6[0], bswap8)); - gx = gfmul(ctx.ghash[0].H, _mm_xor_si128(gx, _mm_shuffle_epi8(ecb6[1], bswap8))); - dump(&gx, 
16); - } - { - __m128i gx = {}; - gx = ghashn(&ctx, ecb6, 2, gx); - dump(&gx, 16); - } - } - #if 1 { /* benchmark */ static const uint8_t iv[12] = {}, aad[13] = {}, text[16384] = {}; @@ -107,7 +58,7 @@ int main(int argc, char **argv) for (int i = 0; i < 1000000; ++i) { ptls_fusion_aesgcm_encrypt(&ctx, iv, aad, sizeof(aad), encrypted, text, sizeof(text)); if (i == 0) - dump(encrypted + sizeof(text), 16); + dump(encrypted, sizeof(encrypted)); } } #else From 9a1143c8bdd981476fcf7b7830ecfeee84e4df39 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 00:33:31 +0900 Subject: [PATCH 12/60] remove unused function --- lib/fusion.c | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 81f8ff14e..022e1f4ce 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -115,43 +115,6 @@ static __m128i gfmul(__m128i x, __m128i y) return _mm_xor_si128(hi, lo); } -static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, size_t cnt, __m128i ghash) -{ - __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(); - assert(cnt <= 6); - - for (size_t i = 0; i < cnt; ++i) { - __m128i X = _mm_loadu_si128(src + cnt - 1 - i); - X = _mm_shuffle_epi8(X, bswap8); - if (i == cnt - 1) - X = _mm_xor_si128(X, ghash); - __m128i t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x00); - lo = _mm_xor_si128(lo, t); - t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x11); - hi = _mm_xor_si128(hi, t); - t = _mm_shuffle_epi32(X, 78); - t = _mm_xor_si128(t, X); - t = _mm_clmulepi64_si128(ctx->ghash[i].r, t, 0x00); - mid = _mm_xor_si128(mid, t); - } - - mid = _mm_xor_si128(mid, hi); - mid = _mm_xor_si128(mid, lo); - lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); - hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); - - /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ - __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10); - lo = _mm_shuffle_epi32(lo, 78); - lo = _mm_xor_si128(lo, r); 
- r = _mm_clmulepi64_si128(lo, poly, 0x10); - lo = _mm_shuffle_epi32(lo, 78); - lo = _mm_xor_si128(lo, r); - ghash = _mm_xor_si128(hi, lo); - - return ghash; -} - static inline __m128i loadn(const void *_p, size_t l) { const uint8_t *p = _p; From f198c1b64ac1b65ce64cddfe495a53fdb1567c84 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 00:45:29 +0900 Subject: [PATCH 13/60] let the user specify the maximum size --- include/picotls/fusion.h | 14 +++------ lib/fusion.c | 64 ++++++++++++++++++++++++++-------------- t/fusion.c | 49 ++---------------------------- 3 files changed, 49 insertions(+), 78 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index b3ad1ad70..15f53404e 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -32,16 +32,10 @@ extern "C" { #define PTLS_FUSION_AESGCM_ROUNDS 10 /* TODO support AES256 */ -typedef struct ptls_fusion_aesgcm_context_t { - __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; - struct ptls_fusion_aesgcm_ghash_precompute { - __m128i H; - __m128i r; - } ghash[1050]; -} ptls_fusion_aesgcm_context_t; - -void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *key); -void ptls_fusion_aesgcm_dispose(ptls_fusion_aesgcm_context_t *ctx); +typedef struct ptls_fusion_aesgcm_context ptls_fusion_aesgcm_context_t; + +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size); +void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, const void *_src, size_t srclen); diff --git a/lib/fusion.c b/lib/fusion.c index 022e1f4ce..09a4e253c 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -38,12 +38,22 @@ * IN THE SOFTWARE. 
*/ #include +#include #include #include #include #include "picotls.h" #include "picotls/fusion.h" +struct ptls_fusion_aesgcm_context { + __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; + size_t ghash_cnt; + struct ptls_fusion_aesgcm_ghash_precompute { + __m128i H; + __m128i r; + } ghash[0]; +}; + static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000}; #define poly (*(__m128i *)poly_) static const uint8_t bswap8_[16] __attribute__((aligned(16))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; @@ -381,60 +391,70 @@ static __m128i expand_key(__m128i key, __m128i t) return _mm_xor_si128(key, t); } -void ptls_fusion_aesgcm_init(ptls_fusion_aesgcm_context_t *ctx, const void *_userkey) +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *userkey, size_t max_size) { - __m128i userkey = _mm_loadu_si128((__m128i *)_userkey); - size_t i = 0; + ptls_fusion_aesgcm_context_t *ctx; + size_t ghash_cnt = (max_size + 15) / 16 + 1; // round-up by block size, plus context to hash AC + + if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) + return NULL; - ctx->keys[i++] = userkey; + { + size_t i = 0; + ctx->keys[i++] = _mm_loadu_si128((__m128i *)userkey); #define EXPAND(R) \ do { \ ctx->keys[i] = expand_key(ctx->keys[i - 1], _mm_aeskeygenassist_si128(ctx->keys[i - 1], R)); \ ++i; \ } while (0) - EXPAND(0x1); - EXPAND(0x2); - EXPAND(0x4); - EXPAND(0x8); - EXPAND(0x10); - EXPAND(0x20); - EXPAND(0x40); - EXPAND(0x80); - EXPAND(0x1b); - EXPAND(0x36); + EXPAND(0x1); + EXPAND(0x2); + EXPAND(0x4); + EXPAND(0x8); + EXPAND(0x10); + EXPAND(0x20); + EXPAND(0x40); + EXPAND(0x80); + EXPAND(0x1b); + EXPAND(0x36); #undef EXPAND + } + ctx->ghash_cnt = ghash_cnt; ctx->ghash[0].H = ctx->keys[0]; - for (i = 1; i < PTLS_FUSION_AESGCM_ROUNDS; ++i) + for (size_t i = 1; i < PTLS_FUSION_AESGCM_ROUNDS; ++i) ctx->ghash[0].H = _mm_aesenc_si128(ctx->ghash[0].H, ctx->keys[i]); ctx->ghash[0].H = _mm_aesenclast_si128(ctx->ghash[0].H, 
ctx->keys[PTLS_FUSION_AESGCM_ROUNDS]); ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8); ctx->ghash[0].H = transformH(ctx->ghash[0].H); - for (int i = 1; i < PTLS_ELEMENTSOF(ctx->ghash); ++i) + for (int i = 1; i < ghash_cnt; ++i) ctx->ghash[i].H = gfmul(ctx->ghash[i - 1].H, ctx->ghash[0].H); - for (int i = 0; i < PTLS_ELEMENTSOF(ctx->ghash); ++i) { + for (int i = 0; i < ghash_cnt; ++i) { __m128i r = _mm_shuffle_epi32(ctx->ghash[i].H, 78); r = _mm_xor_si128(r, ctx->ghash[i].H); ctx->ghash[i].r = r; } + + return ctx; } -void ptls_fusion_aesgcm_dispose(ptls_fusion_aesgcm_context_t *ctx) +void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx) { - ptls_clear_memory(ctx, sizeof(*ctx)); + ptls_clear_memory(ctx, sizeof(*ctx) + sizeof(ctx->ghash[0]) * ctx->ghash_cnt); + free(ctx); } struct aesgcm_context { ptls_aead_context_t super; - ptls_fusion_aesgcm_context_t aesgcm; + ptls_fusion_aesgcm_context_t *aesgcm; }; static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; - ptls_fusion_aesgcm_dispose(&ctx->aesgcm); + ptls_fusion_aesgcm_destroy(ctx->aesgcm); } static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, const void *iv, const void *aad, size_t aadlen) @@ -479,7 +499,7 @@ static int aes128gcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const v } assert(is_enc); - ptls_fusion_aesgcm_init(&ctx->aesgcm, key); + ctx->aesgcm = ptls_fusion_aesgcm_create(key, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ return 0; } diff --git a/t/fusion.c b/t/fusion.c index fed8c8ef5..e0a47167f 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -40,67 +40,24 @@ int main(int argc, char **argv) { static const uint8_t userkey[16] = {}; static const uint8_t plaintext[16] = {}; - ptls_fusion_aesgcm_context_t ctx; - - ptls_fusion_aesgcm_init(&ctx, userkey); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(userkey, 16384); { static const uint8_t iv[12] = 
{}; uint8_t encrypted[sizeof(plaintext) + 16]; - ptls_fusion_aesgcm_encrypt(&ctx, iv, "hello", 5, encrypted, plaintext, sizeof(plaintext)); + ptls_fusion_aesgcm_encrypt(ctx, iv, "hello", 5, encrypted, plaintext, sizeof(plaintext)); dump(encrypted, sizeof(encrypted)); } -#if 1 { /* benchmark */ static const uint8_t iv[12] = {}, aad[13] = {}, text[16384] = {}; uint8_t encrypted[sizeof(text) + 16]; for (int i = 0; i < 1000000; ++i) { - ptls_fusion_aesgcm_encrypt(&ctx, iv, aad, sizeof(aad), encrypted, text, sizeof(text)); + ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), encrypted, text, sizeof(text)); if (i == 0) dump(encrypted, sizeof(encrypted)); } } -#else - { /* benchmark (using ~16384 bytes block) */ - __m128i test[171 * 6] = {}, ghash = {}; - __m128i ctr = _mm_setzero_si128(); - for (int i = 0; i < 171 * 6; ++i) - memcpy(test + i, plaintext, 16); - for (int i = 0; i < 1000000; ++i) { - for (int j = 0; j < 171;) { - __m128i bits[6]; - ctr = _mm_add_epi64(ctr, ONE); - bits[0] = _mm_shuffle_epi8(ctr, BSWAP64); - ctr = _mm_add_epi64(ctr, ONE); - bits[1] = _mm_shuffle_epi8(ctr, BSWAP64); - ctr = _mm_add_epi64(ctr, ONE); - bits[2] = _mm_shuffle_epi8(ctr, BSWAP64); - ctr = _mm_add_epi64(ctr, ONE); - bits[3] = _mm_shuffle_epi8(ctr, BSWAP64); - ctr = _mm_add_epi64(ctr, ONE); - bits[4] = _mm_shuffle_epi8(ctr, BSWAP64); - ctr = _mm_add_epi64(ctr, ONE); - bits[5] = _mm_shuffle_epi8(ctr, BSWAP64); - ghash = aesecb6ghash6(&ctx, bits, j == 0 ? 
test + 171 * 6 - 6 : test + j - 6, ghash); - // aesecb4(&ctx, bits); - _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[0])); - ++j; - _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[1])); - ++j; - _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[2])); - ++j; - _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[3])); - ++j; - _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[3])); - ++j; - _mm_storeu_si128(test + j, _mm_xor_si128(_mm_loadu_si128(test + j), bits[3])); - ++j; - } - } - dump(&ghash, sizeof(ghash)); - } -#endif return 0; } From f5f0f64d8b72e4ab1f95a20aec2f73821ded93f1 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 00:48:46 +0900 Subject: [PATCH 14/60] add benchmark --- t/fusionbench.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 t/fusionbench.c diff --git a/t/fusionbench.c b/t/fusionbench.c new file mode 100644 index 000000000..8de40289d --- /dev/null +++ b/t/fusionbench.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include "picotls/fusion.h" + +int main(int argc, char **argv) +{ + static const uint8_t key[16] = {}, iv[12] = {}, aad[13] = {}; + size_t textlen = 16384; + + if (sscanf(argv[1], "%zu", &textlen) != 1) { + fprintf(stderr, "failed to obtain text length from argument\n"); + return 1; + } + + uint8_t *text = malloc(textlen + 16); + memset(text, 0, textlen + 16); + + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(key, sizeof(aad) + textlen); + +#if 0 + for (int i = 0; i < 10000; ++i) { + ptls_fusion_aesgcm_encrypt_vec_t vec[100]; + for (int j = 0; j < 100; ++j) { + vec[j].iv = iv; + vec[j].aad = aad; + vec[j].aadlen = sizeof(aad); + vec[j].dst = text; + vec[j].src = text; + vec[j].srclen = textlen; + } + ptls_fusion_aesgcm_encrypt(ctx, vec, 100); + } +#elif 0 + for (int i = 0; i < 1000000; ++i) { + 
ptls_fusion_aesgcm_encrypt_vec_t vec = { + .iv = iv, + .aad = aad, + .aadlen = sizeof(aad), + .dst = text, + .src = text, + .srclen = textlen, + }; + ptls_fusion_aesgcm_encrypt(ctx, &vec, 1); + } +#else + for (int i = 0; i < 1000000; ++i) + ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen); +#endif + + for (int i = 0; i < 16; ++i) + printf("%02x", text[textlen + i]); + printf("\n"); + + return 0; +} From 8363d78a99a180cbe5cb05637998cfed22309a0f Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 05:53:53 +0900 Subject: [PATCH 15/60] comments --- lib/fusion.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 09a4e253c..936d726a8 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -326,7 +326,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i gdata = gdatabuf; } - /* run AES and multiplification in parallel */ + /* run AES and multiplication in parallel */ size_t index = 0; for (; index < gdata_cnt; ++index) { @@ -359,11 +359,13 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i if ((state & STATE_EK0_INCOMPLETE) != 0) { ek0 = bits5; } else { - assert(!"FIXME either finish the AES loop to get the "); + /* Even when a zero-byte AAD is being used, AES will be running one 96-byte block ahead of GHASH. That means that the + * AES-side would have the room to encrypt ek0 as late as when the GHASH-side is hashing AC. 
*/ + assert(!"logic flaw"); } } - /* finish multiplification */ + /* finish multiplication */ mid = _mm_xor_si128(mid, hi); mid = _mm_xor_si128(mid, lo); lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); From e46529ca06296aaea604980a28231267cca8f7ec Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 13:38:39 +0900 Subject: [PATCH 16/60] add aesecb api --- include/picotls/fusion.h | 7 +++++ lib/fusion.c | 67 +++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 15f53404e..34f081a06 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -32,8 +32,15 @@ extern "C" { #define PTLS_FUSION_AESGCM_ROUNDS 10 /* TODO support AES256 */ +typedef struct ptls_fusion_aesecb_context { + __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; +} ptls_fusion_aesecb_context_t; + typedef struct ptls_fusion_aesgcm_context ptls_fusion_aesgcm_context_t; +void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key); +void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx); + ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size); void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, diff --git a/lib/fusion.c b/lib/fusion.c index 936d726a8..ec0bc06b8 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -46,7 +46,7 @@ #include "picotls/fusion.h" struct ptls_fusion_aesgcm_context { - __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; + ptls_fusion_aesecb_context_t ecb; size_t ghash_cnt; struct ptls_fusion_aesgcm_ghash_precompute { __m128i H; @@ -168,7 +168,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i bits5 = ek0; \ state |= STATE_EK0_BEEN_FED; \ } \ - __m128i k = ctx->keys[0]; \ + __m128i k = ctx->ecb.keys[0]; \ bits0 = 
_mm_xor_si128(bits0, k); \ bits1 = _mm_xor_si128(bits1, k); \ bits2 = _mm_xor_si128(bits2, k); \ @@ -180,7 +180,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* aes block update */ #define AESECB6_UPDATE(i) \ do { \ - __m128i k = ctx->keys[i]; \ + __m128i k = ctx->ecb.keys[i]; \ bits0 = _mm_aesenc_si128(bits0, k); \ bits1 = _mm_aesenc_si128(bits1, k); \ bits2 = _mm_aesenc_si128(bits2, k); \ @@ -192,7 +192,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* aesenclast */ #define AESECB6_FINAL() \ do { \ - __m128i k = ctx->keys[10]; \ + __m128i k = ctx->ecb.keys[10]; \ bits0 = _mm_aesenclast_si128(bits0, k); \ bits1 = _mm_aesenclast_si128(bits1, k); \ bits2 = _mm_aesenclast_si128(bits2, k); \ @@ -393,40 +393,49 @@ static __m128i expand_key(__m128i key, __m128i t) return _mm_xor_si128(key, t); } -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *userkey, size_t max_size) +void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key) { - ptls_fusion_aesgcm_context_t *ctx; - size_t ghash_cnt = (max_size + 15) / 16 + 1; // round-up by block size, plus context to hash AC - - if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) - return NULL; + size_t i = 0; - { - size_t i = 0; - ctx->keys[i++] = _mm_loadu_si128((__m128i *)userkey); + ctx->keys[i++] = _mm_loadu_si128((__m128i *)key); #define EXPAND(R) \ do { \ ctx->keys[i] = expand_key(ctx->keys[i - 1], _mm_aeskeygenassist_si128(ctx->keys[i - 1], R)); \ ++i; \ } while (0) - EXPAND(0x1); - EXPAND(0x2); - EXPAND(0x4); - EXPAND(0x8); - EXPAND(0x10); - EXPAND(0x20); - EXPAND(0x40); - EXPAND(0x80); - EXPAND(0x1b); - EXPAND(0x36); + EXPAND(0x1); + EXPAND(0x2); + EXPAND(0x4); + EXPAND(0x8); + EXPAND(0x10); + EXPAND(0x20); + EXPAND(0x40); + EXPAND(0x80); + EXPAND(0x1b); + EXPAND(0x36); #undef EXPAND - } +} + +void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) +{ + 
ptls_clear_memory(ctx, sizeof(*ctx)); +} + +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size) +{ + ptls_fusion_aesgcm_context_t *ctx; + size_t ghash_cnt = (max_size + 15) / 16 + 1; // round-up by block size, plus context to hash AC + + if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) + return NULL; + + ptls_fusion_aesecb_init(&ctx->ecb, key); ctx->ghash_cnt = ghash_cnt; - ctx->ghash[0].H = ctx->keys[0]; + ctx->ghash[0].H = ctx->ecb.keys[0]; for (size_t i = 1; i < PTLS_FUSION_AESGCM_ROUNDS; ++i) - ctx->ghash[0].H = _mm_aesenc_si128(ctx->ghash[0].H, ctx->keys[i]); - ctx->ghash[0].H = _mm_aesenclast_si128(ctx->ghash[0].H, ctx->keys[PTLS_FUSION_AESGCM_ROUNDS]); + ctx->ghash[0].H = _mm_aesenc_si128(ctx->ghash[0].H, ctx->ecb.keys[i]); + ctx->ghash[0].H = _mm_aesenclast_si128(ctx->ghash[0].H, ctx->ecb.keys[PTLS_FUSION_AESGCM_ROUNDS]); ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8); ctx->ghash[0].H = transformH(ctx->ghash[0].H); @@ -443,7 +452,9 @@ ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *userkey, siz void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx) { - ptls_clear_memory(ctx, sizeof(*ctx) + sizeof(ctx->ghash[0]) * ctx->ghash_cnt); + ptls_clear_memory(ctx->ghash, sizeof(ctx->ghash[0]) * ctx->ghash_cnt); + ctx->ghash_cnt = 0; + ptls_fusion_aesecb_dispose(&ctx->ecb); free(ctx); } From 8289564d4b683bc11a67bc3a6bf869289c50853a Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 15:25:02 +0900 Subject: [PATCH 17/60] tests! 
--- CMakeLists.txt | 19 +++++++- picotls.xcodeproj/project.pbxproj | 2 + t/fusion.c | 76 +++++++++++++++++++++---------- 3 files changed, 72 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c25dba04..275a6d7f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ ENDIF () ADD_LIBRARY(picotls-core ${CORE_FILES}) TARGET_LINK_LIBRARIES(picotls-core ${CORE_EXTRA_LIBS}) + ADD_LIBRARY(picotls-minicrypto ${MINICRYPTO_LIBRARY_FILES} lib/cifra.c @@ -90,7 +91,6 @@ ADD_EXECUTABLE(test-minicrypto.t lib/cifra/aes128.c lib/cifra/aes256.c lib/cifra/random.c) - SET(TEST_EXES test-minicrypto.t) FIND_PACKAGE(OpenSSL) @@ -131,6 +131,23 @@ ELSE () MESSAGE(WARNING "Disabling OpenSSL support (requires 1.0.1 or newer)") ENDIF () +IF ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND + (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") OR + (CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") OR + (CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")) + MESSAGE(STATUS " Enabling fusion support") + ADD_LIBRARY(picotls-fusion lib/fusion.c) + SET_TARGET_PROPERTIES(picotls-fusion PROPERTIES COMPILE_FLAGS "-maes -mpclmul") + TARGET_LINK_LIBRARIES(picotls-fusion picotls-core) + ADD_EXECUTABLE(test-fusion.t + deps/picotest/picotest.c + lib/picotls.c + lib/fusion.c + t/fusion.c) + SET_TARGET_PROPERTIES(test-fusion.t PROPERTIES COMPILE_FLAGS "-maes -mpclmul") + SET(TEST_EXES ${TEST_EXES} test-fusion.t) +ENDIF () + ADD_CUSTOM_TARGET(check env BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} prove --exec '' -v ${CMAKE_CURRENT_BINARY_DIR}/*.t t/*.t WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TEST_EXES} cli) IF (CMAKE_SYSTEM_NAME STREQUAL "Linux") diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index 62b303107..19f26df3f 100644 --- a/picotls.xcodeproj/project.pbxproj +++ b/picotls.xcodeproj/project.pbxproj @@ -139,6 +139,7 @@ E9B43DE324619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; 
E9B43DE424619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE524619E1600824E51 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; + E9B43DE724652D2000824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DBF24619D1700824E51 /* fusion.c */; }; E9BC76CF1EF3A35E00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D21EF3A36A00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D41EF3A37200EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; @@ -979,6 +980,7 @@ E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */, E9B43DC324619D5100824E51 /* aes.c in Sources */, E9B43DC424619D5100824E51 /* pembase64.c in Sources */, + E9B43DE724652D2000824E51 /* fusion.c in Sources */, E9B43DE424619D7E00824E51 /* fusion.c in Sources */, E9B43DC524619D5100824E51 /* ffx.c in Sources */, E9B43DC624619D5100824E51 /* picotls.c in Sources */, diff --git a/t/fusion.c b/t/fusion.c index e0a47167f..9827cf3fb 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -22,42 +22,70 @@ #include #include #include +#include "picotls/fusion.h" #include "../deps/picotest/picotest.h" -#include "../lib/fusion.c" -static void dump(const void *_p, size_t len) +static const char *tostr(const void *_p, size_t len) { - const uint8_t *p = _p; - for (size_t i = 0; i != len; ++i) { - if (i % 16 == 0 && i != 0) - printf("-"); - printf("%02x", p[i]); + static char *buf; + + if (buf != NULL) + free(buf); + buf = malloc(len * 2 + 1); + + const uint8_t *s = _p; + char *d = buf; + + for (; len != 0; --len) { + *d++ = "0123456789abcdef"[*s >> 4]; + *d++ = "0123456789abcdef"[*s & 0xf]; + ++s; } - printf("\n"); + *d = '\0'; + + return buf; } int main(int argc, char **argv) { - static 
const uint8_t userkey[16] = {}; - static const uint8_t plaintext[16] = {}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(userkey, 16384); + static const uint8_t zero[16384] = {}; + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 16384); { - static const uint8_t iv[12] = {}; - uint8_t encrypted[sizeof(plaintext) + 16]; - ptls_fusion_aesgcm_encrypt(ctx, iv, "hello", 5, encrypted, plaintext, sizeof(plaintext)); - dump(encrypted, sizeof(encrypted)); + uint8_t encrypted[32]; + ptls_fusion_aesgcm_encrypt(ctx, zero, "hello", 5, encrypted, zero, 16); + ok(strcmp(tostr(encrypted, sizeof(encrypted)), "0388dace60b6a392f328c2b971b2fe78973fbca65477bf4785b0d561f7e3fd6c") == 0); } - { /* benchmark */ - static const uint8_t iv[12] = {}, aad[13] = {}, text[16384] = {}; - uint8_t encrypted[sizeof(text) + 16]; - for (int i = 0; i < 1000000; ++i) { - ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), encrypted, text, sizeof(text)); - if (i == 0) - dump(encrypted, sizeof(encrypted)); - } + { + uint8_t encrypted[sizeof(zero) + 16]; +#define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ + do { \ + ptls_fusion_aesgcm_encrypt(ctx, iv, aad, aadlen, encrypted, zero, ptlen); \ + ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ + } while (0) + + DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf"); + DOIT(zero, zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8"); + DOIT(zero, zero, 13, 64, "66165d39739c50c90727e7d49127146b"); + DOIT(zero, zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd"); + DOIT(zero, zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9"); + DOIT(zero, zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd"); + DOIT(zero, zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78"); + DOIT(zero, zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d"); + DOIT(zero, zero, 13, 96, "6b9c468e24ed96010687f3880a044d42"); + DOIT(zero, zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8"); + + DOIT(zero, zero, 22, 1328, "0507baaece8d573774c94e8103821316"); + 
DOIT(zero, zero, 21, 1329, "dd70d59030eadb6313e778046540a253"); + DOIT(zero, zero, 20, 1330, "f1b456b955afde7603188af0124a32ef"); + + DOIT(zero, zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890"); + DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789"); + DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc"); + +#undef DOIT } - return 0; + return done_testing(); } From 303153da3b946bdcb1ad9556bae4ccc0324d66c9 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 16:42:16 +0900 Subject: [PATCH 18/60] abondon unnecessary AES calculation --- lib/fusion.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index ec0bc06b8..8a6bcc39c 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -233,7 +233,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i AESECB6_FINAL(); /* the main loop */ - do { + while (1) { /* apply the bit stream to src and write to dest */ if (PTLS_LIKELY(srclen >= 6 * 16)) { #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i)) @@ -345,6 +345,10 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i mid = _mm_xor_si128(mid, t); } + /* Bail out if AC has been fed to GHASH. All required AES calculations have been complete by now, so ditch them. 
*/ + if (PTLS_UNLIKELY(state < 0)) + break; + AESECB6_UPDATE(index + 2); for (; index + 3 <= 9; ++index) @@ -353,7 +357,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* finish bit stream generation */ AESECB6_FINAL(); - } while (state >= 0); + } if (!STATE_EK0_READY()) { if ((state & STATE_EK0_INCOMPLETE) != 0) { From fb5bc58e9c799a6a7ea1e84904071e1fa7d72591 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 8 May 2020 17:29:43 +0900 Subject: [PATCH 19/60] add test case --- t/fusion.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/t/fusion.c b/t/fusion.c index 9827cf3fb..3df1af69f 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -84,6 +84,8 @@ int main(int argc, char **argv) DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789"); DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc"); + DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc"); + #undef DOIT } From bb320d8eae8ffbeffcd3adcab672bf6c159d1184 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sat, 9 May 2020 03:45:25 +0900 Subject: [PATCH 20/60] fix off-by-one block --- lib/fusion.c | 3 ++- t/fusion.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 8a6bcc39c..776e543e7 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -428,7 +428,8 @@ void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size) { ptls_fusion_aesgcm_context_t *ctx; - size_t ghash_cnt = (max_size + 15) / 16 + 1; // round-up by block size, plus context to hash AC + size_t ghash_cnt = (max_size + 15) / 16 + 2; // round-up by block size, add to handle worst split of the size between AAD and + // payload, plus context to hash AC if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) return NULL; diff --git a/t/fusion.c b/t/fusion.c index 3df1af69f..743806c16 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ 
-49,16 +49,28 @@ static const char *tostr(const void *_p, size_t len) int main(int argc, char **argv) { static const uint8_t zero[16384] = {}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 16384); { + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); uint8_t encrypted[32]; ptls_fusion_aesgcm_encrypt(ctx, zero, "hello", 5, encrypted, zero, 16); ok(strcmp(tostr(encrypted, sizeof(encrypted)), "0388dace60b6a392f328c2b971b2fe78973fbca65477bf4785b0d561f7e3fd6c") == 0); + ptls_fusion_aesgcm_destroy(ctx); } + { /* test capacity */ + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); + uint8_t encrypted[17]; + ptls_fusion_aesgcm_encrypt(ctx, zero, "a", 1, encrypted, "X", 1); + ok(strcmp(tostr(encrypted + 1, 16), "27215ed81a702e3941c80577d52fcb57") == 0); + ptls_fusion_aesgcm_destroy(ctx); + } + + { + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, sizeof(zero)); uint8_t encrypted[sizeof(zero) + 16]; + #define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ do { \ ptls_fusion_aesgcm_encrypt(ctx, iv, aad, aadlen, encrypted, zero, ptlen); \ @@ -87,6 +99,8 @@ int main(int argc, char **argv) DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc"); #undef DOIT + + ptls_fusion_aesgcm_destroy(ctx); } return done_testing(); From a1a81e620800c5f2a989c73e54d5fe0306409ae6 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sat, 9 May 2020 03:46:46 +0900 Subject: [PATCH 21/60] wip --- include/picotls/fusion.h | 23 +++++++++++-- lib/fusion.c | 56 ++++++++++++++++++++----------- t/fusion.c | 71 ++++++++++++++++++++++++---------------- t/fusionbench.c | 38 +++++---------------- 4 files changed, 109 insertions(+), 79 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 34f081a06..415993434 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -41,10 +41,29 @@ typedef struct ptls_fusion_aesgcm_context ptls_fusion_aesgcm_context_t; void 
ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key); void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx); +/** + * Creates an AES-GCM context. + * @param key the AES key (128 bits) + * @param max_size maximum size of the record (i.e. AAD + encrypted payload) + */ ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size); +/** + * Destroys an AES-GCM context. + */ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); -void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, - const void *_src, size_t srclen); +/** + * Encrypts an AEAD block, and in parallel, optionally encrypts one block using AES-ECB. + * @param iv initialization vector of 12 bytes + * @param aad AAD + * @param aadlen size of AAD + * @param dst output buffer + * @param src payload to be encrypted + * @param srclen size of the payload to be encrypted + * @param suppkey (optional) points to an AES-ECB context used for generating suppvec + * @param suppvec (optional) vector to be encrypted using suppkey + */ +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *dst, + const void *src, size_t srclen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec); extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; diff --git a/lib/fusion.c b/lib/fusion.c index 776e543e7..4b4c6a2b4 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -146,7 +146,7 @@ static inline void storen(void *_p, size_t l, __m128i v) } void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, - const void *_src, size_t srclen) + const void *_src, size_t srclen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec) { /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ #define AESECB6_INIT() \ @@ -164,16 
+164,24 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i if (PTLS_LIKELY(srclen > 16 * 5)) { \ ctr = _mm_add_epi64(ctr, one64); \ bits5 = _mm_shuffle_epi8(ctr, bswap64); \ - } else if ((state & STATE_EK0_BEEN_FED) == 0) { \ - bits5 = ek0; \ - state |= STATE_EK0_BEEN_FED; \ + } else { \ + if ((state & STATE_EK0_BEEN_FED) == 0) { \ + bits5 = ek0; \ + state |= STATE_EK0_BEEN_FED; \ + } \ + if (suppkey != NULL && srclen <= 16 * 4) { \ + bits4 = _mm_loadu_si128(suppvec); \ + bits4keys = suppkey->keys; \ + suppkey = NULL; \ + state |= STATE_SUPP_IN_PROCESS; \ + } \ } \ __m128i k = ctx->ecb.keys[0]; \ bits0 = _mm_xor_si128(bits0, k); \ bits1 = _mm_xor_si128(bits1, k); \ bits2 = _mm_xor_si128(bits2, k); \ bits3 = _mm_xor_si128(bits3, k); \ - bits4 = _mm_xor_si128(bits4, k); \ + bits4 = _mm_xor_si128(bits4, bits4keys[0]); \ bits5 = _mm_xor_si128(bits5, k); \ } while (0) @@ -185,7 +193,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i bits1 = _mm_aesenc_si128(bits1, k); \ bits2 = _mm_aesenc_si128(bits2, k); \ bits3 = _mm_aesenc_si128(bits3, k); \ - bits4 = _mm_aesenc_si128(bits4, k); \ + bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \ bits5 = _mm_aesenc_si128(bits5, k); \ } while (0) @@ -197,11 +205,12 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i bits1 = _mm_aesenclast_si128(bits1, k); \ bits2 = _mm_aesenclast_si128(bits2, k); \ bits3 = _mm_aesenclast_si128(bits3, k); \ - bits4 = _mm_aesenclast_si128(bits4, k); \ + bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); \ bits5 = _mm_aesenclast_si128(bits5, k); \ } while (0) __m128i ctr, ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); + const __m128i *bits4keys = ctx->ecb.keys; /* is changed to suppkey->keys when calcurating suppout */ __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(), gdatabuf[6]; __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)srclen * 8, 0, 
(int)aadlen * 8), bswap64); @@ -219,6 +228,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i #define STATE_EK0_BEEN_FED 0x3 #define STATE_EK0_INCOMPLETE 0x2 #define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1) +#define STATE_SUPP_IN_PROCESS 0x4 /* build counter */ ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); @@ -252,6 +262,10 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i ek0 = bits5; state &= ~STATE_EK0_INCOMPLETE; } + if ((state & STATE_SUPP_IN_PROCESS) != 0) { + _mm_storeu_si128(suppvec, bits4); + state &= ~STATE_SUPP_IN_PROCESS; + } if (srclen != 0) { #define APPLY(i) \ do { \ @@ -345,9 +359,22 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i mid = _mm_xor_si128(mid, t); } - /* Bail out if AC has been fed to GHASH. All required AES calculations have been complete by now, so ditch them. */ - if (PTLS_UNLIKELY(state < 0)) + /* bail out if AC has been fed to GHASH */ + if (PTLS_UNLIKELY(state < 0)) { + /* All AES operations for payload encryption and ek0 are complete by now. This is because it is necessary for GCM to + * process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block + * ahead. */ + assert(STATE_EK0_READY()); + /* But calculation of suppvec might be in progress. If so, finish that. 
*/ + assert(suppkey == NULL); + if ((state & STATE_SUPP_IN_PROCESS) != 0) { + for (; index + 2 <= 9; ++index) + bits4 = _mm_aesenc_si128(bits4, bits4keys[index + 2]); + bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); + _mm_storeu_si128(suppvec, bits4); + } break; + } AESECB6_UPDATE(index + 2); @@ -356,17 +383,6 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* finish bit stream generation */ AESECB6_FINAL(); - - } - - if (!STATE_EK0_READY()) { - if ((state & STATE_EK0_INCOMPLETE) != 0) { - ek0 = bits5; - } else { - /* Even when a zero-byte AAD is being used, AES will be running one 96-byte block ahead of GHASH. That means that the - * AES-side would have the room to encrypt ek0 as late as when the GHASH-side is hashing AC. */ - assert(!"logic flaw"); - } } /* finish multiplication */ diff --git a/t/fusion.c b/t/fusion.c index 743806c16..b6d0a8f11 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -48,12 +48,12 @@ static const char *tostr(const void *_p, size_t len) int main(int argc, char **argv) { - static const uint8_t zero[16384] = {}; + static const uint8_t zero[16384] = {}, one[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; { ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); uint8_t encrypted[32]; - ptls_fusion_aesgcm_encrypt(ctx, zero, "hello", 5, encrypted, zero, 16); + ptls_fusion_aesgcm_encrypt(ctx, zero, "hello", 5, encrypted, zero, 16, NULL, NULL); ok(strcmp(tostr(encrypted, sizeof(encrypted)), "0388dace60b6a392f328c2b971b2fe78973fbca65477bf4785b0d561f7e3fd6c") == 0); ptls_fusion_aesgcm_destroy(ctx); } @@ -61,46 +61,61 @@ int main(int argc, char **argv) { /* test capacity */ ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); uint8_t encrypted[17]; - ptls_fusion_aesgcm_encrypt(ctx, zero, "a", 1, encrypted, "X", 1); + ptls_fusion_aesgcm_encrypt(ctx, zero, "a", 1, encrypted, "X", 1, NULL, NULL); ok(strcmp(tostr(encrypted + 1, 16), "27215ed81a702e3941c80577d52fcb57") 
== 0); ptls_fusion_aesgcm_destroy(ctx); } { - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, sizeof(zero)); - uint8_t encrypted[sizeof(zero) + 16]; - -#define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ - do { \ - ptls_fusion_aesgcm_encrypt(ctx, iv, aad, aadlen, encrypted, zero, ptlen); \ - ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ + ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(zero, sizeof(zero)); + ptls_fusion_aesecb_context_t *ecb = NULL; + + for (int i = 0; i < 2; ++i) { + uint8_t encrypted[sizeof(zero) + 16], ecbvec[16]; +#define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ + do { \ + memset(ecbvec, 0, sizeof(ecbvec)); \ + ptls_fusion_aesgcm_encrypt(aead, iv, aad, aadlen, encrypted, zero, ptlen, ecb, &ecbvec); \ + ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ + if (i == 0) { \ + ok(memcmp(ecbvec, zero, sizeof(ecbvec)) == 0); \ + } else { \ + ok(strcmp(tostr(ecbvec, sizeof(ecbvec)), "b6aeaffa752dc08b51639731761aed00") == 0); \ + } \ } while (0) - DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf"); - DOIT(zero, zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8"); - DOIT(zero, zero, 13, 64, "66165d39739c50c90727e7d49127146b"); - DOIT(zero, zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd"); - DOIT(zero, zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9"); - DOIT(zero, zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd"); - DOIT(zero, zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78"); - DOIT(zero, zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d"); - DOIT(zero, zero, 13, 96, "6b9c468e24ed96010687f3880a044d42"); - DOIT(zero, zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8"); + DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf"); + DOIT(zero, zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8"); + DOIT(zero, zero, 13, 64, "66165d39739c50c90727e7d49127146b"); + DOIT(zero, zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd"); + DOIT(zero, zero, 13, 79, 
"8f4a96c7390c26bb15b68865e6a861b9"); + DOIT(zero, zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd"); + DOIT(zero, zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78"); + DOIT(zero, zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d"); + DOIT(zero, zero, 13, 96, "6b9c468e24ed96010687f3880a044d42"); + DOIT(zero, zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8"); - DOIT(zero, zero, 22, 1328, "0507baaece8d573774c94e8103821316"); - DOIT(zero, zero, 21, 1329, "dd70d59030eadb6313e778046540a253"); - DOIT(zero, zero, 20, 1330, "f1b456b955afde7603188af0124a32ef"); + DOIT(zero, zero, 22, 1328, "0507baaece8d573774c94e8103821316"); + DOIT(zero, zero, 21, 1329, "dd70d59030eadb6313e778046540a253"); + DOIT(zero, zero, 20, 1330, "f1b456b955afde7603188af0124a32ef"); - DOIT(zero, zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890"); - DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789"); - DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc"); + DOIT(zero, zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890"); + DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789"); + DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc"); - DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc"); + DOIT(zero, zero, 0, 80, "98885a3a22bd4742fe7b72172193b163"); + DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc"); #undef DOIT - ptls_fusion_aesgcm_destroy(ctx); + ecb = malloc(sizeof(*ecb)); + ptls_fusion_aesecb_init(ecb, one); + } + + ptls_fusion_aesecb_dispose(ecb); + free(ecb); + ptls_fusion_aesgcm_destroy(aead); } return done_testing(); diff --git a/t/fusionbench.c b/t/fusionbench.c index 8de40289d..1ee693b83 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -8,46 +8,26 @@ int main(int argc, char **argv) { static const uint8_t key[16] = {}, iv[12] = {}, aad[13] = {}; size_t textlen = 16384; + ptls_fusion_aesecb_context_t *suppkey; + uint8_t suppvec[16] = {}; - if (sscanf(argv[1], "%zu", &textlen) != 1) { + if (argc >= 2 && sscanf(argv[1], "%zu", &textlen) 
!= 1) { fprintf(stderr, "failed to obtain text length from argument\n"); return 1; } + if (argc >= 3 && strcmp(argv[2], "1") == 0) { + static const uint8_t k[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + suppkey = malloc(sizeof(*suppkey)); + ptls_fusion_aesecb_init(suppkey, k); + } uint8_t *text = malloc(textlen + 16); memset(text, 0, textlen + 16); ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(key, sizeof(aad) + textlen); -#if 0 - for (int i = 0; i < 10000; ++i) { - ptls_fusion_aesgcm_encrypt_vec_t vec[100]; - for (int j = 0; j < 100; ++j) { - vec[j].iv = iv; - vec[j].aad = aad; - vec[j].aadlen = sizeof(aad); - vec[j].dst = text; - vec[j].src = text; - vec[j].srclen = textlen; - } - ptls_fusion_aesgcm_encrypt(ctx, vec, 100); - } -#elif 0 - for (int i = 0; i < 1000000; ++i) { - ptls_fusion_aesgcm_encrypt_vec_t vec = { - .iv = iv, - .aad = aad, - .aadlen = sizeof(aad), - .dst = text, - .src = text, - .srclen = textlen, - }; - ptls_fusion_aesgcm_encrypt(ctx, &vec, 1); - } -#else for (int i = 0; i < 1000000; ++i) - ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen); -#endif + ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, suppkey, suppvec); for (int i = 0; i < 16; ++i) printf("%02x", text[textlen + i]); From 91c3b1814c03286edbf0ca6203b61a6973c32d77 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sun, 10 May 2020 05:25:48 +0900 Subject: [PATCH 22/60] bail out as soon as learning that only GHASH calculation is necessary --- lib/fusion.c | 73 ++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 4b4c6a2b4..5364781a4 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -214,6 +214,9 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(), gdatabuf[6]; __m128i ac = 
_mm_shuffle_epi8(_mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8), bswap64); + const __m128i *gdata; // points to the elements fed into GHASH + size_t gdata_cnt; + // src and dst are updated after the chunk is processed const __m128i *src = _src; __m128i *dst = _dst; @@ -224,7 +227,6 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1; int32_t state = 0; -#define STATE_FINAL 0x80000000 /* negates the state when set */ #define STATE_EK0_BEEN_FED 0x3 #define STATE_EK0_INCOMPLETE 0x2 #define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1) @@ -298,8 +300,6 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i AESECB6_UPDATE(1); /* setup gdata */ - const __m128i *gdata; - size_t gdata_cnt; if (PTLS_UNLIKELY(aadlen != 0)) { gdata_cnt = 0; while (gdata_cnt < 6) { @@ -328,10 +328,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i gdatabuf[gdata_cnt++] = loadn(dst_ghash, dst_ghashlen); dst_ghashlen = 0; } - if (gdata_cnt < 6) { - gdatabuf[gdata_cnt++] = ac; - state |= STATE_FINAL; - } + if (gdata_cnt < 6) + goto Finish; break; } gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++); @@ -341,13 +339,11 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i } /* run AES and multiplication in parallel */ - size_t index = 0; - for (; index < gdata_cnt; ++index) { - - AESECB6_UPDATE(index + 2); + for (size_t i = 2; i <= 7; ++i) { + AESECB6_UPDATE(i); --ghash_precompute; - __m128i X = _mm_loadu_si128(gdata + index); + __m128i X = _mm_loadu_si128(gdata++); X = _mm_shuffle_epi8(X, bswap8); __m128i t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x00); lo = _mm_xor_si128(lo, t); @@ -359,32 +355,43 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i mid = _mm_xor_si128(mid, t); } - /* bail out if 
AC has been fed to GHASH */ - if (PTLS_UNLIKELY(state < 0)) { - /* All AES operations for payload encryption and ek0 are complete by now. This is because it is necessary for GCM to - * process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block - * ahead. */ - assert(STATE_EK0_READY()); - /* But calculation of suppvec might be in progress. If so, finish that. */ - assert(suppkey == NULL); - if ((state & STATE_SUPP_IN_PROCESS) != 0) { - for (; index + 2 <= 9; ++index) - bits4 = _mm_aesenc_si128(bits4, bits4keys[index + 2]); - bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); - _mm_storeu_si128(suppvec, bits4); - } - break; - } - - AESECB6_UPDATE(index + 2); - - for (; index + 3 <= 9; ++index) - AESECB6_UPDATE(index + 3); + AESECB6_UPDATE(8); + AESECB6_UPDATE(9); /* finish bit stream generation */ AESECB6_FINAL(); } +Finish: + gdatabuf[gdata_cnt++] = ac; + + /* We have complete set of data to be fed into GHASH. Let's finish the remaining calculation (GHASH and possibly suppvec), and + * exit the loop. + * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM + * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead. 
+ */ + assert(STATE_EK0_READY()); + assert(suppkey == NULL); + if ((state & STATE_SUPP_IN_PROCESS) != 0) { + for (size_t i = 2; i <= 9; ++i) + bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); + bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); + _mm_storeu_si128(suppvec, bits4); + } + for (size_t i = 0; i < gdata_cnt; ++i) { + --ghash_precompute; + __m128i X = _mm_loadu_si128(gdatabuf + i); + X = _mm_shuffle_epi8(X, bswap8); + __m128i t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x00); + lo = _mm_xor_si128(lo, t); + t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x11); + hi = _mm_xor_si128(hi, t); + t = _mm_shuffle_epi32(X, 78); + t = _mm_xor_si128(t, X); + t = _mm_clmulepi64_si128(ghash_precompute->r, t, 0x00); + mid = _mm_xor_si128(mid, t); + } + /* finish multiplication */ mid = _mm_xor_si128(mid, hi); mid = _mm_xor_si128(mid, lo); From bdabc76de9cea663f861e32cec8b314eafcfe101 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sun, 10 May 2020 05:25:59 +0900 Subject: [PATCH 23/60] parameterize the benchmark --- t/fusionbench.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/t/fusionbench.c b/t/fusionbench.c index 1ee693b83..b79096a73 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -2,31 +2,54 @@ #include #include #include +#include #include "picotls/fusion.h" int main(int argc, char **argv) { static const uint8_t key[16] = {}, iv[12] = {}, aad[13] = {}; size_t textlen = 16384; - ptls_fusion_aesecb_context_t *suppkey; + ptls_fusion_aesecb_context_t *suppkey = NULL; uint8_t suppvec[16] = {}; + int ch, count = 1000000; - if (argc >= 2 && sscanf(argv[1], "%zu", &textlen) != 1) { - fprintf(stderr, "failed to obtain text length from argument\n"); - return 1; - } - if (argc >= 3 && strcmp(argv[2], "1") == 0) { - static const uint8_t k[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - suppkey = malloc(sizeof(*suppkey)); - ptls_fusion_aesecb_init(suppkey, k); + while ((ch = getopt(argc, 
argv, "b:n:sh")) != -1) { + switch (ch) { + case 'b': + if (sscanf(optarg, "%zu", &textlen) != 1) { + fprintf(stderr, "failed to parse the number of bytes given by `-b`\n"); + exit(1); + } + break; + case 'n': + if (sscanf(optarg, "%d", &count) != 1) { + fprintf(stderr, "failed to parse the number given by `-n`\n"); + exit(1); + } + break; + case 's': { + static const uint8_t k[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + suppkey = malloc(sizeof(*suppkey)); + ptls_fusion_aesecb_init(suppkey, k); + } break; + default: + printf("Usage: %s -b -s\n" + "Options:\n" + " -b specifies the size of the AEAD payload\n" + " -n number of iterations\n" + " -s if set, runs the benchmark with supplemental vector\n", argv[0]); + return 0; + } } + argc -= optind; + argv += optind; uint8_t *text = malloc(textlen + 16); memset(text, 0, textlen + 16); ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(key, sizeof(aad) + textlen); - for (int i = 0; i < 1000000; ++i) + for (int i = 0; i < count; ++i) ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, suppkey, suppvec); for (int i = 0; i < 16; ++i) From 8b4dfee052eeabeead149d6fcd1b5db71974626b Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sun, 10 May 2020 12:09:03 +0900 Subject: [PATCH 24/60] decryption --- include/picotls/fusion.h | 15 ++ lib/fusion.c | 292 +++++++++++++++++++++++++++++++++------ t/fusion.c | 28 +++- 3 files changed, 285 insertions(+), 50 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 415993434..3150f332d 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -64,6 +64,21 @@ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); */ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *dst, const void *src, size_t srclen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec); +/** + * Decrypts an AEAD block, and in parallel, optionally 
encrypts one block using AES-ECB. + * @param iv initialization vector of 12 bytes + * @param aad AAD + * @param aadlen size of AAD + * @param dst output buffer + * @param src payload to be decrypted + * @param srclen size of the payload to be decrypted + * @param tag the AEAD tag being received from peer + * @param suppkey (optional) points to an AES-ECB context used for generating suppvec + * @param suppvec (optional) vector to be encrypted using suppkey + */ +int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *dst, + const void *src, size_t srclen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, + void *suppvec); extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; diff --git a/lib/fusion.c b/lib/fusion.c index 5364781a4..dfec083b6 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -125,6 +125,46 @@ static __m128i gfmul(__m128i x, __m128i y) return _mm_xor_si128(hi, lo); } +struct ptls_fusion_gfmul_state { + __m128i hi, lo, mid; +}; + +static inline void gfmul_onestep(struct ptls_fusion_gfmul_state *gstate, __m128i X, + struct ptls_fusion_aesgcm_ghash_precompute *precompute) +{ + X = _mm_shuffle_epi8(X, bswap8); + __m128i t = _mm_clmulepi64_si128(precompute->H, X, 0x00); + gstate->lo = _mm_xor_si128(gstate->lo, t); + t = _mm_clmulepi64_si128(precompute->H, X, 0x11); + gstate->hi = _mm_xor_si128(gstate->hi, t); + t = _mm_shuffle_epi32(X, 78); + t = _mm_xor_si128(t, X); + t = _mm_clmulepi64_si128(precompute->r, t, 0x00); + gstate->mid = _mm_xor_si128(gstate->mid, t); +} + +static inline __m128i gfmul_final(struct ptls_fusion_gfmul_state *gstate, __m128i ek0) +{ + /* finish multiplication */ + gstate->mid = _mm_xor_si128(gstate->mid, gstate->hi); + gstate->mid = _mm_xor_si128(gstate->mid, gstate->lo); + gstate->lo = _mm_xor_si128(gstate->lo, _mm_slli_si128(gstate->mid, 8)); + gstate->hi = _mm_xor_si128(gstate->hi, _mm_srli_si128(gstate->mid, 8)); + + /* fast reduction, using 
https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ + __m128i r = _mm_clmulepi64_si128(gstate->lo, poly, 0x10); + gstate->lo = _mm_shuffle_epi32(gstate->lo, 78); + gstate->lo = _mm_xor_si128(gstate->lo, r); + r = _mm_clmulepi64_si128(gstate->lo, poly, 0x10); + gstate->lo = _mm_shuffle_epi32(gstate->lo, 78); + gstate->lo = _mm_xor_si128(gstate->lo, r); + __m128i tag = _mm_xor_si128(gstate->hi, gstate->lo); + tag = _mm_shuffle_epi8(tag, bswap8); + tag = _mm_xor_si128(tag, ek0); + + return tag; +} + static inline __m128i loadn(const void *_p, size_t l) { const uint8_t *p = _p; @@ -211,7 +251,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i __m128i ctr, ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); const __m128i *bits4keys = ctx->ecb.keys; /* is changed to suppkey->keys when calcurating suppout */ - __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128(), gdatabuf[6]; + struct ptls_fusion_gfmul_state gstate = {}; + __m128i gdatabuf[6]; __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8), bswap64); const __m128i *gdata; // points to the elements fed into GHASH @@ -341,24 +382,10 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i /* run AES and multiplication in parallel */ for (size_t i = 2; i <= 7; ++i) { AESECB6_UPDATE(i); - - --ghash_precompute; - __m128i X = _mm_loadu_si128(gdata++); - X = _mm_shuffle_epi8(X, bswap8); - __m128i t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x00); - lo = _mm_xor_si128(lo, t); - t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x11); - hi = _mm_xor_si128(hi, t); - t = _mm_shuffle_epi32(X, 78); - t = _mm_xor_si128(t, X); - t = _mm_clmulepi64_si128(ghash_precompute->r, t, 0x00); - mid = _mm_xor_si128(mid, t); + gfmul_onestep(&gstate, *gdata++, --ghash_precompute); } - AESECB6_UPDATE(8); AESECB6_UPDATE(9); - - /* finish bit stream generation */ AESECB6_FINAL(); } @@ 
-378,37 +405,214 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); _mm_storeu_si128(suppvec, bits4); } - for (size_t i = 0; i < gdata_cnt; ++i) { - --ghash_precompute; - __m128i X = _mm_loadu_si128(gdatabuf + i); - X = _mm_shuffle_epi8(X, bswap8); - __m128i t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x00); - lo = _mm_xor_si128(lo, t); - t = _mm_clmulepi64_si128(ghash_precompute->H, X, 0x11); - hi = _mm_xor_si128(hi, t); - t = _mm_shuffle_epi32(X, 78); - t = _mm_xor_si128(t, X); - t = _mm_clmulepi64_si128(ghash_precompute->r, t, 0x00); - mid = _mm_xor_si128(mid, t); + for (size_t i = 0; i < gdata_cnt; ++i) + gfmul_onestep(&gstate, gdatabuf[i], --ghash_precompute); + + _mm_storeu_si128(dst, gfmul_final(&gstate, ek0)); + +#undef AESECB6_INIT +#undef AESECB6_UPDATE +#undef AESECB6_FINAL +#undef STATE_EK0_BEEN_FOUND +#undef STATE_EK0_READY +#undef STATE_SUPP_IN_PROCESS +} + +int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, + const void *_src, size_t _srclen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, + void *suppvec) +{ + __m128i ctr, ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), + bits3 = _mm_setzero_si128(), bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128(); + const __m128i *bits1keys; + struct ptls_fusion_gfmul_state gstate = {}; + __m128i gdatabuf[6]; + __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)_srclen * 8, 0, (int)aadlen * 8), bswap64); + struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (_srclen + 15) / 16 + 1; + + const __m128i *gdata; // points to the elements fed into GHASH + size_t gdata_cnt; + + const __m128i *src_ghash = _src, *src_aes = _src, *aad = _aad; + __m128i *dst = _dst; + size_t nondata_aes_cnt = 0, src_ghashlen = _srclen, src_aeslen = _srclen; + + /* build 
counter */ + ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); + ctr = _mm_shuffle_epi8(ctr, bswap8); + + /* schedule ek0 and suppkey */ + ctr = _mm_add_epi64(ctr, one64); + bits0 = _mm_shuffle_epi8(ctr, bswap64); + ++nondata_aes_cnt; + if (suppkey != NULL) { + bits1keys = suppkey->keys; + bits1 = _mm_loadu_si128(suppvec); + ++nondata_aes_cnt; + } else { + bits1keys = ctx->ecb.keys; } - /* finish multiplication */ - mid = _mm_xor_si128(mid, hi); - mid = _mm_xor_si128(mid, lo); - lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); - hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); +#define STATE_IS_FIRST_RUN 0x1 +#define STATE_GHASH_HAS_MORE 0x2 + int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE; - /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ - __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10); - lo = _mm_shuffle_epi32(lo, 78); - lo = _mm_xor_si128(lo, r); - r = _mm_clmulepi64_si128(lo, poly, 0x10); - lo = _mm_shuffle_epi32(lo, 78); - lo = _mm_xor_si128(lo, r); - __m128i tag = _mm_xor_si128(hi, lo); - tag = _mm_shuffle_epi8(tag, bswap8); - tag = _mm_xor_si128(tag, ek0); - _mm_storeu_si128(dst, tag); + /* the main loop */ + while (1) { + + /* setup gdata */ + if (PTLS_UNLIKELY(aadlen != 0)) { + gdata = gdatabuf; + gdata_cnt = 0; + while (gdata_cnt < 6) { + if (aadlen < 16) { + if (aadlen != 0) { + gdatabuf[gdata_cnt++] = loadn(aad, aadlen); + aadlen = 0; + ++nondata_aes_cnt; + } + goto GdataFillSrc; + } + gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); + aadlen -= 16; + ++nondata_aes_cnt; + } + } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) { + gdata = src_ghash; + gdata_cnt = 6; + src_ghash += 6; + src_ghashlen -= 6 * 16; + } else { + gdata = gdatabuf; + gdata_cnt = 0; + GdataFillSrc: + while (gdata_cnt < 6) { + if (src_ghashlen < 16) { + if (src_ghashlen != 0) { + gdatabuf[gdata_cnt++] = loadn(src_ghash, src_ghashlen); + src_ghashlen = 0; + } + if (gdata_cnt < 6 && (state & STATE_IS_FIRST_RUN) == 0) { + gdatabuf[gdata_cnt++] = ek0; 
+ state &= ~STATE_GHASH_HAS_MORE; + } + break; + } + gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++); + src_ghashlen -= 16; + } + } + + /* setup aes bits */ + if (PTLS_LIKELY(nondata_aes_cnt == 0)) + goto InitAllBits; + switch (nondata_aes_cnt) { +#define INIT_BITS(n) \ + case n: \ + ctr = _mm_add_epi64(ctr, one64); \ + bits##n = _mm_shuffle_epi8(ctr, bswap64); + InitAllBits: + INIT_BITS(0); + INIT_BITS(1); + INIT_BITS(2); + INIT_BITS(3); + INIT_BITS(4); + INIT_BITS(5); +#undef INIT_BITS + } + + { /* run aes and ghash */ +#define AESECB6_UPDATE(i) \ + do { \ + __m128i k = ctx->ecb.keys[i]; \ + bits0 = _mm_aesenc_si128(bits0, k); \ + bits1 = _mm_aesenc_si128(bits1, bits1keys[i]); \ + bits2 = _mm_aesenc_si128(bits2, k); \ + bits3 = _mm_aesenc_si128(bits3, k); \ + bits4 = _mm_aesenc_si128(bits4, k); \ + bits5 = _mm_aesenc_si128(bits5, k); \ + } while (0) + + size_t aesi; + for (aesi = 1; aesi <= gdata_cnt; ++aesi) { + AESECB6_UPDATE(aesi); + gfmul_onestep(&gstate, *gdata++, --ghash_precompute); + } + for (; aesi <= 9; ++aesi) + AESECB6_UPDATE(aesi); + __m128i k = ctx->ecb.keys[aesi]; + bits0 = _mm_aesenclast_si128(bits0, k); + bits1 = _mm_aesenclast_si128(bits1, bits1keys[aesi]); + bits2 = _mm_aesenclast_si128(bits2, k); + bits3 = _mm_aesenclast_si128(bits3, k); + bits4 = _mm_aesenclast_si128(bits4, k); + bits5 = _mm_aesenclast_si128(bits5, k); + +#undef AESECB6_UPDATE + } + + /* apply aes bits */ + if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) { +#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i)) + APPLY(0); + APPLY(1); + APPLY(2); + APPLY(3); + APPLY(4); + APPLY(5); +#undef APPLY + dst += 6; + src_aes += 6; + src_aeslen -= 6 * 16; + } else { + if ((state & STATE_IS_FIRST_RUN) != 0) { + ek0 = bits0; + if (suppkey != NULL) { + _mm_store_si128(suppvec, bits1); + bits1keys = ctx->ecb.keys; + } + state &= ~STATE_IS_FIRST_RUN; + } + switch (nondata_aes_cnt) { +#define APPLY(i) \ + case i: \ + if 
(PTLS_LIKELY(src_aeslen >= 16)) { \ + _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \ + src_aeslen -= 16; \ + } else { \ + if (src_aeslen != 0) { \ + storen(dst, src_aeslen, _mm_xor_si128(loadn(src_aes, src_aeslen), bits##i)); \ + src_aeslen = 0; \ + } \ + goto Finish; \ + } + APPLY(0); + APPLY(1); + APPLY(2); + APPLY(3); + APPLY(4); + APPLY(5); +#undef APPLY + } + nondata_aes_cnt = 0; + } + } + +Finish: + assert((state & STATE_IS_FIRST_RUN) == 0); + + /* the only case where AES operation is complete and GHASH is not is when the application of AC is remaining */ + if ((state & STATE_GHASH_HAS_MORE) != 0) { + assert(ghash_precompute - 1 == ctx->ghash); + gfmul_onestep(&gstate, ac, --ghash_precompute); + } + + __m128i calctag = gfmul_final(&gstate, ek0); + + return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff; + +#undef STATE_IS_FIRST_RUN +#undef STATE_GHASH_HAS_MORE } static __m128i expand_key(__m128i key, __m128i t) diff --git a/t/fusion.c b/t/fusion.c index b6d0a8f11..7bebe94de 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -51,28 +51,41 @@ int main(int argc, char **argv) static const uint8_t zero[16384] = {}, one[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; { + static const uint8_t expected[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, + 0xb9, 0x71, 0xb2, 0xfe, 0x78, 0x97, 0x3f, 0xbc, 0xa6, 0x54, 0x77, + 0xbf, 0x47, 0x85, 0xb0, 0xd5, 0x61, 0xf7, 0xe3, 0xfd, 0x6c}; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); - uint8_t encrypted[32]; + uint8_t encrypted[sizeof(expected)], decrypted[sizeof(expected) - 16]; + ptls_fusion_aesgcm_encrypt(ctx, zero, "hello", 5, encrypted, zero, 16, NULL, NULL); - ok(strcmp(tostr(encrypted, sizeof(encrypted)), "0388dace60b6a392f328c2b971b2fe78973fbca65477bf4785b0d561f7e3fd6c") == 0); + ok(memcmp(expected, encrypted, sizeof(expected)) == 0); + + memset(decrypted, 0x55, sizeof(decrypted)); + 
ok(ptls_fusion_aesgcm_decrypt(ctx, zero, "hello", 5, decrypted, expected, sizeof(expected) - 16, + expected + sizeof(expected) - 16, NULL, NULL)); + ok(memcmp(decrypted, zero, sizeof(decrypted)) == 0); + ptls_fusion_aesgcm_destroy(ctx); } { /* test capacity */ + static const uint8_t expected[17] = {0x5b, 0x27, 0x21, 0x5e, 0xd8, 0x1a, 0x70, 0x2e, 0x39, + 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); - uint8_t encrypted[17]; + uint8_t encrypted[17], decrypted[1] = {0x55}; ptls_fusion_aesgcm_encrypt(ctx, zero, "a", 1, encrypted, "X", 1, NULL, NULL); - ok(strcmp(tostr(encrypted + 1, 16), "27215ed81a702e3941c80577d52fcb57") == 0); + ok(memcmp(expected, encrypted, 17) == 0); + ok(ptls_fusion_aesgcm_decrypt(ctx, zero, "a", 1, decrypted, expected, 1, expected + 1, NULL, NULL)); + ok('X' == decrypted[0]); ptls_fusion_aesgcm_destroy(ctx); } - { ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(zero, sizeof(zero)); ptls_fusion_aesecb_context_t *ecb = NULL; for (int i = 0; i < 2; ++i) { - uint8_t encrypted[sizeof(zero) + 16], ecbvec[16]; + uint8_t encrypted[sizeof(zero) + 16], ecbvec[16], decrypted[sizeof(zero)]; #define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ do { \ memset(ecbvec, 0, sizeof(ecbvec)); \ @@ -83,6 +96,9 @@ int main(int argc, char **argv) } else { \ ok(strcmp(tostr(ecbvec, sizeof(ecbvec)), "b6aeaffa752dc08b51639731761aed00") == 0); \ } \ + memset(decrypted, 0x55, sizeof(decrypted)); \ + ok(!ptls_fusion_aesgcm_decrypt(aead, iv, aad, aadlen, decrypted, encrypted, ptlen, zero, NULL, NULL)); \ + ok(memcmp(decrypted, zero, ptlen) == 0); \ } while (0) DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf"); From a891e3179c68e6fcb498c17721e3fc810815b7e2 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 11 May 2020 06:00:07 +0900 Subject: [PATCH 25/60] add option to benchmark decryption speed --- t/fusionbench.c | 23 ++++++++++++++++++----- 1 file changed, 18 
insertions(+), 5 deletions(-) diff --git a/t/fusionbench.c b/t/fusionbench.c index b79096a73..5ebad53e4 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -11,9 +11,9 @@ int main(int argc, char **argv) size_t textlen = 16384; ptls_fusion_aesecb_context_t *suppkey = NULL; uint8_t suppvec[16] = {}; - int ch, count = 1000000; + int ch, decrypt = 0, count = 1000000; - while ((ch = getopt(argc, argv, "b:n:sh")) != -1) { + while ((ch = getopt(argc, argv, "b:dn:sh")) != -1) { switch (ch) { case 'b': if (sscanf(optarg, "%zu", &textlen) != 1) { @@ -21,6 +21,9 @@ int main(int argc, char **argv) exit(1); } break; + case 'd': + decrypt = 1; + break; case 'n': if (sscanf(optarg, "%d", &count) != 1) { fprintf(stderr, "failed to parse the number given by `-n`\n"); @@ -36,8 +39,10 @@ int main(int argc, char **argv) printf("Usage: %s -b -s\n" "Options:\n" " -b specifies the size of the AEAD payload\n" + " -d test decryption\n" " -n number of iterations\n" - " -s if set, runs the benchmark with supplemental vector\n", argv[0]); + " -s if set, runs the benchmark with supplemental vector\n", + argv[0]); return 0; } } @@ -49,9 +54,17 @@ int main(int argc, char **argv) ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(key, sizeof(aad) + textlen); - for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, suppkey, suppvec); + if (!decrypt) { + for (int i = 0; i < count; ++i) + ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, suppkey, suppvec); + } else { + for (int i = 0; i < count; ++i) + ptls_fusion_aesgcm_decrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, text + textlen, suppkey, suppvec); + } + for (int i = 0; i < 16; ++i) + printf("%02x", text[i]); + printf("\n"); for (int i = 0; i < 16; ++i) printf("%02x", text[textlen + i]); printf("\n"); From f94669fa9c4a719f19d5eb8887255faaf52df259 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 11 May 2020 06:00:18 +0900 Subject: [PATCH 26/60] 
add fusionbench to xcodeproj --- picotls.xcodeproj/project.pbxproj | 130 +++++++++++++++++++----------- 1 file changed, 84 insertions(+), 46 deletions(-) diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index 19f26df3f..03bb2238d 100644 --- a/picotls.xcodeproj/project.pbxproj +++ b/picotls.xcodeproj/project.pbxproj @@ -112,34 +112,16 @@ E99B75E41F5CE64E00CF503E /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; E99B75E51F5CE64E00CF503E /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */ = {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; - E9B43DC324619D5100824E51 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900241DC8D37500FB4085 /* aes.c */; }; - E9B43DC424619D5100824E51 /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; - E9B43DC524619D5100824E51 /* ffx.c in Sources */ = {isa = PBXBuildFile; fileRef = E97577022212405D00D1EF74 /* ffx.c */; }; - E9B43DC624619D5100824E51 /* picotls.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E91D9B7C13005B2C60 /* picotls.c */; }; - E9B43DC724619D5100824E51 /* uECC.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900BC1DC96A3500FB4085 /* uECC.c */; }; E9B43DC824619D5100824E51 /* picotest.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E31D9B4021005B2C60 /* picotest.c */; }; - E9B43DC924619D5100824E51 /* sha256.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059005F1DC8DE4400FB4085 /* sha256.c */; }; - E9B43DCA24619D5100824E51 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE422E34B340018D260 /* chacha20.c */; }; - E9B43DCB24619D5100824E51 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; - E9B43DCD24619D5100824E51 /* asn1.c in 
Sources */ = {isa = PBXBuildFile; fileRef = E99B75DE1F5CDDB500CF503E /* asn1.c */; }; - E9B43DCE24619D5100824E51 /* sha512.c in Sources */ = {isa = PBXBuildFile; fileRef = E9E865E9203BD45600E2FFCD /* sha512.c */; }; - E9B43DCF24619D5100824E51 /* aes256.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE022E34B340018D260 /* aes256.c */; }; - E9B43DD024619D5100824E51 /* random.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BF922E34C110018D260 /* random.c */; }; - E9B43DD124619D5100824E51 /* aes128.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE222E34B340018D260 /* aes128.c */; }; - E9B43DD224619D5100824E51 /* drbg.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900511DC8D79300FB4085 /* drbg.c */; }; - E9B43DD324619D5100824E51 /* x25519.c in Sources */ = {isa = PBXBuildFile; fileRef = E9F20BE122E34B340018D260 /* x25519.c */; }; - E9B43DD424619D5100824E51 /* poly1305.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76D61EF3C1C200EB7A09 /* poly1305.c */; }; - E9B43DD524619D5100824E51 /* hmac.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900651DC8DFD300FB4085 /* hmac.c */; }; - E9B43DD624619D5100824E51 /* chash.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059006B1DC8E00400FB4085 /* chash.c */; }; - E9B43DD724619D5100824E51 /* gf128.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900AA1DC941D700FB4085 /* gf128.c */; }; - E9B43DD824619D5100824E51 /* curve25519.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900391DC8D46A00FB4085 /* curve25519.c */; }; - E9B43DD924619D5100824E51 /* gcm.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900A91DC941D700FB4085 /* gcm.c */; }; - E9B43DDA24619D5100824E51 /* modes.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900AF1DC9438200FB4085 /* modes.c */; }; - E9B43DDB24619D5100824E51 /* blockwise.c in Sources */ = {isa = PBXBuildFile; fileRef = 105900291DC8D39800FB4085 /* blockwise.c */; }; E9B43DE324619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = 
E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE424619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE524619E1600824E51 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; E9B43DE724652D2000824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DBF24619D1700824E51 /* fusion.c */; }; + E9B43E0C24689E8900824E51 /* fusionbench.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43E0B24689E8900824E51 /* fusionbench.c */; }; + E9B43E0D24689EDA00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DBF24619D1700824E51 /* fusion.c */; }; + E9B43E0E24689F9500824E51 /* picotls-probes.d in Sources */ = {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; + E9B43E0F24689FAC00824E51 /* picotls.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530BF1D998641005B2C60 /* picotls.c */; }; + E9B43E1024689FE700824E51 /* picotls.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530BF1D998641005B2C60 /* picotls.c */; }; E9BC76CF1EF3A35E00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D21EF3A36A00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D41EF3A37200EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; @@ -242,6 +224,15 @@ ); runOnlyForDeploymentPostprocessing = 1; }; + E9B43E0624689E5900824E51 /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = /usr/share/man/man1/; + dstSubfolderSpec = 0; + files = ( + ); + runOnlyForDeploymentPostprocessing = 1; + }; /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ @@ -310,6 +301,8 @@ E9B43DE124619D5100824E51 /* test-fusion */ = {isa = PBXFileReference; explicitFileType = 
"compiled.mach-o.executable"; includeInIndex = 0; path = "test-fusion"; sourceTree = BUILT_PRODUCTS_DIR; }; E9B43DE224619D7E00824E51 /* fusion.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = fusion.c; sourceTree = ""; }; E9B43DE62461A06800824E51 /* fusion.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = fusion.h; sourceTree = ""; }; + E9B43E0A24689E5900824E51 /* fusionbench */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = fusionbench; sourceTree = BUILT_PRODUCTS_DIR; }; + E9B43E0B24689E8900824E51 /* fusionbench.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fusionbench.c; sourceTree = ""; }; E9BC76C61EF3A2F700EB7A09 /* chacha20.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = chacha20.c; path = src/chacha20.c; sourceTree = ""; }; E9BC76CC1EF3A31000EB7A09 /* salsa20.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = salsa20.h; path = src/salsa20.h; sourceTree = ""; }; E9BC76D61EF3C1C200EB7A09 /* poly1305.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = poly1305.c; path = src/poly1305.c; sourceTree = ""; }; @@ -388,6 +381,13 @@ ); runOnlyForDeploymentPostprocessing = 0; }; + E9B43E0524689E5900824E51 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ @@ -465,6 +465,7 @@ 10EACB171DCEAF0F00CA0341 /* libpicotls-minicrypto.a */, E992F7A920E99A7C0008154D /* picotls-esni */, E9B43DE124619D5100824E51 /* test-fusion */, + E9B43E0A24689E5900824E51 /* fusionbench */, ); name = Products; sourceTree = ""; @@ -502,6 +503,7 @@ 106530FE1DAD8A3C005B2C60 /* cli.c */, E97577072213148800D1EF74 /* e2e.t */, E9B43DE224619D7E00824E51 /* 
fusion.c */, + E9B43E0B24689E8900824E51 /* fusionbench.c */, 106530E91D9B7C13005B2C60 /* picotls.c */, 1059003D1DC8D4E300FB4085 /* minicrypto.c */, 106530C51D9B1A98005B2C60 /* openssl.c */, @@ -795,6 +797,23 @@ productReference = E9B43DE124619D5100824E51 /* test-fusion */; productType = "com.apple.product-type.tool"; }; + E9B43DE824689E5900824E51 /* fusionbench */ = { + isa = PBXNativeTarget; + buildConfigurationList = E9B43E0724689E5900824E51 /* Build configuration list for PBXNativeTarget "fusionbench" */; + buildPhases = ( + E9B43DE924689E5900824E51 /* Sources */, + E9B43E0524689E5900824E51 /* Frameworks */, + E9B43E0624689E5900824E51 /* CopyFiles */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = fusionbench; + productName = "test-crypto-openssl"; + productReference = E9B43E0A24689E5900824E51 /* fusionbench */; + productType = "com.apple.product-type.tool"; + }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ @@ -833,6 +852,7 @@ 105900411DC8D57000FB4085 /* test-minicrypto */, E992F79B20E99A7C0008154D /* picotls-esni */, E9B43DC024619D5100824E51 /* test-fusion */, + E9B43DE824689E5900824E51 /* fusionbench */, ); }; /* End PBXProject section */ @@ -978,32 +998,21 @@ buildActionMask = 2147483647; files = ( E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */, - E9B43DC324619D5100824E51 /* aes.c in Sources */, - E9B43DC424619D5100824E51 /* pembase64.c in Sources */, + E9B43E1024689FE700824E51 /* picotls.c in Sources */, E9B43DE724652D2000824E51 /* fusion.c in Sources */, E9B43DE424619D7E00824E51 /* fusion.c in Sources */, - E9B43DC524619D5100824E51 /* ffx.c in Sources */, - E9B43DC624619D5100824E51 /* picotls.c in Sources */, - E9B43DC724619D5100824E51 /* uECC.c in Sources */, E9B43DC824619D5100824E51 /* picotest.c in Sources */, - E9B43DC924619D5100824E51 /* sha256.c in Sources */, - E9B43DCA24619D5100824E51 /* chacha20.c in Sources */, - E9B43DCB24619D5100824E51 /* chacha20.c in Sources */, - E9B43DCD24619D5100824E51 /* asn1.c 
in Sources */, - E9B43DCE24619D5100824E51 /* sha512.c in Sources */, - E9B43DCF24619D5100824E51 /* aes256.c in Sources */, - E9B43DD024619D5100824E51 /* random.c in Sources */, - E9B43DD124619D5100824E51 /* aes128.c in Sources */, - E9B43DD224619D5100824E51 /* drbg.c in Sources */, - E9B43DD324619D5100824E51 /* x25519.c in Sources */, - E9B43DD424619D5100824E51 /* poly1305.c in Sources */, - E9B43DD524619D5100824E51 /* hmac.c in Sources */, - E9B43DD624619D5100824E51 /* chash.c in Sources */, - E9B43DD724619D5100824E51 /* gf128.c in Sources */, - E9B43DD824619D5100824E51 /* curve25519.c in Sources */, - E9B43DD924619D5100824E51 /* gcm.c in Sources */, - E9B43DDA24619D5100824E51 /* modes.c in Sources */, - E9B43DDB24619D5100824E51 /* blockwise.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + E9B43DE924689E5900824E51 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + E9B43E0E24689F9500824E51 /* picotls-probes.d in Sources */, + E9B43E0F24689FAC00824E51 /* picotls.c in Sources */, + E9B43E0D24689EDA00824E51 /* fusion.c in Sources */, + E9B43E0C24689E8900824E51 /* fusionbench.c in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -1340,6 +1349,26 @@ }; name = Release; }; + E9B43E0824689E5900824E51 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)"; + OTHER_CFLAGS = "-march=native"; + OTHER_LDFLAGS = ""; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Debug; + }; + E9B43E0924689E5900824E51 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)"; + OTHER_CFLAGS = "-march=native"; + OTHER_LDFLAGS = ""; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Release; + }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ @@ -1424,6 +1453,15 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; + E9B43E0724689E5900824E51 /* Build 
configuration list for PBXNativeTarget "fusionbench" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + E9B43E0824689E5900824E51 /* Debug */, + E9B43E0924689E5900824E51 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; /* End XCConfigurationList section */ }; rootObject = 106530AA1D9985E0005B2C60 /* Project object */; From ae95e4c33a9eeadb0dd7ab2fe14357161f5d5e5d Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 11 May 2020 06:27:27 +0900 Subject: [PATCH 27/60] be explicit about the origin --- lib/fusion.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/fusion.c b/lib/fusion.c index dfec083b6..9f75a0f20 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -63,7 +63,8 @@ static const uint8_t bswap64_[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, static const uint8_t one64_[16] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0, 1}; #define one64 (*(__m128i *)one64_) -// This function is covered by the Apache License and the MIT License. See Above. +/* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl + * at commit 33388b4. 
*/ static __m128i transformH(__m128i H) { // # <<1 twist From 9f2fb301a67e239232d85b35fdd57cc740423213 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 11 May 2020 13:13:26 +0900 Subject: [PATCH 28/60] CTR mode --- include/picotls/fusion.h | 4 +-- lib/fusion.c | 78 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 3150f332d..60b16a4c3 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -30,10 +30,10 @@ extern "C" { #include #include "../picotls.h" -#define PTLS_FUSION_AESGCM_ROUNDS 10 /* TODO support AES256 */ +#define PTLS_FUSION_AES_ROUNDS 10 /* TODO support AES256 */ typedef struct ptls_fusion_aesecb_context { - __m128i keys[PTLS_FUSION_AESGCM_ROUNDS + 1]; + __m128i keys[PTLS_FUSION_AES_ROUNDS + 1]; } ptls_fusion_aesecb_context_t; typedef struct ptls_fusion_aesgcm_context ptls_fusion_aesgcm_context_t; diff --git a/lib/fusion.c b/lib/fusion.c index 9f75a0f20..9e2a407d8 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -625,6 +625,18 @@ static __m128i expand_key(__m128i key, __m128i t) return _mm_xor_si128(key, t); } +static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v) +{ + size_t i; + + v = _mm_xor_si128(v, ctx->keys[0]); + for (i = 1; i < PTLS_FUSION_AES_ROUNDS; ++i) + v = _mm_aesenc_si128(v, ctx->keys[i]); + v = _mm_aesenclast_si128(v, ctx->keys[i]); + + return v; +} + void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key) { size_t i = 0; @@ -665,10 +677,7 @@ ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t ptls_fusion_aesecb_init(&ctx->ecb, key); ctx->ghash_cnt = ghash_cnt; - ctx->ghash[0].H = ctx->ecb.keys[0]; - for (size_t i = 1; i < PTLS_FUSION_AESGCM_ROUNDS; ++i) - ctx->ghash[0].H = _mm_aesenc_si128(ctx->ghash[0].H, ctx->ecb.keys[i]); - ctx->ghash[0].H = _mm_aesenclast_si128(ctx->ghash[0].H, ctx->ecb.keys[PTLS_FUSION_AESGCM_ROUNDS]); + 
ctx->ghash[0].H = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128()); ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8); ctx->ghash[0].H = transformH(ctx->ghash[0].H); @@ -691,6 +700,55 @@ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx) free(ctx); } +struct ctr_context { + ptls_cipher_context_t super; + ptls_fusion_aesecb_context_t fusion; + __m128i bits; + uint8_t is_ready; +}; + +static void ctr_dispose(ptls_cipher_context_t *_ctx) +{ + struct ctr_context *ctx = (struct ctr_context *)_ctx; + ptls_fusion_aesecb_dispose(&ctx->fusion); + _mm_storeu_si128(&ctx->bits, _mm_setzero_si128()); +} + +static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv) +{ + struct ctr_context *ctx = (struct ctr_context *)_ctx; + _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv))); + ctx->is_ready = 1; +} + +static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len) +{ + struct ctr_context *ctx = (struct ctr_context *)_ctx; + + assert((ctx->is_ready && len <= 16) || + !"CTR transfomation is supported only once per call to `init` and the maximum size is limited to 16 bytes"); + ctx->is_ready = 0; + + if (len < 16) { + storen(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn(input, len))); + } else { + _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input))); + } +} + +static int aes128ctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key) +{ + struct ctr_context *ctx = (struct ctr_context *)_ctx; + + ctx->super.do_dispose = ctr_dispose; + ctx->super.do_init = ctr_init; + ctx->super.do_transform = ctr_transform; + ptls_fusion_aesecb_init(&ctx->fusion, key); + ctx->is_ready = 0; + + return 0; +} + struct aesgcm_context { ptls_aead_context_t super; ptls_fusion_aesgcm_context_t *aesgcm; @@ -727,7 +785,7 @@ static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const vo return SIZE_MAX; } -static int 
aes128gcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key) +static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; @@ -750,11 +808,17 @@ static int aes128gcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const v return 0; } +ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR", + PTLS_AES128_KEY_SIZE, + 1, // block size + PTLS_AES_IV_SIZE, + sizeof(struct ctr_context), + aes128ctr_setup}; ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", - NULL, // &ptls_fusion_aes128ctr, + &ptls_fusion_aes128ctr, NULL, // &ptls_fusion_aes128ecb, PTLS_AES128_KEY_SIZE, PTLS_AESGCM_IV_SIZE, PTLS_AESGCM_TAG_SIZE, sizeof(struct aesgcm_context), - aes128gcm_setup_crypto}; + aes128gcm_setup}; From 94feca2f137bd5c4787aa30ce785df59a488792f Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 11 May 2020 16:34:44 +0900 Subject: [PATCH 29/60] expose fusion to the picotls API --- include/picotls.h | 58 ++++++++++++++++++++++--- include/picotls/fusion.h | 23 +++++----- lib/cifra/aes-common.h | 5 ++- lib/cifra/chacha20.c | 5 ++- lib/fusion.c | 92 ++++++++++++++++++++++++++-------------- lib/openssl.c | 5 ++- lib/picotls.c | 13 +++--- t/fusion.c | 13 +++--- t/fusionbench.c | 4 +- 9 files changed, 148 insertions(+), 70 deletions(-) diff --git a/include/picotls.h b/include/picotls.h index e04c2dcee..46c22148d 100644 --- a/include/picotls.h +++ b/include/picotls.h @@ -32,6 +32,7 @@ extern "C" { #include #include +#include #include #if __GNUC__ >= 3 @@ -310,8 +311,10 @@ typedef struct st_ptls_aead_context_t { void (*do_encrypt_init)(struct st_ptls_aead_context_t *ctx, const void *iv, const void *aad, size_t aadlen); size_t (*do_encrypt_update)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen); size_t (*do_encrypt_final)(struct st_ptls_aead_context_t *ctx, void *output); + void (*do_encrypt)(struct st_ptls_aead_context_t *ctx, void 
*output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); size_t (*do_decrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen); + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); } ptls_aead_context_t; /** @@ -1190,8 +1193,10 @@ void ptls_aead_free(ptls_aead_context_t *ctx); /** * */ -size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, - size_t aadlen); +static size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen); +void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, + size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); /** * initializes the internal state of the encryptor */ @@ -1212,6 +1217,8 @@ static size_t ptls_aead_encrypt_final(ptls_aead_context_t *ctx, void *output); */ static size_t ptls_aead_decrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen); +static size_t ptls_aead_decrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); /** * Return the current read epoch. 
*/ @@ -1239,10 +1246,19 @@ int ptls_client_handle_message(ptls_t *tls, ptls_buffer_t *sendbuf, size_t epoch size_t inlen, ptls_handshake_properties_t *properties); int ptls_server_handle_message(ptls_t *tls, ptls_buffer_t *sendbuf, size_t epoch_offsets[5], size_t in_epoch, const void *input, size_t inlen, ptls_handshake_properties_t *properties); +/** + * internal + */ +static void ptls_aead__encrypt_supp(ptls_cipher_context_t *ctx, void *vec); /** * internal */ void ptls_aead__build_iv(ptls_aead_context_t *ctx, uint8_t *iv, uint64_t seq); +/** + * + */ +static void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); /** * internal */ @@ -1365,6 +1381,22 @@ inline void ptls_cipher_encrypt(ptls_cipher_context_t *ctx, void *output, const ctx->do_transform(ctx, output, input, len); } +inline void ptls_aead__encrypt_supp(ptls_cipher_context_t *ctx, void *vec) +{ + if (ctx != NULL) { + ptls_cipher_init(ctx, vec); + memset(vec, 0, ctx->algo->iv_size); + ptls_cipher_encrypt(ctx, vec, vec, ctx->algo->iv_size); + } +} + +inline size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen) +{ + ptls_aead_encrypt_s(ctx, output, input, inlen, seq, aad, aadlen, NULL, NULL); + return inlen + ctx->algo->tag_size; +} + inline void ptls_aead_encrypt_init(ptls_aead_context_t *ctx, uint64_t seq, const void *aad, size_t aadlen) { uint8_t iv[PTLS_MAX_IV_SIZE]; @@ -1383,13 +1415,27 @@ inline size_t ptls_aead_encrypt_final(ptls_aead_context_t *ctx, void *output) return ctx->do_encrypt_final(ctx, output); } -inline size_t ptls_aead_decrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, - const void *aad, size_t aadlen) +inline void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, 
const void *iv, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) +{ + ctx->do_encrypt_init(ctx, iv, aad, aadlen); + ctx->do_encrypt_update(ctx, output, input, inlen); + ctx->do_encrypt_final(ctx, (uint8_t *)output + inlen); +} + +inline size_t ptls_aead_decrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) { uint8_t iv[PTLS_MAX_IV_SIZE]; ptls_aead__build_iv(ctx, iv, seq); - return ctx->do_decrypt(ctx, output, input, inlen, iv, aad, aadlen); + return ctx->do_decrypt(ctx, output, input, inlen, iv, aad, aadlen, suppkey, suppvec); +} + +inline size_t ptls_aead_decrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen) +{ + return ptls_aead_decrypt_s(ctx, output, input, inlen, seq, aad, aadlen, NULL, NULL); } #define ptls_define_hash(name, ctx_type, init_func, update_func, final_func) \ diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 60b16a4c3..dfa4fb5d5 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -53,31 +53,32 @@ ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); /** * Encrypts an AEAD block, and in parallel, optionally encrypts one block using AES-ECB. 
+ * @param ctx context + * @param output output buffer + * @param input payload to be encrypted + * @param inlen size of the payload to be encrypted * @param iv initialization vector of 12 bytes * @param aad AAD * @param aadlen size of AAD - * @param dst output buffer - * @param src payload to be encrypted - * @param srclen size of the payload to be encrypted * @param suppkey (optional) points to an AES-ECB context used for generating suppvec * @param suppvec (optional) vector to be encrypted using suppkey */ -void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *dst, - const void *src, size_t srclen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec); +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec); /** - * Decrypts an AEAD block, an in parallel, optionally encrypts one block using AES-ECB. + * Decrypts an AEAD block, an in parallel, optionally encrypts one block using AES-ECB. Returns if decryption was successful. 
* @param iv initialization vector of 12 bytes + * @param output output buffer + * @param src payload to be decrypted + * @param inlen size of the payload to be decrypted * @param aad AAD * @param aadlen size of AAD - * @param dst output buffer - * @param src payload to be encrypted - * @param srclen size of the payload to be decrypted * @param tag the AEAD tag being received from peer * @param suppkey (optional) points to an AES-ECB context used for generating suppvec * @param suppvec (optional) vector to be encrypted using suppkey */ -int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *dst, - const void *src, size_t srclen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, +int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, void *suppvec); extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; diff --git a/lib/cifra/aes-common.h b/lib/cifra/aes-common.h index 9823337ed..a0031987c 100644 --- a/lib/cifra/aes-common.h +++ b/lib/cifra/aes-common.h @@ -133,8 +133,10 @@ static inline size_t aesgcm_encrypt_final(ptls_aead_context_t *_ctx, void *outpu } static inline size_t aesgcm_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen) + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) { + ptls_aead__encrypt_supp(suppkey, suppvec); + struct aesgcm_context_t *ctx = (struct aesgcm_context_t *)_ctx; if (inlen < PTLS_AESGCM_TAG_SIZE) @@ -157,6 +159,7 @@ static inline int aead_aesgcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc ctx->super.do_encrypt_init = aesgcm_encrypt_init; ctx->super.do_encrypt_update = aesgcm_encrypt_update; ctx->super.do_encrypt_final = aesgcm_encrypt_final; + ctx->super.do_encrypt = ptls_aead__do_encrypt; 
ctx->super.do_decrypt = NULL; } else { ctx->super.do_encrypt_init = NULL; diff --git a/lib/cifra/chacha20.c b/lib/cifra/chacha20.c index ab956fae3..3e624b5ba 100644 --- a/lib/cifra/chacha20.c +++ b/lib/cifra/chacha20.c @@ -150,12 +150,14 @@ static size_t chacha20poly1305_encrypt_final(ptls_aead_context_t *_ctx, void *ou } static size_t chacha20poly1305_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen) + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) { struct chacha20poly1305_context_t *ctx = (struct chacha20poly1305_context_t *)_ctx; uint8_t tag[PTLS_CHACHA20POLY1305_TAG_SIZE]; size_t ret; + ptls_aead__encrypt_supp(suppkey, suppvec); + if (inlen < sizeof(tag)) return SIZE_MAX; @@ -187,6 +189,7 @@ static int aead_chacha20poly1305_setup_crypto(ptls_aead_context_t *_ctx, int is_ ctx->super.do_encrypt_init = chacha20poly1305_init; ctx->super.do_encrypt_update = chacha20poly1305_encrypt_update; ctx->super.do_encrypt_final = chacha20poly1305_encrypt_final; + ctx->super.do_encrypt = ptls_aead__do_encrypt; ctx->super.do_decrypt = NULL; } else { ctx->super.do_encrypt_init = NULL; diff --git a/lib/fusion.c b/lib/fusion.c index 9e2a407d8..2886b3b66 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -166,6 +166,18 @@ static inline __m128i gfmul_final(struct ptls_fusion_gfmul_state *gstate, __m128 return tag; } +static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v) +{ + size_t i; + + v = _mm_xor_si128(v, ctx->keys[0]); + for (i = 1; i < PTLS_FUSION_AES_ROUNDS; ++i) + v = _mm_aesenc_si128(v, ctx->keys[i]); + v = _mm_aesenclast_si128(v, ctx->keys[i]); + + return v; +} + static inline __m128i loadn(const void *_p, size_t l) { const uint8_t *p = _p; @@ -186,8 +198,8 @@ static inline void storen(void *_p, size_t l, __m128i v) p[i] = buf[i]; } -void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, 
size_t aadlen, void *_dst, - const void *_src, size_t srclen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec) +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *_aad, size_t aadlen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec) { /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ #define AESECB6_INIT() \ @@ -254,14 +266,15 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i const __m128i *bits4keys = ctx->ecb.keys; /* is changed to suppkey->keys when calcurating suppout */ struct ptls_fusion_gfmul_state gstate = {}; __m128i gdatabuf[6]; - __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8), bswap64); + __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)inlen * 8, 0, (int)aadlen * 8), bswap64); const __m128i *gdata; // points to the elements fed into GHASH size_t gdata_cnt; // src and dst are updated after the chunk is processed - const __m128i *src = _src; - __m128i *dst = _dst; + const __m128i *src = input; + __m128i *dst = output; + size_t srclen = inlen; // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed indo the processor) const __m128i *aad = _aad, *dst_ghash = dst; size_t dst_ghashlen = srclen; @@ -419,8 +432,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *i #undef STATE_SUPP_IN_PROCESS } -int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst, - const void *_src, size_t _srclen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, +int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *_aad, size_t aadlen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, void *suppvec) { __m128i ctr, ek0 = 
_mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), @@ -428,15 +441,15 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv const __m128i *bits1keys; struct ptls_fusion_gfmul_state gstate = {}; __m128i gdatabuf[6]; - __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)_srclen * 8, 0, (int)aadlen * 8), bswap64); - struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (_srclen + 15) / 16 + 1; + __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)inlen * 8, 0, (int)aadlen * 8), bswap64); + struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1; const __m128i *gdata; // points to the elements fed into GHASH size_t gdata_cnt; - const __m128i *src_ghash = _src, *src_aes = _src, *aad = _aad; - __m128i *dst = _dst; - size_t nondata_aes_cnt = 0, src_ghashlen = _srclen, src_aeslen = _srclen; + const __m128i *src_ghash = input, *src_aes = input, *aad = _aad; + __m128i *dst = output; + size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen; /* build counter */ ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); @@ -491,10 +504,11 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv if (src_ghashlen < 16) { if (src_ghashlen != 0) { gdatabuf[gdata_cnt++] = loadn(src_ghash, src_ghashlen); + src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen); src_ghashlen = 0; } - if (gdata_cnt < 6 && (state & STATE_IS_FIRST_RUN) == 0) { - gdatabuf[gdata_cnt++] = ek0; + if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) { + gdatabuf[gdata_cnt++] = ac; state &= ~STATE_GHASH_HAS_MORE; } break; @@ -577,14 +591,16 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv switch (nondata_aes_cnt) { #define APPLY(i) \ case i: \ - if (PTLS_LIKELY(src_aeslen >= 16)) { \ + if (PTLS_LIKELY(src_aeslen > 16)) { \ _mm_storeu_si128(dst++, 
_mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \ src_aeslen -= 16; \ } else { \ - if (src_aeslen != 0) { \ + if (src_aeslen == 16) { \ + _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits##i)); \ + } else if (src_aeslen != 0) { \ storen(dst, src_aeslen, _mm_xor_si128(loadn(src_aes, src_aeslen), bits##i)); \ - src_aeslen = 0; \ } \ + src_aeslen = 0; \ goto Finish; \ } APPLY(0); @@ -625,18 +641,6 @@ static __m128i expand_key(__m128i key, __m128i t) return _mm_xor_si128(key, t); } -static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v) -{ - size_t i; - - v = _mm_xor_si128(v, ctx->keys[0]); - for (i = 1; i < PTLS_FUSION_AES_ROUNDS; ++i) - v = _mm_aesenc_si128(v, ctx->keys[i]); - v = _mm_aesenclast_si128(v, ctx->keys[i]); - - return v; -} - void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key) { size_t i = 0; @@ -778,11 +782,32 @@ static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) return SIZE_MAX; } -static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen) +void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) { - assert(!"FIXME"); - return SIZE_MAX; + ptls_fusion_aesgcm_context_t *aesgcm = ((struct aesgcm_context *)_ctx)->aesgcm; + ptls_fusion_aesecb_context_t *aesecb = suppkey != NULL ? 
&((struct ctr_context *)suppkey)->fusion : NULL; + + ptls_fusion_aesgcm_encrypt(aesgcm, output, input, inlen, iv, aad, aadlen, aesecb, suppvec); +} + +static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) +{ + ptls_fusion_aesgcm_context_t *aesgcm = ((struct aesgcm_context *)_ctx)->aesgcm; + struct ptls_fusion_aesecb_context *aesecb = suppkey != NULL ? &((struct ctr_context *)suppkey)->fusion : NULL; + + if (inlen < 16) { + if (suppkey != NULL) + _mm_storeu_si128(suppvec, aesecb_encrypt(aesecb, _mm_loadu_si128(suppvec))); + return SIZE_MAX; + } + + size_t enclen = inlen - 16; + if (!ptls_fusion_aesgcm_decrypt(aesgcm, output, input, enclen, iv, aad, aadlen, (const uint8_t *)input + enclen, aesecb, + suppvec)) + return SIZE_MAX; + return enclen; } static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key) @@ -794,6 +819,7 @@ static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *ke ctx->super.do_encrypt_init = aead_do_encrypt_init; ctx->super.do_encrypt_update = aead_do_encrypt_update; ctx->super.do_encrypt_final = aead_do_encrypt_final; + ctx->super.do_encrypt = aead_do_encrypt; ctx->super.do_decrypt = NULL; } else { ctx->super.do_encrypt_init = NULL; diff --git a/lib/openssl.c b/lib/openssl.c index e7ae6274b..863628e17 100644 --- a/lib/openssl.c +++ b/lib/openssl.c @@ -823,8 +823,10 @@ static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) } static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen) + const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) { + ptls_aead__encrypt_supp(suppkey, suppvec); + struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *)_ctx; uint8_t *output = _output; size_t off = 0, tag_size = 
ctx->super.algo->tag_size; @@ -861,6 +863,7 @@ static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void * ctx->super.do_encrypt_init = aead_do_encrypt_init; ctx->super.do_encrypt_update = aead_do_encrypt_update; ctx->super.do_encrypt_final = aead_do_encrypt_final; + ctx->super.do_encrypt = ptls_aead__do_encrypt; ctx->super.do_decrypt = NULL; } else { ctx->super.do_encrypt_init = NULL; diff --git a/lib/picotls.c b/lib/picotls.c index 78fa0f15a..f0bd6a6b3 100644 --- a/lib/picotls.c +++ b/lib/picotls.c @@ -5134,16 +5134,13 @@ void ptls_aead_free(ptls_aead_context_t *ctx) free(ctx); } -size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, - size_t aadlen) +void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, + size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) { - size_t off = 0; - - ptls_aead_encrypt_init(ctx, seq, aad, aadlen); - off += ptls_aead_encrypt_update(ctx, ((uint8_t *)output) + off, input, inlen); - off += ptls_aead_encrypt_final(ctx, ((uint8_t *)output) + off); + uint8_t iv[PTLS_MAX_IV_SIZE]; - return off; + ptls_aead__build_iv(ctx, iv, seq); + ctx->do_encrypt(ctx, output, input, inlen, iv, aad, aadlen, suppkey, suppvec); } void ptls_aead__build_iv(ptls_aead_context_t *ctx, uint8_t *iv, uint64_t seq) diff --git a/t/fusion.c b/t/fusion.c index 7bebe94de..c9587600f 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -57,12 +57,11 @@ int main(int argc, char **argv) ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); uint8_t encrypted[sizeof(expected)], decrypted[sizeof(expected) - 16]; - ptls_fusion_aesgcm_encrypt(ctx, zero, "hello", 5, encrypted, zero, 16, NULL, NULL); + ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, zero, "hello", 5, NULL, NULL); ok(memcmp(expected, encrypted, sizeof(expected)) == 0); memset(decrypted, 0x55, sizeof(decrypted)); - 
ok(ptls_fusion_aesgcm_decrypt(ctx, zero, "hello", 5, decrypted, expected, sizeof(expected) - 16, - expected + sizeof(expected) - 16, NULL, NULL)); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, zero, "hello", 5, expected + 16, NULL, NULL)); ok(memcmp(decrypted, zero, sizeof(decrypted)) == 0); ptls_fusion_aesgcm_destroy(ctx); @@ -73,9 +72,9 @@ int main(int argc, char **argv) 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); uint8_t encrypted[17], decrypted[1] = {0x55}; - ptls_fusion_aesgcm_encrypt(ctx, zero, "a", 1, encrypted, "X", 1, NULL, NULL); + ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, zero, "a", 1, NULL, NULL); ok(memcmp(expected, encrypted, 17) == 0); - ok(ptls_fusion_aesgcm_decrypt(ctx, zero, "a", 1, decrypted, expected, 1, expected + 1, NULL, NULL)); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, zero, "a", 1, expected + 1, NULL, NULL)); ok('X' == decrypted[0]); ptls_fusion_aesgcm_destroy(ctx); } @@ -89,7 +88,7 @@ int main(int argc, char **argv) #define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ do { \ memset(ecbvec, 0, sizeof(ecbvec)); \ - ptls_fusion_aesgcm_encrypt(aead, iv, aad, aadlen, encrypted, zero, ptlen, ecb, &ecbvec); \ + ptls_fusion_aesgcm_encrypt(aead, encrypted, zero, ptlen, iv, aad, aadlen, ecb, &ecbvec); \ ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ if (i == 0) { \ ok(memcmp(ecbvec, zero, sizeof(ecbvec)) == 0); \ @@ -97,7 +96,7 @@ int main(int argc, char **argv) ok(strcmp(tostr(ecbvec, sizeof(ecbvec)), "b6aeaffa752dc08b51639731761aed00") == 0); \ } \ memset(decrypted, 0x55, sizeof(decrypted)); \ - ok(!ptls_fusion_aesgcm_decrypt(aead, iv, aad, aadlen, decrypted, encrypted, ptlen, zero, NULL, NULL)); \ + ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, ptlen, iv, aad, aadlen, encrypted + ptlen, NULL, NULL)); \ ok(memcmp(decrypted, zero, ptlen) == 0); \ } while (0) diff --git a/t/fusionbench.c 
b/t/fusionbench.c index 5ebad53e4..2b6711a1c 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -56,10 +56,10 @@ int main(int argc, char **argv) if (!decrypt) { for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_encrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, suppkey, suppvec); + ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), suppkey, suppvec); } else { for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_decrypt(ctx, iv, aad, sizeof(aad), text, text, textlen, text + textlen, suppkey, suppvec); + ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), suppkey, suppvec); } for (int i = 0; i < 16; ++i) From 4879386228368c9ab76c55834b57332228dc2c1d Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 12 May 2020 11:39:28 +0900 Subject: [PATCH 30/60] unaligned access --- lib/fusion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 2886b3b66..0410842fb 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -396,7 +396,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, /* run AES and multiplication in parallel */ for (size_t i = 2; i <= 7; ++i) { AESECB6_UPDATE(i); - gfmul_onestep(&gstate, *gdata++, --ghash_precompute); + gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); } AESECB6_UPDATE(8); AESECB6_UPDATE(9); @@ -551,7 +551,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, size_t aesi; for (aesi = 1; aesi <= gdata_cnt; ++aesi) { AESECB6_UPDATE(aesi); - gfmul_onestep(&gstate, *gdata++, --ghash_precompute); + gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); } for (; aesi <= 9; ++aesi) AESECB6_UPDATE(aesi); From faedb810e4b5a3768a84d1f1b52244b50fd6b453 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 12 May 2020 11:39:58 +0900 Subject: [PATCH 31/60] remove unnecessary assert --- lib/fusion.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/fusion.c 
b/lib/fusion.c index 0410842fb..ced56fe61 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -828,7 +828,6 @@ static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *ke ctx->super.do_decrypt = aead_do_decrypt; } - assert(is_enc); ctx->aesgcm = ptls_fusion_aesgcm_create(key, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ return 0; From 66a95e5ecbe3dfcb31b8bb7d79f2dead915921c1 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 12 May 2020 12:57:37 +0900 Subject: [PATCH 32/60] apply XOR --- lib/fusion.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index ced56fe61..90b0bdcfb 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -457,7 +457,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, /* schedule ek0 and suppkey */ ctr = _mm_add_epi64(ctr, one64); - bits0 = _mm_shuffle_epi8(ctr, bswap64); + bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap64), ctx->ecb.keys[0]); ++nondata_aes_cnt; if (suppkey != NULL) { bits1keys = suppkey->keys; @@ -522,17 +522,17 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, if (PTLS_LIKELY(nondata_aes_cnt == 0)) goto InitAllBits; switch (nondata_aes_cnt) { -#define INIT_BITS(n) \ +#define INIT_BITS(n, keys) \ case n: \ ctr = _mm_add_epi64(ctr, one64); \ - bits##n = _mm_shuffle_epi8(ctr, bswap64); + bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap64), keys[0]); InitAllBits: - INIT_BITS(0); - INIT_BITS(1); - INIT_BITS(2); - INIT_BITS(3); - INIT_BITS(4); - INIT_BITS(5); + INIT_BITS(0, ctx->ecb.keys); + INIT_BITS(1, bits1keys); + INIT_BITS(2, ctx->ecb.keys); + INIT_BITS(3, ctx->ecb.keys); + INIT_BITS(4, ctx->ecb.keys); + INIT_BITS(5, ctx->ecb.keys); #undef INIT_BITS } From e68d6a32596601649691212154e4dbf198077258 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 12 May 2020 15:02:07 +0900 Subject: [PATCH 33/60] handle non-zero vectors --- lib/fusion.c | 9 
++++----- t/fusion.c | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 90b0bdcfb..b51e6b066 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -288,10 +288,9 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, #define STATE_SUPP_IN_PROCESS 0x4 /* build counter */ - ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); - ctr = _mm_shuffle_epi8(ctr, bswap8); - ctr = _mm_add_epi64(ctr, one64); - ek0 = _mm_shuffle_epi8(ctr, bswap64); + ek0 = loadn(iv, PTLS_AESGCM_IV_SIZE); + ek0 = _mm_insert_epi16(ek0, 0x100, 7); + ctr = _mm_shuffle_epi8(ek0, bswap64); /* prepare the first bit stream */ AESECB6_INIT(); @@ -453,7 +452,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, /* build counter */ ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); - ctr = _mm_shuffle_epi8(ctr, bswap8); + ctr = _mm_shuffle_epi8(ctr, bswap64); /* schedule ek0 and suppkey */ ctr = _mm_add_epi64(ctr, one64); diff --git a/t/fusion.c b/t/fusion.c index c9587600f..d5d34c79e 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -122,6 +122,8 @@ int main(int argc, char **argv) DOIT(zero, zero, 0, 80, "98885a3a22bd4742fe7b72172193b163"); DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc"); + DOIT(zero, zero, 20, 85, "afe8b727057c804a0525c2914ef856b0"); + #undef DOIT ecb = malloc(sizeof(*ecb)); @@ -133,5 +135,30 @@ int main(int argc, char **argv) ptls_fusion_aesgcm_destroy(aead); } + { + static const uint8_t key[16] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, + aad[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + iv[] = {20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, + plaintext[] = + "hello world\nhello world\nhello world\nhello world\nhello world\nhello world\nhello world\n"; + static const uint8_t expected[] = {0xd3, 0xa8, 0x1d, 0x96, 0x4c, 0x9b, 0x02, 0xd7, 0x9a, 0xb0, 0x41, 0x07, 0x4c, 0x8c, 0xe2, + 0xe0, 
0x2e, 0x83, 0x54, 0x52, 0x45, 0xcb, 0xd4, 0x68, 0xc8, 0x43, 0x45, 0xca, 0x91, 0xfb, + 0xa3, 0x7a, 0x67, 0xed, 0xe8, 0xd7, 0x5e, 0xe2, 0x33, 0xd1, 0x3e, 0xbf, 0x50, 0xc2, 0x4b, + 0x86, 0x83, 0x55, 0x11, 0xbb, 0x17, 0x4f, 0xf5, 0x78, 0xb8, 0x65, 0xeb, 0x9a, 0x2b, 0x8f, + 0x77, 0x08, 0xa9, 0x60, 0x17, 0x73, 0xc5, 0x07, 0xf3, 0x04, 0xc9, 0x3f, 0x67, 0x4d, 0x12, + 0xa1, 0x02, 0x93, 0xc2, 0x3c, 0xd3, 0xf8, 0x59, 0x33, 0xd5, 0x01, 0xc3, 0xbb, 0xaa, 0xe6, + 0x3f, 0xbb, 0x23, 0x66, 0x94, 0x26, 0x28, 0x43, 0xa5, 0xfd, 0x2f}; + ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(key, sizeof(aad) + sizeof(plaintext)); + uint8_t encrypted[sizeof(plaintext) + 16], decrypted[sizeof(plaintext)]; + ptls_fusion_aesgcm_encrypt(aead, encrypted, plaintext, sizeof(plaintext), iv, aad, sizeof(aad), NULL, NULL); + ok(memcmp(expected, encrypted, sizeof(plaintext)) == 0); + ok(memcmp(expected + sizeof(plaintext), encrypted + sizeof(plaintext), 16) == 0); + ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, sizeof(plaintext), iv, aad, sizeof(aad), + encrypted + sizeof(plaintext), NULL, NULL)); + ok(memcmp(decrypted, plaintext, sizeof(plaintext)) == 0); + ptls_fusion_aesgcm_destroy(aead); + } + return done_testing(); } From 977cf3de5fc639ef602a20b85e79893b5e71a6f8 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 12 May 2020 16:00:06 +0900 Subject: [PATCH 34/60] follow the API change --- t/fusionbench.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t/fusionbench.c b/t/fusionbench.c index 2b6711a1c..e6db38798 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -58,8 +58,9 @@ int main(int argc, char **argv) for (int i = 0; i < count; ++i) ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), suppkey, suppvec); } else { + uint8_t tag[16] = {}; for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), suppkey, suppvec); + ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, iv, 
aad, sizeof(aad), &tag, suppkey, suppvec); } for (int i = 0; i < 16; ++i) From 1cf91f68f120eb76ac335c7a0c804163f1ef7793 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Wed, 13 May 2020 15:11:14 +0900 Subject: [PATCH 35/60] delay supplementary operation until the dependent region of the AES-GCM output is obtained, remove support for supplementary operation on the decryption size (as it is impossible to use when processing one packet) --- include/picotls.h | 51 ++++++++---------- include/picotls/fusion.h | 13 ++--- lib/cifra/aes-common.h | 4 +- lib/cifra/chacha20.c | 4 +- lib/fusion.c | 111 +++++++++++++++++---------------------- lib/openssl.c | 4 +- lib/picotls.c | 4 +- t/fusion.c | 79 ++++++++++++++-------------- t/fusionbench.c | 13 ++--- 9 files changed, 125 insertions(+), 158 deletions(-) diff --git a/include/picotls.h b/include/picotls.h index 46c22148d..f4628cb22 100644 --- a/include/picotls.h +++ b/include/picotls.h @@ -299,6 +299,12 @@ typedef const struct st_ptls_cipher_algorithm_t { int (*setup_crypto)(ptls_cipher_context_t *ctx, int is_enc, const void *key); } ptls_cipher_algorithm_t; +typedef struct st_ptls_aead_supplementary_encryption_t { + ptls_cipher_context_t *ctx; + const void *input; + uint8_t output[16]; +} ptls_aead_supplementary_encryption_t; + /** * AEAD context. AEAD implementations are allowed to stuff data at the end of the struct. The size of the memory allocated for the * struct is governed by ptls_aead_algorithm_t::context_size. 
@@ -312,9 +318,9 @@ typedef struct st_ptls_aead_context_t { size_t (*do_encrypt_update)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen); size_t (*do_encrypt_final)(struct st_ptls_aead_context_t *ctx, void *output); void (*do_encrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); + const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp); size_t (*do_decrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); + const void *aad, size_t aadlen); } ptls_aead_context_t; /** @@ -1196,7 +1202,7 @@ void ptls_aead_free(ptls_aead_context_t *ctx); static size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen); void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, - size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); + size_t aadlen, ptls_aead_supplementary_encryption_t *supp); /** * initializes the internal state of the encryptor */ @@ -1217,8 +1223,6 @@ static size_t ptls_aead_encrypt_final(ptls_aead_context_t *ctx, void *output); */ static size_t ptls_aead_decrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen); -static size_t ptls_aead_decrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); /** * Return the current read epoch. 
*/ @@ -1246,10 +1250,6 @@ int ptls_client_handle_message(ptls_t *tls, ptls_buffer_t *sendbuf, size_t epoch size_t inlen, ptls_handshake_properties_t *properties); int ptls_server_handle_message(ptls_t *tls, ptls_buffer_t *sendbuf, size_t epoch_offsets[5], size_t in_epoch, const void *input, size_t inlen, ptls_handshake_properties_t *properties); -/** - * internal - */ -static void ptls_aead__encrypt_supp(ptls_cipher_context_t *ctx, void *vec); /** * internal */ @@ -1258,7 +1258,7 @@ void ptls_aead__build_iv(ptls_aead_context_t *ctx, uint8_t *iv, uint64_t seq); * */ static void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec); + const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp); /** * internal */ @@ -1381,19 +1381,10 @@ inline void ptls_cipher_encrypt(ptls_cipher_context_t *ctx, void *output, const ctx->do_transform(ctx, output, input, len); } -inline void ptls_aead__encrypt_supp(ptls_cipher_context_t *ctx, void *vec) -{ - if (ctx != NULL) { - ptls_cipher_init(ctx, vec); - memset(vec, 0, ctx->algo->iv_size); - ptls_cipher_encrypt(ctx, vec, vec, ctx->algo->iv_size); - } -} - inline size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { - ptls_aead_encrypt_s(ctx, output, input, inlen, seq, aad, aadlen, NULL, NULL); + ptls_aead_encrypt_s(ctx, output, input, inlen, seq, aad, aadlen, NULL); return inlen + ctx->algo->tag_size; } @@ -1416,26 +1407,26 @@ inline size_t ptls_aead_encrypt_final(ptls_aead_context_t *ctx, void *output) } inline void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { 
ctx->do_encrypt_init(ctx, iv, aad, aadlen); ctx->do_encrypt_update(ctx, output, input, inlen); ctx->do_encrypt_final(ctx, (uint8_t *)output + inlen); -} -inline size_t ptls_aead_decrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) -{ - uint8_t iv[PTLS_MAX_IV_SIZE]; - - ptls_aead__build_iv(ctx, iv, seq); - return ctx->do_decrypt(ctx, output, input, inlen, iv, aad, aadlen, suppkey, suppvec); + if (supp != NULL) { + ptls_cipher_init(supp->ctx, supp->input); + memset(supp->output, 0, sizeof(supp->output)); + ptls_cipher_encrypt(supp->ctx, supp->output, supp->output, sizeof(supp->output)); + } } inline size_t ptls_aead_decrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { - return ptls_aead_decrypt_s(ctx, output, input, inlen, seq, aad, aadlen, NULL, NULL); + uint8_t iv[PTLS_MAX_IV_SIZE]; + + ptls_aead__build_iv(ctx, iv, seq); + return ctx->do_decrypt(ctx, output, input, inlen, iv, aad, aadlen); } #define ptls_define_hash(name, ctx_type, init_func, update_func, final_func) \ diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index dfa4fb5d5..f7fb8eccf 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -60,27 +60,24 @@ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); * @param iv initialization vector of 12 bytes * @param aad AAD * @param aadlen size of AAD - * @param suppkey (optional) points to an AES-ECB context used for generating suppvec - * @param suppvec (optional) vector to be encrypted using suppkey + * @param supp (optional) supplementary encryption context */ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec); + const void *aad, size_t aadlen, 
ptls_aead_supplementary_encryption_t *supp); /** * Decrypts an AEAD block, an in parallel, optionally encrypts one block using AES-ECB. Returns if decryption was successful. * @param iv initialization vector of 12 bytes * @param output output buffer - * @param src payload to be decrypted + * @param input payload to be decrypted * @param inlen size of the payload to be decrypted * @param aad AAD * @param aadlen size of AAD * @param tag the AEAD tag being received from peer - * @param suppkey (optional) points to an AES-ECB context used for generating suppvec - * @param suppvec (optional) vector to be encrypted using suppkey */ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, - void *suppvec); + const void *aad, size_t aadlen, const void *tag); +extern ptls_cipher_algorithm_t ptls_fusion_aes128ctr; extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; #ifdef __cplusplus diff --git a/lib/cifra/aes-common.h b/lib/cifra/aes-common.h index a0031987c..eb1007c69 100644 --- a/lib/cifra/aes-common.h +++ b/lib/cifra/aes-common.h @@ -133,10 +133,8 @@ static inline size_t aesgcm_encrypt_final(ptls_aead_context_t *_ctx, void *outpu } static inline size_t aesgcm_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + const void *aad, size_t aadlen) { - ptls_aead__encrypt_supp(suppkey, suppvec); - struct aesgcm_context_t *ctx = (struct aesgcm_context_t *)_ctx; if (inlen < PTLS_AESGCM_TAG_SIZE) diff --git a/lib/cifra/chacha20.c b/lib/cifra/chacha20.c index 3e624b5ba..9451c6526 100644 --- a/lib/cifra/chacha20.c +++ b/lib/cifra/chacha20.c @@ -150,14 +150,12 @@ static size_t chacha20poly1305_encrypt_final(ptls_aead_context_t *_ctx, void *ou } static size_t 
chacha20poly1305_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + const void *aad, size_t aadlen) { struct chacha20poly1305_context_t *ctx = (struct chacha20poly1305_context_t *)_ctx; uint8_t tag[PTLS_CHACHA20POLY1305_TAG_SIZE]; size_t ret; - ptls_aead__encrypt_supp(suppkey, suppvec); - if (inlen < sizeof(tag)) return SIZE_MAX; diff --git a/lib/fusion.c b/lib/fusion.c index b51e6b066..572a23fc2 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -54,6 +54,18 @@ struct ptls_fusion_aesgcm_context { } ghash[0]; }; +struct ctr_context { + ptls_cipher_context_t super; + ptls_fusion_aesecb_context_t fusion; + __m128i bits; + uint8_t is_ready; +}; + +struct aesgcm_context { + ptls_aead_context_t super; + ptls_fusion_aesgcm_context_t *aesgcm; +}; + static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000}; #define poly (*(__m128i *)poly_) static const uint8_t bswap8_[16] __attribute__((aligned(16))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; @@ -199,7 +211,7 @@ static inline void storen(void *_p, size_t l, __m128i v) } void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *_aad, size_t aadlen, ptls_fusion_aesecb_context_t *suppkey, void *suppvec) + const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ #define AESECB6_INIT() \ @@ -222,10 +234,9 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, bits5 = ek0; \ state |= STATE_EK0_BEEN_FED; \ } \ - if (suppkey != NULL && srclen <= 16 * 4) { \ - bits4 = _mm_loadu_si128(suppvec); \ - bits4keys = suppkey->keys; \ - suppkey = NULL; \ + if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i 
*)supp->input + 1 <= dst_ghash) { \ + bits4 = _mm_loadu_si128(supp->input); \ + bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys; \ state |= STATE_SUPP_IN_PROCESS; \ } \ } \ @@ -263,7 +274,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } while (0) __m128i ctr, ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); - const __m128i *bits4keys = ctx->ecb.keys; /* is changed to suppkey->keys when calcurating suppout */ + const __m128i *bits4keys = ctx->ecb.keys; /* is changed to supp->ctx.keys when calcurating suppout */ struct ptls_fusion_gfmul_state gstate = {}; __m128i gdatabuf[6]; __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)inlen * 8, 0, (int)aadlen * 8), bswap64); @@ -281,11 +292,12 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1; - int32_t state = 0; #define STATE_EK0_BEEN_FED 0x3 #define STATE_EK0_INCOMPLETE 0x2 #define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1) -#define STATE_SUPP_IN_PROCESS 0x4 +#define STATE_SUPP_USED 0x4 +#define STATE_SUPP_IN_PROCESS 0x8 + int32_t state = supp != NULL ? STATE_SUPP_USED : 0; /* build counter */ ek0 = loadn(iv, PTLS_AESGCM_IV_SIZE); @@ -319,8 +331,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, state &= ~STATE_EK0_INCOMPLETE; } if ((state & STATE_SUPP_IN_PROCESS) != 0) { - _mm_storeu_si128(suppvec, bits4); - state &= ~STATE_SUPP_IN_PROCESS; + _mm_storeu_si128((__m128i *)supp->output, bits4); + state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS); } if (srclen != 0) { #define APPLY(i) \ @@ -405,24 +417,33 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, Finish: gdatabuf[gdata_cnt++] = ac; - /* We have complete set of data to be fed into GHASH. 
Let's finish the remaining calculation (GHASH and possibly suppvec), and - * exit the loop. + /* We have complete set of data to be fed into GHASH. Let's finish the remaining calculation. * Note that by now, all AES operations for payload encryption and ek0 are complete. This is is because it is necessary for GCM * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead. */ assert(STATE_EK0_READY()); - assert(suppkey == NULL); - if ((state & STATE_SUPP_IN_PROCESS) != 0) { - for (size_t i = 2; i <= 9; ++i) - bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); - bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); - _mm_storeu_si128(suppvec, bits4); - } for (size_t i = 0; i < gdata_cnt; ++i) gfmul_onestep(&gstate, gdatabuf[i], --ghash_precompute); _mm_storeu_si128(dst, gfmul_final(&gstate, ek0)); + /* Finish the calculation of supplemental vector. Done at the very last, because the sample might cover the GCM tag. */ + if ((state & STATE_SUPP_USED) != 0) { + size_t i; + if ((state & STATE_SUPP_IN_PROCESS) == 0) { + bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys; + bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]); + i = 1; + } else { + i = 2; + } + do { + bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]); + } while (i != 10); + bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); + _mm_storeu_si128((__m128i *)supp->output, bits4); + } + #undef AESECB6_INIT #undef AESECB6_UPDATE #undef AESECB6_FINAL @@ -432,12 +453,10 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *_aad, size_t aadlen, const void *tag, ptls_fusion_aesecb_context_t *suppkey, - void *suppvec) + const void *_aad, size_t aadlen, const void *tag) { __m128i ctr, ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = 
_mm_setzero_si128(), bits3 = _mm_setzero_si128(), bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128(); - const __m128i *bits1keys; struct ptls_fusion_gfmul_state gstate = {}; __m128i gdatabuf[6]; __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)inlen * 8, 0, (int)aadlen * 8), bswap64); @@ -458,13 +477,6 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, ctr = _mm_add_epi64(ctr, one64); bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap64), ctx->ecb.keys[0]); ++nondata_aes_cnt; - if (suppkey != NULL) { - bits1keys = suppkey->keys; - bits1 = _mm_loadu_si128(suppvec); - ++nondata_aes_cnt; - } else { - bits1keys = ctx->ecb.keys; - } #define STATE_IS_FIRST_RUN 0x1 #define STATE_GHASH_HAS_MORE 0x2 @@ -527,7 +539,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap64), keys[0]); InitAllBits: INIT_BITS(0, ctx->ecb.keys); - INIT_BITS(1, bits1keys); + INIT_BITS(1, ctx->ecb.keys); INIT_BITS(2, ctx->ecb.keys); INIT_BITS(3, ctx->ecb.keys); INIT_BITS(4, ctx->ecb.keys); @@ -540,7 +552,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, do { \ __m128i k = ctx->ecb.keys[i]; \ bits0 = _mm_aesenc_si128(bits0, k); \ - bits1 = _mm_aesenc_si128(bits1, bits1keys[i]); \ + bits1 = _mm_aesenc_si128(bits1, k); \ bits2 = _mm_aesenc_si128(bits2, k); \ bits3 = _mm_aesenc_si128(bits3, k); \ bits4 = _mm_aesenc_si128(bits4, k); \ @@ -556,7 +568,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, AESECB6_UPDATE(aesi); __m128i k = ctx->ecb.keys[aesi]; bits0 = _mm_aesenclast_si128(bits0, k); - bits1 = _mm_aesenclast_si128(bits1, bits1keys[aesi]); + bits1 = _mm_aesenclast_si128(bits1, k); bits2 = _mm_aesenclast_si128(bits2, k); bits3 = _mm_aesenclast_si128(bits3, k); bits4 = _mm_aesenclast_si128(bits4, k); @@ -581,10 +593,6 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } else { 
if ((state & STATE_IS_FIRST_RUN) != 0) { ek0 = bits0; - if (suppkey != NULL) { - _mm_store_si128(suppvec, bits1); - bits1keys = ctx->ecb.keys; - } state &= ~STATE_IS_FIRST_RUN; } switch (nondata_aes_cnt) { @@ -703,13 +711,6 @@ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx) free(ctx); } -struct ctr_context { - ptls_cipher_context_t super; - ptls_fusion_aesecb_context_t fusion; - __m128i bits; - uint8_t is_ready; -}; - static void ctr_dispose(ptls_cipher_context_t *_ctx) { struct ctr_context *ctx = (struct ctr_context *)_ctx; @@ -752,11 +753,6 @@ static int aes128ctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void * return 0; } -struct aesgcm_context { - ptls_aead_context_t super; - ptls_fusion_aesgcm_context_t *aesgcm; -}; - static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; @@ -782,29 +778,18 @@ static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) } void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { - ptls_fusion_aesgcm_context_t *aesgcm = ((struct aesgcm_context *)_ctx)->aesgcm; - ptls_fusion_aesecb_context_t *aesecb = suppkey != NULL ? 
&((struct ctr_context *)suppkey)->fusion : NULL; - - ptls_fusion_aesgcm_encrypt(aesgcm, output, input, inlen, iv, aad, aadlen, aesecb, suppvec); + ptls_fusion_aesgcm_encrypt(((struct aesgcm_context *)_ctx)->aesgcm, output, input, inlen, iv, aad, aadlen, supp); } static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + const void *aad, size_t aadlen) { ptls_fusion_aesgcm_context_t *aesgcm = ((struct aesgcm_context *)_ctx)->aesgcm; - struct ptls_fusion_aesecb_context *aesecb = suppkey != NULL ? &((struct ctr_context *)suppkey)->fusion : NULL; - - if (inlen < 16) { - if (suppkey != NULL) - _mm_storeu_si128(suppvec, aesecb_encrypt(aesecb, _mm_loadu_si128(suppvec))); - return SIZE_MAX; - } size_t enclen = inlen - 16; - if (!ptls_fusion_aesgcm_decrypt(aesgcm, output, input, enclen, iv, aad, aadlen, (const uint8_t *)input + enclen, aesecb, - suppvec)) + if (!ptls_fusion_aesgcm_decrypt(aesgcm, output, input, enclen, iv, aad, aadlen, (const uint8_t *)input + enclen)) return SIZE_MAX; return enclen; } diff --git a/lib/openssl.c b/lib/openssl.c index 863628e17..f494a3ec5 100644 --- a/lib/openssl.c +++ b/lib/openssl.c @@ -823,10 +823,8 @@ static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) } static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const void *input, size_t inlen, const void *iv, - const void *aad, size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + const void *aad, size_t aadlen) { - ptls_aead__encrypt_supp(suppkey, suppvec); - struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *)_ctx; uint8_t *output = _output; size_t off = 0, tag_size = ctx->super.algo->tag_size; diff --git a/lib/picotls.c b/lib/picotls.c index f0bd6a6b3..f326fa6e7 100644 --- a/lib/picotls.c +++ b/lib/picotls.c @@ -5135,12 +5135,12 @@ void ptls_aead_free(ptls_aead_context_t *ctx) } 
void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, - size_t aadlen, ptls_cipher_context_t *suppkey, void *suppvec) + size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { uint8_t iv[PTLS_MAX_IV_SIZE]; ptls_aead__build_iv(ctx, iv, seq); - ctx->do_encrypt(ctx, output, input, inlen, iv, aad, aadlen, suppkey, suppvec); + ctx->do_encrypt(ctx, output, input, inlen, iv, aad, aadlen, supp); } void ptls_aead__build_iv(ptls_aead_context_t *ctx, uint8_t *iv, uint64_t seq) diff --git a/t/fusion.c b/t/fusion.c index d5d34c79e..d23103bb9 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -57,11 +57,11 @@ int main(int argc, char **argv) ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); uint8_t encrypted[sizeof(expected)], decrypted[sizeof(expected) - 16]; - ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, zero, "hello", 5, NULL, NULL); + ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, zero, "hello", 5, NULL); ok(memcmp(expected, encrypted, sizeof(expected)) == 0); memset(decrypted, 0x55, sizeof(decrypted)); - ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, zero, "hello", 5, expected + 16, NULL, NULL)); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, zero, "hello", 5, expected + 16)); ok(memcmp(decrypted, zero, sizeof(decrypted)) == 0); ptls_fusion_aesgcm_destroy(ctx); @@ -72,66 +72,65 @@ int main(int argc, char **argv) 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); uint8_t encrypted[17], decrypted[1] = {0x55}; - ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, zero, "a", 1, NULL, NULL); + ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, zero, "a", 1, NULL); ok(memcmp(expected, encrypted, 17) == 0); - ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, zero, "a", 1, expected + 1, NULL, NULL)); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, zero, 
"a", 1, expected + 1)); ok('X' == decrypted[0]); ptls_fusion_aesgcm_destroy(ctx); } { ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(zero, sizeof(zero)); - ptls_fusion_aesecb_context_t *ecb = NULL; + ptls_aead_supplementary_encryption_t *supp = NULL; for (int i = 0; i < 2; ++i) { - uint8_t encrypted[sizeof(zero) + 16], ecbvec[16], decrypted[sizeof(zero)]; -#define DOIT(iv, aad, aadlen, ptlen, expected_tag) \ + uint8_t encrypted[sizeof(zero) + 16], decrypted[sizeof(zero)]; +#define DOIT(iv, aad, aadlen, ptlen, expected_tag, expected_supp) \ do { \ - memset(ecbvec, 0, sizeof(ecbvec)); \ - ptls_fusion_aesgcm_encrypt(aead, encrypted, zero, ptlen, iv, aad, aadlen, ecb, &ecbvec); \ + memset(encrypted, 0xcc, sizeof(encrypted)); \ + ptls_fusion_aesgcm_encrypt(aead, encrypted, zero, ptlen, iv, aad, aadlen, supp); \ + printf("%s\n", tostr(encrypted + ptlen, 16)); \ ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ - if (i == 0) { \ - ok(memcmp(ecbvec, zero, sizeof(ecbvec)) == 0); \ - } else { \ - ok(strcmp(tostr(ecbvec, sizeof(ecbvec)), "b6aeaffa752dc08b51639731761aed00") == 0); \ - } \ + if (supp != NULL) \ + ok(strcmp(tostr(supp->output, sizeof(supp->output)), expected_supp) == 0); \ memset(decrypted, 0x55, sizeof(decrypted)); \ - ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, ptlen, iv, aad, aadlen, encrypted + ptlen, NULL, NULL)); \ + ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, ptlen, iv, aad, aadlen, encrypted + ptlen)); \ ok(memcmp(decrypted, zero, ptlen) == 0); \ } while (0) - DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf"); - DOIT(zero, zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8"); - DOIT(zero, zero, 13, 64, "66165d39739c50c90727e7d49127146b"); - DOIT(zero, zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd"); - DOIT(zero, zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9"); - DOIT(zero, zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd"); - DOIT(zero, zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78"); 
- DOIT(zero, zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d"); - DOIT(zero, zero, 13, 96, "6b9c468e24ed96010687f3880a044d42"); - DOIT(zero, zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8"); + DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf", "4576f18ef3ae9dfd37cf72c4592da874"); + DOIT(zero, zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 64, "66165d39739c50c90727e7d49127146b", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 96, "6b9c468e24ed96010687f3880a044d42", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 22, 1328, "0507baaece8d573774c94e8103821316"); - DOIT(zero, zero, 21, 1329, "dd70d59030eadb6313e778046540a253"); - DOIT(zero, zero, 20, 1330, "f1b456b955afde7603188af0124a32ef"); + DOIT(zero, zero, 22, 1328, "0507baaece8d573774c94e8103821316", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 21, 1329, "dd70d59030eadb6313e778046540a253", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 20, 1330, "f1b456b955afde7603188af0124a32ef", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890"); - DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789"); - DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc"); + DOIT(zero, zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890", "a062016e90dcc316d061fde5424cf34f"); + 
DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 0, 80, "98885a3a22bd4742fe7b72172193b163"); - DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc"); + DOIT(zero, zero, 0, 80, "98885a3a22bd4742fe7b72172193b163", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 20, 85, "afe8b727057c804a0525c2914ef856b0"); + DOIT(zero, zero, 20, 85, "afe8b727057c804a0525c2914ef856b0", "a062016e90dcc316d061fde5424cf34f"); #undef DOIT - ecb = malloc(sizeof(*ecb)); - ptls_fusion_aesecb_init(ecb, one); + supp = malloc(sizeof(*supp)); + supp->ctx = ptls_cipher_new(&ptls_fusion_aes128ctr, 1, one); + supp->input = encrypted + 2; } - ptls_fusion_aesecb_dispose(ecb); - free(ecb); + ptls_cipher_free(supp->ctx); + free(supp); ptls_fusion_aesgcm_destroy(aead); } @@ -151,11 +150,11 @@ int main(int argc, char **argv) 0x3f, 0xbb, 0x23, 0x66, 0x94, 0x26, 0x28, 0x43, 0xa5, 0xfd, 0x2f}; ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(key, sizeof(aad) + sizeof(plaintext)); uint8_t encrypted[sizeof(plaintext) + 16], decrypted[sizeof(plaintext)]; - ptls_fusion_aesgcm_encrypt(aead, encrypted, plaintext, sizeof(plaintext), iv, aad, sizeof(aad), NULL, NULL); + ptls_fusion_aesgcm_encrypt(aead, encrypted, plaintext, sizeof(plaintext), iv, aad, sizeof(aad), NULL); ok(memcmp(expected, encrypted, sizeof(plaintext)) == 0); ok(memcmp(expected + sizeof(plaintext), encrypted + sizeof(plaintext), 16) == 0); ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, sizeof(plaintext), iv, aad, sizeof(aad), - encrypted + sizeof(plaintext), NULL, NULL)); + encrypted + sizeof(plaintext))); ok(memcmp(decrypted, plaintext, sizeof(plaintext)) == 0); ptls_fusion_aesgcm_destroy(aead); } diff --git a/t/fusionbench.c b/t/fusionbench.c 
index e6db38798..f2f35aa53 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -9,8 +9,7 @@ int main(int argc, char **argv) { static const uint8_t key[16] = {}, iv[12] = {}, aad[13] = {}; size_t textlen = 16384; - ptls_fusion_aesecb_context_t *suppkey = NULL; - uint8_t suppvec[16] = {}; + ptls_aead_supplementary_encryption_t *supp = NULL; int ch, decrypt = 0, count = 1000000; while ((ch = getopt(argc, argv, "b:dn:sh")) != -1) { @@ -32,8 +31,8 @@ int main(int argc, char **argv) break; case 's': { static const uint8_t k[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - suppkey = malloc(sizeof(*suppkey)); - ptls_fusion_aesecb_init(suppkey, k); + supp = malloc(sizeof(*supp)); + supp->ctx = ptls_cipher_new(&ptls_fusion_aes128ctr, 1, k); } break; default: printf("Usage: %s -b -s\n" @@ -51,16 +50,18 @@ int main(int argc, char **argv) uint8_t *text = malloc(textlen + 16); memset(text, 0, textlen + 16); + if (supp != NULL) + supp->input = textlen >= 2 ? text + 2 : text + textlen; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(key, sizeof(aad) + textlen); if (!decrypt) { for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), suppkey, suppvec); + ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), supp); } else { uint8_t tag[16] = {}; for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), &tag, suppkey, suppvec); + ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), &tag); } for (int i = 0; i < 16; ++i) From 02ca0f0825b0fb0764af8752f3c7b3ee8a42c3df Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Wed, 13 May 2020 20:46:39 +0900 Subject: [PATCH 36/60] we can make it a contractual obligation that IV can be loaded as 16-byte value --- include/picotls/fusion.h | 2 +- lib/fusion.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h 
index f7fb8eccf..80ff3560b 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -57,7 +57,7 @@ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); * @param output output buffer * @param input payload to be encrypted * @param inlen size of the payload to be encrypted - * @param iv initialization vector of 12 bytes + * @param iv initialization vector of 12 bytes (must be accessible as a 16-byte value) * @param aad AAD * @param aadlen size of AAD * @param supp (optional) supplementary encryption context diff --git a/lib/fusion.c b/lib/fusion.c index 572a23fc2..080bfbb03 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "picotls.h" #include "picotls/fusion.h" @@ -300,8 +301,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, int32_t state = supp != NULL ? STATE_SUPP_USED : 0; /* build counter */ - ek0 = loadn(iv, PTLS_AESGCM_IV_SIZE); - ek0 = _mm_insert_epi16(ek0, 0x100, 7); + ek0 = _mm_loadu_si128(iv); + ek0 = _mm_insert_epi32(ek0, 0x1000000, 3); ctr = _mm_shuffle_epi8(ek0, bswap64); /* prepare the first bit stream */ From 079b1d036836822eb3cdfbab761162ee12217450 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 14 May 2020 02:24:28 +0900 Subject: [PATCH 37/60] use 128-bit load when the entire data is on the same page --- lib/fusion.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 080bfbb03..12d5f9cd1 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -193,12 +193,18 @@ static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i static inline __m128i loadn(const void *_p, size_t l) { - const uint8_t *p = _p; - uint8_t buf[16] = {}; - - for (size_t i = 0; i != l; ++i) - buf[i] = p[i]; - return *(__m128i *)buf; + /* FIXME is this optimal? 
*/ + if (PTLS_LIKELY(((uintptr_t)_p % 4096) <= 4080)) { + static const uint8_t mask[31] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + return _mm_and_si128(_mm_loadu_si128(_p), _mm_loadu_si128((__m128i *)(mask + 16 - l))); + } else { + const uint8_t *p = _p; + uint8_t buf[16] = {}; + for (size_t i = 0; i != l; ++i) + buf[i] = p[i]; + return *(__m128i *)buf; + } } static inline void storen(void *_p, size_t l, __m128i v) @@ -343,6 +349,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, srclen -= 16; \ } else { \ if (srclen != 0) { \ + /* While it is possible to use _mm_storeu_si128 here, as there is space to store GCM tag, writing byte-per-byte \ + * seems to be faster on 9th gen Core. */ \ storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits##i)); \ dst = (__m128i *)((uint8_t *)dst + srclen); \ srclen = 0; \ From 56c572a098c780482eabb7649e22f6e91e9e2e22 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 14 May 2020 06:26:33 +0900 Subject: [PATCH 38/60] add API for initializing AEAD directly --- include/picotls.h | 7 +++++++ lib/picotls.c | 41 +++++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/include/picotls.h b/include/picotls.h index f4628cb22..1eabd9cb4 100644 --- a/include/picotls.h +++ b/include/picotls.h @@ -1192,6 +1192,13 @@ static void ptls_cipher_encrypt(ptls_cipher_context_t *ctx, void *output, const */ ptls_aead_context_t *ptls_aead_new(ptls_aead_algorithm_t *aead, ptls_hash_algorithm_t *hash, int is_enc, const void *secret, const char *label_prefix); +/** + * instantiates an AEAD cipher given key and iv + * @param aead + * @param is_enc 1 if creating a context for encryption, 0 if creating a context for decryption + * @return pointer to an AEAD context if successful, otherwise NULL + */ +ptls_aead_context_t *ptls_aead_new_direct(ptls_aead_algorithm_t *aead, int is_enc, const void *key, const void 
*iv); /** * destroys an AEAD cipher context */ diff --git a/lib/picotls.c b/lib/picotls.c index f326fa6e7..ec287cf16 100644 --- a/lib/picotls.c +++ b/lib/picotls.c @@ -5096,28 +5096,18 @@ void ptls_cipher_free(ptls_cipher_context_t *ctx) ptls_aead_context_t *new_aead(ptls_aead_algorithm_t *aead, ptls_hash_algorithm_t *hash, int is_enc, const void *secret, ptls_iovec_t hash_value, const char *label_prefix) { - ptls_aead_context_t *ctx; - uint8_t key[PTLS_MAX_SECRET_SIZE]; + ptls_aead_context_t *ctx = NULL; + uint8_t key_iv[aead->key_size + aead->iv_size]; int ret; - if ((ctx = (ptls_aead_context_t *)malloc(aead->context_size)) == NULL) - return NULL; - - *ctx = (ptls_aead_context_t){aead}; - if ((ret = get_traffic_key(hash, key, aead->key_size, 0, secret, hash_value, label_prefix)) != 0) + if ((ret = get_traffic_key(hash, key_iv, aead->key_size, 0, secret, hash_value, label_prefix)) != 0) goto Exit; - if ((ret = get_traffic_key(hash, ctx->static_iv, aead->iv_size, 1, secret, hash_value, label_prefix)) != 0) + if ((ret = get_traffic_key(hash, key_iv + aead->key_size, aead->iv_size, 1, secret, hash_value, label_prefix)) != 0) goto Exit; - ret = aead->setup_crypto(ctx, is_enc, key); + ctx = ptls_aead_new_direct(aead, is_enc, key_iv, key_iv + aead->key_size); Exit: - ptls_clear_memory(key, aead->key_size); - if (ret != 0) { - ptls_clear_memory(ctx->static_iv, aead->iv_size); - free(ctx); - ctx = NULL; - } - + ptls_clear_memory(key_iv, sizeof(key_iv)); return ctx; } @@ -5127,6 +5117,25 @@ ptls_aead_context_t *ptls_aead_new(ptls_aead_algorithm_t *aead, ptls_hash_algori return new_aead(aead, hash, is_enc, secret, ptls_iovec_init(NULL, 0), label_prefix); } +ptls_aead_context_t *ptls_aead_new_direct(ptls_aead_algorithm_t *aead, int is_enc, const void *key, const void *iv) +{ + ptls_aead_context_t *ctx; + + if ((ctx = (ptls_aead_context_t *)malloc(aead->context_size)) == NULL) + return NULL; + + *ctx = (ptls_aead_context_t){aead}; + memcpy(ctx->static_iv, iv, aead->iv_size); 
+ + if (aead->setup_crypto(ctx, is_enc, key) != 0) { + ptls_clear_memory(ctx->static_iv, aead->iv_size); + free(ctx); + return NULL; + } + + return ctx; +} + void ptls_aead_free(ptls_aead_context_t *ctx) { ctx->dispose_crypto(ctx); From ba2b960cff46ae12af1016369aa4409692289792 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 14 May 2020 08:21:39 +0900 Subject: [PATCH 39/60] let AEAD impls retain static_iv themselves using the formats they prefer --- include/picotls.h | 39 +++++++------- include/picotls/fusion.h | 8 +-- lib/cifra/aes-common.h | 23 +++++--- lib/cifra/aes128.c | 8 +-- lib/cifra/aes256.c | 8 +-- lib/cifra/chacha20.c | 12 +++-- lib/fusion.c | 114 +++++++++++++++++++++------------------ lib/openssl.c | 26 +++++---- lib/picotls.c | 20 ++----- t/fusion.c | 61 +++++++++++---------- t/fusionbench.c | 6 +-- 11 files changed, 166 insertions(+), 159 deletions(-) diff --git a/include/picotls.h b/include/picotls.h index 1eabd9cb4..1dabfef9a 100644 --- a/include/picotls.h +++ b/include/picotls.h @@ -311,15 +311,14 @@ typedef struct st_ptls_aead_supplementary_encryption_t { */ typedef struct st_ptls_aead_context_t { const struct st_ptls_aead_algorithm_t *algo; - uint8_t static_iv[PTLS_MAX_IV_SIZE]; /* field above this line must not be altered by the crypto binding */ void (*dispose_crypto)(struct st_ptls_aead_context_t *ctx); - void (*do_encrypt_init)(struct st_ptls_aead_context_t *ctx, const void *iv, const void *aad, size_t aadlen); + void (*do_encrypt_init)(struct st_ptls_aead_context_t *ctx, uint64_t seq, const void *aad, size_t aadlen); size_t (*do_encrypt_update)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen); size_t (*do_encrypt_final)(struct st_ptls_aead_context_t *ctx, void *output); - void (*do_encrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + void (*do_encrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, 
const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp); - size_t (*do_decrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, + size_t (*do_decrypt)(struct st_ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen); } ptls_aead_context_t; @@ -359,7 +358,7 @@ typedef const struct st_ptls_aead_algorithm_t { /** * callback that sets up the crypto */ - int (*setup_crypto)(ptls_aead_context_t *ctx, int is_enc, const void *key); + int (*setup_crypto)(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv); } ptls_aead_algorithm_t; /** @@ -1208,8 +1207,8 @@ void ptls_aead_free(ptls_aead_context_t *ctx); */ static size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen); -void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, - size_t aadlen, ptls_aead_supplementary_encryption_t *supp); +static void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp); /** * initializes the internal state of the encryptor */ @@ -1260,11 +1259,11 @@ int ptls_server_handle_message(ptls_t *tls, ptls_buffer_t *sendbuf, size_t epoch /** * internal */ -void ptls_aead__build_iv(ptls_aead_context_t *ctx, uint8_t *iv, uint64_t seq); +void ptls_aead__build_iv(ptls_aead_algorithm_t *algo, uint8_t *iv, const uint8_t *static_iv, uint64_t seq); /** * */ -static void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, +static void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t 
*supp); /** * internal @@ -1391,16 +1390,19 @@ inline void ptls_cipher_encrypt(ptls_cipher_context_t *ctx, void *output, const inline size_t ptls_aead_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { - ptls_aead_encrypt_s(ctx, output, input, inlen, seq, aad, aadlen, NULL); + ctx->do_encrypt(ctx, output, input, inlen, seq, aad, aadlen, NULL); return inlen + ctx->algo->tag_size; } -inline void ptls_aead_encrypt_init(ptls_aead_context_t *ctx, uint64_t seq, const void *aad, size_t aadlen) +inline void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, + const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { - uint8_t iv[PTLS_MAX_IV_SIZE]; + ctx->do_encrypt(ctx, output, input, inlen, seq, aad, aadlen, supp); +} - ptls_aead__build_iv(ctx, iv, seq); - ctx->do_encrypt_init(ctx, iv, aad, aadlen); +inline void ptls_aead_encrypt_init(ptls_aead_context_t *ctx, uint64_t seq, const void *aad, size_t aadlen) +{ + ctx->do_encrypt_init(ctx, seq, aad, aadlen); } inline size_t ptls_aead_encrypt_update(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen) @@ -1413,10 +1415,10 @@ inline size_t ptls_aead_encrypt_final(ptls_aead_context_t *ctx, void *output) return ctx->do_encrypt_final(ctx, output); } -inline void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, +inline void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { - ctx->do_encrypt_init(ctx, iv, aad, aadlen); + ctx->do_encrypt_init(ctx, seq, aad, aadlen); ctx->do_encrypt_update(ctx, output, input, inlen); ctx->do_encrypt_final(ctx, (uint8_t *)output + inlen); @@ -1430,10 +1432,7 @@ inline void ptls_aead__do_encrypt(ptls_aead_context_t *ctx, void *output, 
const inline size_t ptls_aead_decrypt(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { - uint8_t iv[PTLS_MAX_IV_SIZE]; - - ptls_aead__build_iv(ctx, iv, seq); - return ctx->do_decrypt(ctx, output, input, inlen, iv, aad, aadlen); + return ctx->do_decrypt(ctx, output, input, inlen, seq, aad, aadlen); } #define ptls_define_hash(name, ctx_type, init_func, update_func, final_func) \ diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 80ff3560b..fede03d57 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -56,13 +56,13 @@ void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); * @param ctx context * @param output output buffer * @param input payload to be encrypted - * @param inlen size of the payload to be encrypted - * @param iv initialization vector of 12 bytes (must be accessible as a 16-byte value) + * @param inlen size of the payload to be encrypted + * @param counter * @param aad AAD * @param aadlen size of AAD * @param supp (optional) supplementary encryption context */ -void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr, const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp); /** * Decrypts an AEAD block, an in parallel, optionally encrypts one block using AES-ECB. Returns if decryption was successful. 
@@ -74,7 +74,7 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, * @param aadlen size of AAD * @param tag the AEAD tag being received from peer */ -int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, +int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr, const void *aad, size_t aadlen, const void *tag); extern ptls_cipher_algorithm_t ptls_fusion_aes128ctr; diff --git a/lib/cifra/aes-common.h b/lib/cifra/aes-common.h index eb1007c69..0c393c57b 100644 --- a/lib/cifra/aes-common.h +++ b/lib/cifra/aes-common.h @@ -20,6 +20,7 @@ * IN THE SOFTWARE. */ #include +#include #include "aes.h" #include "modes.h" #include "sha2.h" @@ -51,13 +52,13 @@ static inline void aesecb_decrypt(ptls_cipher_context_t *_ctx, void *output, con cf_aes_decrypt(&ctx->aes, input, output); } -static inline int aesecb_setup_crypto(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size) +static inline int aesecb_setup_crypto(ptls_cipher_context_t *_ctx, int is_enc, const void *key) { struct aesecb_context_t *ctx = (struct aesecb_context_t *)_ctx; ctx->super.do_dispose = aesecb_dispose; ctx->super.do_init = NULL; ctx->super.do_transform = is_enc ? 
aesecb_encrypt : aesecb_decrypt; - cf_aes_init(&ctx->aes, key, key_size); + cf_aes_init(&ctx->aes, key, ctx->super.algo->key_size); return 0; } @@ -85,13 +86,13 @@ static inline void aesctr_transform(ptls_cipher_context_t *_ctx, void *output, c cf_ctr_cipher(&ctx->ctr, input, output, len); } -static inline int aesctr_setup_crypto(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size) +static inline int aesctr_setup_crypto(ptls_cipher_context_t *_ctx, int is_enc, const void *key) { struct aesctr_context_t *ctx = (struct aesctr_context_t *)_ctx; ctx->super.do_dispose = aesctr_dispose; ctx->super.do_init = aesctr_init; ctx->super.do_transform = aesctr_transform; - cf_aes_init(&ctx->aes, key, key_size); + cf_aes_init(&ctx->aes, key, ctx->super.algo->key_size); return 0; } @@ -99,6 +100,7 @@ struct aesgcm_context_t { ptls_aead_context_t super; cf_aes_context aes; cf_gcm_ctx gcm; + uint8_t static_iv[PTLS_AESGCM_IV_SIZE]; }; static inline void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) @@ -109,10 +111,12 @@ static inline void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) ptls_clear_memory((uint8_t *)ctx + sizeof(ctx->super), sizeof(*ctx) - sizeof(ctx->super)); } -static inline void aesgcm_encrypt_init(ptls_aead_context_t *_ctx, const void *iv, const void *aad, size_t aadlen) +static inline void aesgcm_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) { struct aesgcm_context_t *ctx = (struct aesgcm_context_t *)_ctx; + uint8_t iv[PTLS_AES_BLOCK_SIZE]; + ptls_aead__build_iv(ctx->super.algo, iv, ctx->static_iv, seq); cf_gcm_encrypt_init(&cf_aes, &ctx->aes, &ctx->gcm, aad, aadlen, iv, PTLS_AESGCM_IV_SIZE); } @@ -132,15 +136,17 @@ static inline size_t aesgcm_encrypt_final(ptls_aead_context_t *_ctx, void *outpu return PTLS_AESGCM_TAG_SIZE; } -static inline size_t aesgcm_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, +static inline size_t 
aesgcm_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { struct aesgcm_context_t *ctx = (struct aesgcm_context_t *)_ctx; + uint8_t iv[PTLS_AES_BLOCK_SIZE]; if (inlen < PTLS_AESGCM_TAG_SIZE) return SIZE_MAX; size_t tag_offset = inlen - PTLS_AESGCM_TAG_SIZE; + ptls_aead__build_iv(ctx->super.algo, iv, ctx->static_iv, seq); if (cf_gcm_decrypt(&cf_aes, &ctx->aes, input, tag_offset, aad, aadlen, iv, PTLS_AESGCM_IV_SIZE, (uint8_t *)input + tag_offset, PTLS_AESGCM_TAG_SIZE, output) != 0) return SIZE_MAX; @@ -148,7 +154,7 @@ static inline size_t aesgcm_decrypt(ptls_aead_context_t *_ctx, void *output, con return tag_offset; } -static inline int aead_aesgcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key, size_t key_size) +static inline int aead_aesgcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv) { struct aesgcm_context_t *ctx = (struct aesgcm_context_t *)_ctx; @@ -166,6 +172,7 @@ static inline int aead_aesgcm_setup_crypto(ptls_aead_context_t *_ctx, int is_enc ctx->super.do_decrypt = aesgcm_decrypt; } - cf_aes_init(&ctx->aes, key, key_size); + cf_aes_init(&ctx->aes, key, ctx->super.algo->key_size); + memcpy(ctx->static_iv, iv, sizeof(ctx->static_iv)); return 0; } diff --git a/lib/cifra/aes128.c b/lib/cifra/aes128.c index 31a25985e..7f6817ff7 100644 --- a/lib/cifra/aes128.c +++ b/lib/cifra/aes128.c @@ -23,17 +23,17 @@ static int aes128ecb_setup_crypto(ptls_cipher_context_t *ctx, int is_enc, const void *key) { - return aesecb_setup_crypto(ctx, is_enc, key, PTLS_AES128_KEY_SIZE); + return aesecb_setup_crypto(ctx, is_enc, key); } static int aes128ctr_setup_crypto(ptls_cipher_context_t *ctx, int is_enc, const void *key) { - return aesctr_setup_crypto(ctx, is_enc, key, PTLS_AES128_KEY_SIZE); + return aesctr_setup_crypto(ctx, is_enc, key); } -static int aead_aes128gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key) +static 
int aead_aes128gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) { - return aead_aesgcm_setup_crypto(ctx, is_enc, key, PTLS_AES128_KEY_SIZE); + return aead_aesgcm_setup_crypto(ctx, is_enc, key, iv); } ptls_define_hash(sha256, cf_sha256_context, cf_sha256_init, cf_sha256_update, cf_sha256_digest_final); diff --git a/lib/cifra/aes256.c b/lib/cifra/aes256.c index b2cec2c86..5b925be8d 100644 --- a/lib/cifra/aes256.c +++ b/lib/cifra/aes256.c @@ -23,17 +23,17 @@ static int aes256ecb_setup_crypto(ptls_cipher_context_t *ctx, int is_enc, const void *key) { - return aesecb_setup_crypto(ctx, is_enc, key, PTLS_AES256_KEY_SIZE); + return aesecb_setup_crypto(ctx, is_enc, key); } static int aes256ctr_setup_crypto(ptls_cipher_context_t *ctx, int is_enc, const void *key) { - return aesctr_setup_crypto(ctx, is_enc, key, PTLS_AES256_KEY_SIZE); + return aesctr_setup_crypto(ctx, is_enc, key); } -static int aead_aes256gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key) +static int aead_aes256gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) { - return aead_aesgcm_setup_crypto(ctx, is_enc, key, PTLS_AES256_KEY_SIZE); + return aead_aesgcm_setup_crypto(ctx, is_enc, key, iv); } ptls_define_hash(sha384, cf_sha512_context, cf_sha384_init, cf_sha384_update, cf_sha384_digest_final); diff --git a/lib/cifra/chacha20.c b/lib/cifra/chacha20.c index 9451c6526..a5f302fa8 100644 --- a/lib/cifra/chacha20.c +++ b/lib/cifra/chacha20.c @@ -67,6 +67,7 @@ static int chacha20_setup_crypto(ptls_cipher_context_t *_ctx, int is_enc, const struct chacha20poly1305_context_t { ptls_aead_context_t super; uint8_t key[PTLS_CHACHA20_KEY_SIZE]; + uint8_t static_iv[PTLS_CHACHA20POLY1305_IV_SIZE]; cf_chacha20_ctx chacha; cf_poly1305 poly; size_t aadlen; @@ -102,14 +103,14 @@ static void chacha20poly1305_finalize(struct chacha20poly1305_context_t *ctx, ui cf_poly1305_finish(&ctx->poly, tag); } -static void 
chacha20poly1305_init(ptls_aead_context_t *_ctx, const void *iv, const void *aad, size_t aadlen) +static void chacha20poly1305_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) { struct chacha20poly1305_context_t *ctx = (struct chacha20poly1305_context_t *)_ctx; uint8_t tmpbuf[64]; /* init chacha */ memset(tmpbuf, 0, 16 - PTLS_CHACHA20POLY1305_IV_SIZE); - memcpy(tmpbuf + 16 - PTLS_CHACHA20POLY1305_IV_SIZE, iv, PTLS_CHACHA20POLY1305_IV_SIZE); + ptls_aead__build_iv(ctx->super.algo, tmpbuf + 16 - PTLS_CHACHA20POLY1305_IV_SIZE, ctx->static_iv, seq); cf_chacha20_init_custom(&ctx->chacha, ctx->key, sizeof(ctx->key), tmpbuf, 4); /* init poly1305 (by using first 16 bytes of the key stream of the first block) */ @@ -149,7 +150,7 @@ static size_t chacha20poly1305_encrypt_final(ptls_aead_context_t *_ctx, void *ou return PTLS_CHACHA20POLY1305_TAG_SIZE; } -static size_t chacha20poly1305_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, +static size_t chacha20poly1305_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { struct chacha20poly1305_context_t *ctx = (struct chacha20poly1305_context_t *)_ctx; @@ -159,7 +160,7 @@ static size_t chacha20poly1305_decrypt(ptls_aead_context_t *_ctx, void *output, if (inlen < sizeof(tag)) return SIZE_MAX; - chacha20poly1305_init(&ctx->super, iv, aad, aadlen); + chacha20poly1305_init(&ctx->super, seq, aad, aadlen); cf_poly1305_update(&ctx->poly, input, inlen - sizeof(tag)); ctx->textlen = inlen - sizeof(tag); @@ -178,7 +179,7 @@ static size_t chacha20poly1305_decrypt(ptls_aead_context_t *_ctx, void *output, return ret; } -static int aead_chacha20poly1305_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key) +static int aead_chacha20poly1305_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv) { struct chacha20poly1305_context_t *ctx = (struct 
chacha20poly1305_context_t *)_ctx; @@ -197,6 +198,7 @@ static int aead_chacha20poly1305_setup_crypto(ptls_aead_context_t *_ctx, int is_ } memcpy(ctx->key, key, sizeof(ctx->key)); + memcpy(ctx->static_iv, iv, sizeof(ctx->static_iv)); return 0; } diff --git a/lib/fusion.c b/lib/fusion.c index 12d5f9cd1..78151edb1 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -65,16 +65,18 @@ struct ctr_context { struct aesgcm_context { ptls_aead_context_t super; ptls_fusion_aesgcm_context_t *aesgcm; + /** + * retains the static IV in the upper 96 bits (in little endian) + */ + __m128i static_iv; }; static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000}; #define poly (*(__m128i *)poly_) static const uint8_t bswap8_[16] __attribute__((aligned(16))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; #define bswap8 (*(__m128i *)bswap8_) -static const uint8_t bswap64_[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; -#define bswap64 (*(__m128i *)bswap64_) -static const uint8_t one64_[16] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0, 1}; -#define one64 (*(__m128i *)one64_) +static const uint8_t one8_[16] __attribute__((aligned(16))) = {1}; +#define one8 (*(__m128i *)one8_) /* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl * at commit 33388b4. 
*/ @@ -217,25 +219,25 @@ static inline void storen(void *_p, size_t l, __m128i v) p[i] = buf[i]; } -void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, +void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr, const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ #define AESECB6_INIT() \ do { \ - ctr = _mm_add_epi64(ctr, one64); \ - bits0 = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits1 = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits2 = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits3 = _mm_shuffle_epi8(ctr, bswap64); \ - ctr = _mm_add_epi64(ctr, one64); \ - bits4 = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one8); \ + bits0 = _mm_shuffle_epi8(ctr, bswap8); \ + ctr = _mm_add_epi64(ctr, one8); \ + bits1 = _mm_shuffle_epi8(ctr, bswap8); \ + ctr = _mm_add_epi64(ctr, one8); \ + bits2 = _mm_shuffle_epi8(ctr, bswap8); \ + ctr = _mm_add_epi64(ctr, one8); \ + bits3 = _mm_shuffle_epi8(ctr, bswap8); \ + ctr = _mm_add_epi64(ctr, one8); \ + bits4 = _mm_shuffle_epi8(ctr, bswap8); \ if (PTLS_LIKELY(srclen > 16 * 5)) { \ - ctr = _mm_add_epi64(ctr, one64); \ - bits5 = _mm_shuffle_epi8(ctr, bswap64); \ + ctr = _mm_add_epi64(ctr, one8); \ + bits5 = _mm_shuffle_epi8(ctr, bswap8); \ } else { \ if ((state & STATE_EK0_BEEN_FED) == 0) { \ bits5 = ek0; \ @@ -280,11 +282,11 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, bits5 = _mm_aesenclast_si128(bits5, k); \ } while (0) - __m128i ctr, ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); + __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); const __m128i *bits4keys = 
ctx->ecb.keys; /* is changed to supp->ctx.keys when calcurating suppout */ struct ptls_fusion_gfmul_state gstate = {}; __m128i gdatabuf[6]; - __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)inlen * 8, 0, (int)aadlen * 8), bswap64); + __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), bswap8); const __m128i *gdata; // points to the elements fed into GHASH size_t gdata_cnt; @@ -307,9 +309,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, int32_t state = supp != NULL ? STATE_SUPP_USED : 0; /* build counter */ - ek0 = _mm_loadu_si128(iv); - ek0 = _mm_insert_epi32(ek0, 0x1000000, 3); - ctr = _mm_shuffle_epi8(ek0, bswap64); + ctr = _mm_insert_epi32(ctr, 1, 0); + ek0 = _mm_shuffle_epi8(ctr, bswap8); /* prepare the first bit stream */ AESECB6_INIT(); @@ -461,14 +462,14 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, #undef STATE_SUPP_IN_PROCESS } -int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, const void *iv, +int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr, const void *_aad, size_t aadlen, const void *tag) { - __m128i ctr, ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), - bits3 = _mm_setzero_si128(), bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128(); + __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(), + bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128(); struct ptls_fusion_gfmul_state gstate = {}; __m128i gdatabuf[6]; - __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)inlen * 8, 0, (int)aadlen * 8), bswap64); + __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), bswap8); struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 
15) / 16 + (inlen + 15) / 16 + 1; const __m128i *gdata; // points to the elements fed into GHASH @@ -478,13 +479,9 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, __m128i *dst = output; size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen; - /* build counter */ - ctr = loadn(iv, PTLS_AESGCM_IV_SIZE); - ctr = _mm_shuffle_epi8(ctr, bswap64); - /* schedule ek0 and suppkey */ - ctr = _mm_add_epi64(ctr, one64); - bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap64), ctx->ecb.keys[0]); + ctr = _mm_add_epi64(ctr, one8); + bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap8), ctx->ecb.keys[0]); ++nondata_aes_cnt; #define STATE_IS_FIRST_RUN 0x1 @@ -544,8 +541,8 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, switch (nondata_aes_cnt) { #define INIT_BITS(n, keys) \ case n: \ - ctr = _mm_add_epi64(ctr, one64); \ - bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap64), keys[0]); + ctr = _mm_add_epi64(ctr, one8); \ + bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap8), keys[0]); InitAllBits: INIT_BITS(0, ctx->ecb.keys); INIT_BITS(1, ctx->ecb.keys); @@ -769,7 +766,7 @@ static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) ptls_fusion_aesgcm_destroy(ctx->aesgcm); } -static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, const void *iv, const void *aad, size_t aadlen) +static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) { assert(!"FIXME"); } @@ -786,40 +783,51 @@ static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) return SIZE_MAX; } -void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, +static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq) +{ + __m128i ctr = _mm_setzero_si128(); + ctr = _mm_insert_epi64(ctr, seq, 0); + ctr = _mm_slli_si128(ctr, 32); + ctr = _mm_xor_si128(ctx->static_iv, ctr); + return ctr; +} + 
+void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) { - ptls_fusion_aesgcm_encrypt(((struct aesgcm_context *)_ctx)->aesgcm, output, input, inlen, iv, aad, aadlen, supp); + struct aesgcm_context *ctx = (void *)_ctx; + + ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp); } -static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, const void *iv, +static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { - ptls_fusion_aesgcm_context_t *aesgcm = ((struct aesgcm_context *)_ctx)->aesgcm; + struct aesgcm_context *ctx = (void *)_ctx; + + if (inlen < 16) + return SIZE_MAX; size_t enclen = inlen - 16; - if (!ptls_fusion_aesgcm_decrypt(aesgcm, output, input, enclen, iv, aad, aadlen, (const uint8_t *)input + enclen)) + if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen, + (const uint8_t *)input + enclen)) return SIZE_MAX; return enclen; } -static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key) +static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; + ctx->static_iv = loadn(iv, PTLS_AESGCM_IV_SIZE); + ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, bswap8); + ctx->super.dispose_crypto = aesgcm_dispose_crypto; - if (is_enc) { - ctx->super.do_encrypt_init = aead_do_encrypt_init; - ctx->super.do_encrypt_update = aead_do_encrypt_update; - ctx->super.do_encrypt_final = aead_do_encrypt_final; - ctx->super.do_encrypt = aead_do_encrypt; - ctx->super.do_decrypt = NULL; - } else { - ctx->super.do_encrypt_init = NULL; - ctx->super.do_encrypt_update = NULL; - 
ctx->super.do_encrypt_final = NULL; - ctx->super.do_decrypt = aead_do_decrypt; - } + ctx->super.do_encrypt_init = aead_do_encrypt_init; + ctx->super.do_encrypt_update = aead_do_encrypt_update; + ctx->super.do_encrypt_final = aead_do_encrypt_final; + ctx->super.do_encrypt = aead_do_encrypt; + ctx->super.do_decrypt = aead_do_decrypt; ctx->aesgcm = ptls_fusion_aesgcm_create(key, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ diff --git a/lib/openssl.c b/lib/openssl.c index f494a3ec5..4bf92aecf 100644 --- a/lib/openssl.c +++ b/lib/openssl.c @@ -768,6 +768,7 @@ static int bfecb_setup_crypto(ptls_cipher_context_t *ctx, int is_enc, const void struct aead_crypto_context_t { ptls_aead_context_t super; EVP_CIPHER_CTX *evp_ctx; + uint8_t static_iv[PTLS_MAX_IV_SIZE]; }; static void aead_dispose_crypto(ptls_aead_context_t *_ctx) @@ -778,12 +779,13 @@ static void aead_dispose_crypto(ptls_aead_context_t *_ctx) EVP_CIPHER_CTX_free(ctx->evp_ctx); } -static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, const void *iv, const void *aad, size_t aadlen) +static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) { struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *)_ctx; + uint8_t iv[PTLS_MAX_IV_SIZE]; int ret; - /* FIXME for performance, preserve the expanded key instead of the raw key */ + ptls_aead__build_iv(ctx->super.algo, iv, ctx->static_iv, seq); ret = EVP_EncryptInit_ex(ctx->evp_ctx, NULL, NULL, NULL, iv); assert(ret); @@ -822,17 +824,18 @@ static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) return off; } -static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const void *input, size_t inlen, const void *iv, +static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const void *input, size_t inlen, uint64_t seq, const void *aad, size_t aadlen) { struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *)_ctx; 
- uint8_t *output = _output; + uint8_t *output = _output, iv[PTLS_MAX_IV_SIZE]; size_t off = 0, tag_size = ctx->super.algo->tag_size; int blocklen, ret; if (inlen < tag_size) return SIZE_MAX; + ptls_aead__build_iv(ctx->super.algo, iv, ctx->static_iv, seq); ret = EVP_DecryptInit_ex(ctx->evp_ctx, NULL, NULL, NULL, iv); assert(ret); if (aadlen != 0) { @@ -851,7 +854,7 @@ static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *_output, const vo return off; } -static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key, const EVP_CIPHER *cipher) +static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, const EVP_CIPHER *cipher) { struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *)_ctx; int ret; @@ -890,6 +893,7 @@ static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void * ret = PTLS_ERROR_LIBRARY; goto Error; } + memcpy(ctx->static_iv, iv, ctx->super.algo->iv_size); return 0; @@ -898,20 +902,20 @@ static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void * return ret; } -static int aead_aes128gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key) +static int aead_aes128gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) { - return aead_setup_crypto(ctx, is_enc, key, EVP_aes_128_gcm()); + return aead_setup_crypto(ctx, is_enc, key, iv, EVP_aes_128_gcm()); } -static int aead_aes256gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key) +static int aead_aes256gcm_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) { - return aead_setup_crypto(ctx, is_enc, key, EVP_aes_256_gcm()); + return aead_setup_crypto(ctx, is_enc, key, iv, EVP_aes_256_gcm()); } #if PTLS_OPENSSL_HAVE_CHACHA20_POLY1305 -static int aead_chacha20poly1305_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key) +static int 
aead_chacha20poly1305_setup_crypto(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) { - return aead_setup_crypto(ctx, is_enc, key, EVP_chacha20_poly1305()); + return aead_setup_crypto(ctx, is_enc, key, iv, EVP_chacha20_poly1305()); } #endif diff --git a/lib/picotls.c b/lib/picotls.c index ec287cf16..13611aaea 100644 --- a/lib/picotls.c +++ b/lib/picotls.c @@ -5125,10 +5125,8 @@ ptls_aead_context_t *ptls_aead_new_direct(ptls_aead_algorithm_t *aead, int is_en return NULL; *ctx = (ptls_aead_context_t){aead}; - memcpy(ctx->static_iv, iv, aead->iv_size); - if (aead->setup_crypto(ctx, is_enc, key) != 0) { - ptls_clear_memory(ctx->static_iv, aead->iv_size); + if (aead->setup_crypto(ctx, is_enc, key, iv) != 0) { free(ctx); return NULL; } @@ -5139,23 +5137,13 @@ ptls_aead_context_t *ptls_aead_new_direct(ptls_aead_algorithm_t *aead, int is_en void ptls_aead_free(ptls_aead_context_t *ctx) { ctx->dispose_crypto(ctx); - ptls_clear_memory(ctx->static_iv, ctx->algo->iv_size); free(ctx); } -void ptls_aead_encrypt_s(ptls_aead_context_t *ctx, void *output, const void *input, size_t inlen, uint64_t seq, const void *aad, - size_t aadlen, ptls_aead_supplementary_encryption_t *supp) +void ptls_aead__build_iv(ptls_aead_algorithm_t *algo, uint8_t *iv, const uint8_t *static_iv, uint64_t seq) { - uint8_t iv[PTLS_MAX_IV_SIZE]; - - ptls_aead__build_iv(ctx, iv, seq); - ctx->do_encrypt(ctx, output, input, inlen, iv, aad, aadlen, supp); -} - -void ptls_aead__build_iv(ptls_aead_context_t *ctx, uint8_t *iv, uint64_t seq) -{ - size_t iv_size = ctx->algo->iv_size, i; - const uint8_t *s = ctx->static_iv; + size_t iv_size = algo->iv_size, i; + const uint8_t *s = static_iv; uint8_t *d = iv; /* build iv */ diff --git a/t/fusion.c b/t/fusion.c index d23103bb9..dbe0b6e7b 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -57,11 +57,11 @@ int main(int argc, char **argv) ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); uint8_t encrypted[sizeof(expected)], 
decrypted[sizeof(expected) - 16]; - ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, zero, "hello", 5, NULL); + ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, _mm_setzero_si128(), "hello", 5, NULL); ok(memcmp(expected, encrypted, sizeof(expected)) == 0); memset(decrypted, 0x55, sizeof(decrypted)); - ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, zero, "hello", 5, expected + 16)); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, _mm_setzero_si128(), "hello", 5, expected + 16)); ok(memcmp(decrypted, zero, sizeof(decrypted)) == 0); ptls_fusion_aesgcm_destroy(ctx); @@ -72,9 +72,9 @@ int main(int argc, char **argv) 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); uint8_t encrypted[17], decrypted[1] = {0x55}; - ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, zero, "a", 1, NULL); + ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, _mm_setzero_si128(), "a", 1, NULL); ok(memcmp(expected, encrypted, 17) == 0); - ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, zero, "a", 1, expected + 1)); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, _mm_setzero_si128(), "a", 1, expected + 1)); ok('X' == decrypted[0]); ptls_fusion_aesgcm_destroy(ctx); } @@ -85,42 +85,42 @@ int main(int argc, char **argv) for (int i = 0; i < 2; ++i) { uint8_t encrypted[sizeof(zero) + 16], decrypted[sizeof(zero)]; -#define DOIT(iv, aad, aadlen, ptlen, expected_tag, expected_supp) \ +#define DOIT(aad, aadlen, ptlen, expected_tag, expected_supp) \ do { \ memset(encrypted, 0xcc, sizeof(encrypted)); \ - ptls_fusion_aesgcm_encrypt(aead, encrypted, zero, ptlen, iv, aad, aadlen, supp); \ + ptls_fusion_aesgcm_encrypt(aead, encrypted, zero, ptlen, _mm_setzero_si128(), aad, aadlen, supp); \ printf("%s\n", tostr(encrypted + ptlen, 16)); \ ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ if (supp != NULL) \ ok(strcmp(tostr(supp->output, sizeof(supp->output)), 
expected_supp) == 0); \ memset(decrypted, 0x55, sizeof(decrypted)); \ - ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, ptlen, iv, aad, aadlen, encrypted + ptlen)); \ + ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, ptlen, _mm_setzero_si128(), aad, aadlen, encrypted + ptlen)); \ ok(memcmp(decrypted, zero, ptlen) == 0); \ } while (0) - DOIT(zero, zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf", "4576f18ef3ae9dfd37cf72c4592da874"); - DOIT(zero, zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 64, "66165d39739c50c90727e7d49127146b", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 96, "6b9c468e24ed96010687f3880a044d42", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf", "4576f18ef3ae9dfd37cf72c4592da874"); + DOIT(zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 64, "66165d39739c50c90727e7d49127146b", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78", 
"a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 96, "6b9c468e24ed96010687f3880a044d42", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 22, 1328, "0507baaece8d573774c94e8103821316", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 21, 1329, "dd70d59030eadb6313e778046540a253", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 20, 1330, "f1b456b955afde7603188af0124a32ef", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 22, 1328, "0507baaece8d573774c94e8103821316", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 21, 1329, "dd70d59030eadb6313e778046540a253", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 20, 1330, "f1b456b955afde7603188af0124a32ef", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 0, 80, "98885a3a22bd4742fe7b72172193b163", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 0, 80, "98885a3a22bd4742fe7b72172193b163", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, zero, 20, 85, "afe8b727057c804a0525c2914ef856b0", "a062016e90dcc316d061fde5424cf34f"); + 
DOIT(zero, 20, 85, "afe8b727057c804a0525c2914ef856b0", "a062016e90dcc316d061fde5424cf34f"); #undef DOIT @@ -148,15 +148,14 @@ int main(int argc, char **argv) 0x77, 0x08, 0xa9, 0x60, 0x17, 0x73, 0xc5, 0x07, 0xf3, 0x04, 0xc9, 0x3f, 0x67, 0x4d, 0x12, 0xa1, 0x02, 0x93, 0xc2, 0x3c, 0xd3, 0xf8, 0x59, 0x33, 0xd5, 0x01, 0xc3, 0xbb, 0xaa, 0xe6, 0x3f, 0xbb, 0x23, 0x66, 0x94, 0x26, 0x28, 0x43, 0xa5, 0xfd, 0x2f}; - ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(key, sizeof(aad) + sizeof(plaintext)); + ptls_aead_context_t *aead = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 0, key, iv); uint8_t encrypted[sizeof(plaintext) + 16], decrypted[sizeof(plaintext)]; - ptls_fusion_aesgcm_encrypt(aead, encrypted, plaintext, sizeof(plaintext), iv, aad, sizeof(aad), NULL); + ptls_aead_encrypt(aead, encrypted, plaintext, sizeof(plaintext), 0, aad, sizeof(aad)); ok(memcmp(expected, encrypted, sizeof(plaintext)) == 0); ok(memcmp(expected + sizeof(plaintext), encrypted + sizeof(plaintext), 16) == 0); - ok(ptls_fusion_aesgcm_decrypt(aead, decrypted, encrypted, sizeof(plaintext), iv, aad, sizeof(aad), - encrypted + sizeof(plaintext))); + ok(ptls_aead_decrypt(aead, decrypted, encrypted, sizeof(encrypted), 0, aad, sizeof(aad)) == sizeof(plaintext)); ok(memcmp(decrypted, plaintext, sizeof(plaintext)) == 0); - ptls_fusion_aesgcm_destroy(aead); + ptls_aead_free(aead); } return done_testing(); diff --git a/t/fusionbench.c b/t/fusionbench.c index f2f35aa53..beba92465 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -7,7 +7,7 @@ int main(int argc, char **argv) { - static const uint8_t key[16] = {}, iv[12] = {}, aad[13] = {}; + static const uint8_t key[16] = {}, aad[13] = {}; size_t textlen = 16384; ptls_aead_supplementary_encryption_t *supp = NULL; int ch, decrypt = 0, count = 1000000; @@ -57,11 +57,11 @@ int main(int argc, char **argv) if (!decrypt) { for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), supp); + 
ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, _mm_setzero_si128(), aad, sizeof(aad), supp); } else { uint8_t tag[16] = {}; for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, iv, aad, sizeof(aad), &tag); + ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, _mm_setzero_si128(), aad, sizeof(aad), &tag); } for (int i = 0; i < 16; ++i) From 076982f295d4e4a64f5da1126398de87de89d68a Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 14 May 2020 09:28:44 +0900 Subject: [PATCH 40/60] oops, argument to slli is in bytes --- lib/fusion.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fusion.c b/lib/fusion.c index 78151edb1..979469b41 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -787,7 +787,7 @@ static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq) { __m128i ctr = _mm_setzero_si128(); ctr = _mm_insert_epi64(ctr, seq, 0); - ctr = _mm_slli_si128(ctr, 32); + ctr = _mm_slli_si128(ctr, 4); ctr = _mm_xor_si128(ctx->static_iv, ctr); return ctr; } From 6d1eaab5dfabb8c86bf8e679126cd8cf8d20d205 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 14 May 2020 16:23:21 +0900 Subject: [PATCH 41/60] set `-mavx2` as well --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 275a6d7f2..6ea4185e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,14 +137,14 @@ IF ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")) MESSAGE(STATUS " Enabling fusion support") ADD_LIBRARY(picotls-fusion lib/fusion.c) - SET_TARGET_PROPERTIES(picotls-fusion PROPERTIES COMPILE_FLAGS "-maes -mpclmul") + SET_TARGET_PROPERTIES(picotls-fusion PROPERTIES COMPILE_FLAGS "-mavx2 -maes -mpclmul") TARGET_LINK_LIBRARIES(picotls-fusion picotls-core) ADD_EXECUTABLE(test-fusion.t deps/picotest/picotest.c lib/picotls.c lib/fusion.c t/fusion.c) - SET_TARGET_PROPERTIES(test-fusion.t PROPERTIES COMPILE_FLAGS "-maes -mpclmul") + 
SET_TARGET_PROPERTIES(test-fusion.t PROPERTIES COMPILE_FLAGS "-mavx2 -maes -mpclmul") SET(TEST_EXES ${TEST_EXES} test-fusion.t) ENDIF () From 9c230ef959378a63a25579891874c18c50758444 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Thu, 14 May 2020 16:47:26 +0900 Subject: [PATCH 42/60] create dependency --- CMakeLists.txt | 1 + cmake/dtrace-utils.cmake | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ea4185e4..fc3cb0045 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,7 @@ IF ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND lib/fusion.c t/fusion.c) SET_TARGET_PROPERTIES(test-fusion.t PROPERTIES COMPILE_FLAGS "-mavx2 -maes -mpclmul") + ADD_DEPENDENCIES(test-fusion.t generate-picotls-probes) SET(TEST_EXES ${TEST_EXES} test-fusion.t) ENDIF () diff --git a/cmake/dtrace-utils.cmake b/cmake/dtrace-utils.cmake index da566461f..191953174 100644 --- a/cmake/dtrace-utils.cmake +++ b/cmake/dtrace-utils.cmake @@ -27,10 +27,14 @@ FUNCTION (DEFINE_DTRACE_DEPENDENCIES d_file prefix) OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.h COMMAND dtrace -o ${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.h -s ${d_file} -h DEPENDS ${d_file}) + ADD_CUSTOM_TARGET(generate-${prefix}-probes DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.h) + SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.h PROPERTIES GENERATED TRUE) IF (DTRACE_USES_OBJFILE) ADD_CUSTOM_COMMAND( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.o COMMAND dtrace -o ${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.o -s ${d_file} -G DEPENDS ${d_file}) + ADD_DEPENDENCIES(generate-${prefix}-probes ${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.o) + SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_BINARY_DIR}/${prefix}-probes.o PROPERTIES GENERATED TRUE) ENDIF () ENDFUNCTION () From 3ee790b75506b9d8775e2fa3b4b29df8babb19af Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 03:35:03 +0900 Subject: [PATCH 43/60] check CPU features --- 
include/picotls/fusion.h | 5 +++++ lib/fusion.c | 12 ++++++++++++ t/fusion.c | 5 +++++ 3 files changed, 22 insertions(+) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index fede03d57..d7da45ecc 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -80,6 +80,11 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, extern ptls_cipher_algorithm_t ptls_fusion_aes128ctr; extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; +/** + * Returns a boolean indicating if fusion can be used. + */ +int ptls_fusion_is_supported_by_cpu(void); + #ifdef __cplusplus } #endif diff --git a/lib/fusion.c b/lib/fusion.c index 979469b41..d1f074aad 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -848,3 +848,15 @@ ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", PTLS_AESGCM_TAG_SIZE, sizeof(struct aesgcm_context), aes128gcm_setup}; + +int ptls_fusion_is_supported_by_cpu(void) +{ +#define REQUIRE(s) \ + if (!__builtin_cpu_supports(s)) \ + return 0; + REQUIRE("avx2"); + REQUIRE("aes"); + REQUIRE("pclmul"); +#undef REQUIRE + return 1; +} diff --git a/t/fusion.c b/t/fusion.c index dbe0b6e7b..a804e835e 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -48,6 +48,11 @@ static const char *tostr(const void *_p, size_t len) int main(int argc, char **argv) { + if (!ptls_fusion_is_supported_by_cpu()) { + note("CPU does have the necessary features (avx2, aes, pclmul)\n"); + return done_testing(); + } + static const uint8_t zero[16384] = {}, one[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; { From 3604f8bef6f723f076c15e35024267cea1cea5d7 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 04:24:27 +0900 Subject: [PATCH 44/60] old versions of GCC (e.g. 
5.4) cannot detect support for aes,pclmul --- lib/fusion.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index d1f074aad..c121bee3e 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -849,14 +850,30 @@ ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", sizeof(struct aesgcm_context), aes128gcm_setup}; +#include + int ptls_fusion_is_supported_by_cpu(void) { -#define REQUIRE(s) \ - if (!__builtin_cpu_supports(s)) \ + unsigned leaf1_ecx, leaf7_ebx; + + { /* GCC-specific code to obtain CPU features */ + unsigned unused1, unused2, unused3; + if (!__get_cpuid(1, &unused1, &unused2, &leaf1_ecx, &unused3)) + return 0; + if (!__get_cpuid_count(7, 0, &unused1, &leaf7_ebx, &unused2, &unused3)) + return 0; + } + + + /* AVX2 */ + if ((leaf7_ebx & (1 << 5)) == 0) return 0; - REQUIRE("avx2"); - REQUIRE("aes"); - REQUIRE("pclmul"); -#undef REQUIRE + /* AES */ + if ((leaf1_ecx & (1 << 25)) == 0) + return 0; + /* PCLMUL */ + if ((leaf1_ecx & (1 << 1)) == 0) + return 0; + return 1; } From efce043e7695ff24916d58442ebbbfb6eb65c772 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 04:51:58 +0900 Subject: [PATCH 45/60] __get_cpuid_count is also unavailable on older versions of GCC --- lib/fusion.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index c121bee3e..43bc3367b 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -850,21 +850,19 @@ ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", sizeof(struct aesgcm_context), aes128gcm_setup}; -#include - int ptls_fusion_is_supported_by_cpu(void) { unsigned leaf1_ecx, leaf7_ebx; { /* GCC-specific code to obtain CPU features */ - unsigned unused1, unused2, unused3; - if (!__get_cpuid(1, &unused1, &unused2, &leaf1_ecx, &unused3)) - return 0; - if (!__get_cpuid_count(7, 0, &unused1, &leaf7_ebx, 
&unused2, &unused3)) + unsigned leaf_cnt; + __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx"); + if (leaf_cnt < 7) return 0; + __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx"); + __asm__("cpuid" : "=b"(leaf7_ebx) : "a"(7), "c"(0) : "edx"); } - /* AVX2 */ if ((leaf7_ebx & (1 << 5)) == 0) return 0; From 31ebd7d81e7a5ac4b26f0ce8e360343ede0e70d0 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 06:37:23 +0900 Subject: [PATCH 46/60] new / free are the terms that we use --- include/picotls/fusion.h | 4 ++-- lib/fusion.c | 8 ++++---- t/fusion.c | 12 ++++++------ t/fusionbench.c | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index d7da45ecc..786df27a3 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -46,11 +46,11 @@ void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx); * @param key the AES key (128 bits) * @param max_size maximum size of the record (i.e. AAD + encrypted payload) */ -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size); +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t max_size); /** * Destroys an AES-GCM context. */ -void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx); +void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx); /** * Encrypts an AEAD block, and in parallel, optionally encrypts one block using AES-ECB. 
* @param ctx context diff --git a/lib/fusion.c b/lib/fusion.c index 43bc3367b..70b229ae3 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -683,7 +683,7 @@ void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) ptls_clear_memory(ctx, sizeof(*ctx)); } -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t max_size) +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t max_size) { ptls_fusion_aesgcm_context_t *ctx; size_t ghash_cnt = (max_size + 15) / 16 + 2; // round-up by block size, add to handle worst split of the size between AAD and @@ -710,7 +710,7 @@ ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_create(const void *key, size_t return ctx; } -void ptls_fusion_aesgcm_destroy(ptls_fusion_aesgcm_context_t *ctx) +void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx) { ptls_clear_memory(ctx->ghash, sizeof(ctx->ghash[0]) * ctx->ghash_cnt); ctx->ghash_cnt = 0; @@ -764,7 +764,7 @@ static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; - ptls_fusion_aesgcm_destroy(ctx->aesgcm); + ptls_fusion_aesgcm_free(ctx->aesgcm); } static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) @@ -830,7 +830,7 @@ static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *ke ctx->super.do_encrypt = aead_do_encrypt; ctx->super.do_decrypt = aead_do_decrypt; - ctx->aesgcm = ptls_fusion_aesgcm_create(key, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ + ctx->aesgcm = ptls_fusion_aesgcm_new(key, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ return 0; } diff --git a/t/fusion.c b/t/fusion.c index a804e835e..34dff0daa 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -59,7 +59,7 @@ int main(int argc, char **argv) static const uint8_t expected[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 
0xb2, 0xfe, 0x78, 0x97, 0x3f, 0xbc, 0xa6, 0x54, 0x77, 0xbf, 0x47, 0x85, 0xb0, 0xd5, 0x61, 0xf7, 0xe3, 0xfd, 0x6c}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 5 + 16); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, 5 + 16); uint8_t encrypted[sizeof(expected)], decrypted[sizeof(expected) - 16]; ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, _mm_setzero_si128(), "hello", 5, NULL); @@ -69,23 +69,23 @@ int main(int argc, char **argv) ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, _mm_setzero_si128(), "hello", 5, expected + 16)); ok(memcmp(decrypted, zero, sizeof(decrypted)) == 0); - ptls_fusion_aesgcm_destroy(ctx); + ptls_fusion_aesgcm_free(ctx); } { /* test capacity */ static const uint8_t expected[17] = {0x5b, 0x27, 0x21, 0x5e, 0xd8, 0x1a, 0x70, 0x2e, 0x39, 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(zero, 2); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, 2); uint8_t encrypted[17], decrypted[1] = {0x55}; ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, _mm_setzero_si128(), "a", 1, NULL); ok(memcmp(expected, encrypted, 17) == 0); ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, _mm_setzero_si128(), "a", 1, expected + 1)); ok('X' == decrypted[0]); - ptls_fusion_aesgcm_destroy(ctx); + ptls_fusion_aesgcm_free(ctx); } { - ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_create(zero, sizeof(zero)); + ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_new(zero, sizeof(zero)); ptls_aead_supplementary_encryption_t *supp = NULL; for (int i = 0; i < 2; ++i) { @@ -136,7 +136,7 @@ int main(int argc, char **argv) ptls_cipher_free(supp->ctx); free(supp); - ptls_fusion_aesgcm_destroy(aead); + ptls_fusion_aesgcm_free(aead); } { diff --git a/t/fusionbench.c b/t/fusionbench.c index beba92465..b7a7d3b02 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -53,7 +53,7 @@ int main(int argc, char **argv) if (supp 
!= NULL) supp->input = textlen >= 2 ? text + 2 : text + textlen; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_create(key, sizeof(aad) + textlen); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(key, sizeof(aad) + textlen); if (!decrypt) { for (int i = 0; i < count; ++i) From 4c19f5045de550152db86df1b4e9ad012ae0ced2 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 08:30:35 +0900 Subject: [PATCH 47/60] AES256 --- include/picotls/fusion.h | 11 +++-- lib/fusion.c | 90 +++++++++++++++++++++++++++++----------- t/fusion.c | 21 ++++++++-- t/fusionbench.c | 2 +- 4 files changed, 91 insertions(+), 33 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 786df27a3..85cdc0db0 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -30,23 +30,26 @@ extern "C" { #include #include "../picotls.h" -#define PTLS_FUSION_AES_ROUNDS 10 /* TODO support AES256 */ +#define PTLS_FUSION_AES128_ROUNDS 10 +#define PTLS_FUSION_AES256_ROUNDS 14 typedef struct ptls_fusion_aesecb_context { - __m128i keys[PTLS_FUSION_AES_ROUNDS + 1]; + __m128i keys[PTLS_FUSION_AES256_ROUNDS + 1]; + unsigned rounds; } ptls_fusion_aesecb_context_t; typedef struct ptls_fusion_aesgcm_context ptls_fusion_aesgcm_context_t; -void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key); +void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size); void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx); +void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src); /** * Creates an AES-GCM context. * @param key the AES key (128 bits) * @param max_size maximum size of the record (i.e. 
AAD + encrypted payload) */ -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t max_size); +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t max_size); /** * Destroys an AES-GCM context. */ diff --git a/lib/fusion.c b/lib/fusion.c index 70b229ae3..8be2bcb88 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -187,7 +187,7 @@ static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i size_t i; v = _mm_xor_si128(v, ctx->keys[0]); - for (i = 1; i < PTLS_FUSION_AES_ROUNDS; ++i) + for (i = 1; i < ctx->rounds; ++i) v = _mm_aesenc_si128(v, ctx->keys[i]); v = _mm_aesenclast_si128(v, ctx->keys[i]); @@ -272,14 +272,14 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } while (0) /* aesenclast */ -#define AESECB6_FINAL() \ +#define AESECB6_FINAL(i) \ do { \ - __m128i k = ctx->ecb.keys[10]; \ + __m128i k = ctx->ecb.keys[i]; \ bits0 = _mm_aesenclast_si128(bits0, k); \ bits1 = _mm_aesenclast_si128(bits1, k); \ bits2 = _mm_aesenclast_si128(bits2, k); \ bits3 = _mm_aesenclast_si128(bits3, k); \ - bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); \ + bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \ bits5 = _mm_aesenclast_si128(bits5, k); \ } while (0) @@ -313,11 +313,13 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, ctr = _mm_insert_epi32(ctr, 1, 0); ek0 = _mm_shuffle_epi8(ctr, bswap8); - /* prepare the first bit stream */ - AESECB6_INIT(); - for (size_t i = 1; i < 10; ++i) - AESECB6_UPDATE(i); - AESECB6_FINAL(); + { /* prepare the first bit stream */ + size_t i; + AESECB6_INIT(); + for (i = 1; i < ctx->ecb.rounds; ++i) + AESECB6_UPDATE(i); + AESECB6_FINAL(i); + } /* the main loop */ while (1) { @@ -416,13 +418,14 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } /* run AES and multiplication in parallel */ - for (size_t i = 2; i <= 7; ++i) { + size_t i; + for (i = 2; i <= 7; ++i) { 
AESECB6_UPDATE(i); gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); } - AESECB6_UPDATE(8); - AESECB6_UPDATE(9); - AESECB6_FINAL(); + for (; i < ctx->ecb.rounds; ++i) + AESECB6_UPDATE(i); + AESECB6_FINAL(i); } Finish: @@ -450,8 +453,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } do { bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]); - } while (i != 10); - bits4 = _mm_aesenclast_si128(bits4, bits4keys[10]); + } while (i != ctx->ecb.rounds); + bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); _mm_storeu_si128((__m128i *)supp->output, bits4); } @@ -571,7 +574,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, AESECB6_UPDATE(aesi); gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); } - for (; aesi <= 9; ++aesi) + for (; aesi < ctx->ecb.rounds; ++aesi) AESECB6_UPDATE(aesi); __m128i k = ctx->ecb.keys[aesi]; bits0 = _mm_aesenclast_si128(bits0, k); @@ -646,24 +649,51 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, #undef STATE_GHASH_HAS_MORE } -static __m128i expand_key(__m128i key, __m128i t) +static __m128i expand_key(__m128i key, __m128i temp) { - t = _mm_shuffle_epi32(t, _MM_SHUFFLE(3, 3, 3, 3)); key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); - return _mm_xor_si128(key, t); + + key = _mm_xor_si128(key, temp); + + return key; } -void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key) +void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size) { + assert(is_enc && "decryption is not supported (yet)"); + size_t i = 0; + switch (key_size) { + case 16: /* AES128 */ + ctx->rounds = 10; + break; + case 32: /* AES256 */ + ctx->rounds = 14; + break; + default: + assert(!"invalid key size; AES128 / AES256 are supported"); + break; + } + ctx->keys[i++] = 
_mm_loadu_si128((__m128i *)key); + if (key_size == 32) + ctx->keys[i++] = _mm_loadu_si128((__m128i *)key + 1); + #define EXPAND(R) \ do { \ - ctx->keys[i] = expand_key(ctx->keys[i - 1], _mm_aeskeygenassist_si128(ctx->keys[i - 1], R)); \ + ctx->keys[i] = expand_key(ctx->keys[i - key_size / 16], \ + _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \ + if (i == ctx->rounds) \ + goto Done; \ ++i; \ + if (key_size > 24) { \ + ctx->keys[i] = expand_key(ctx->keys[i - key_size / 16], \ + _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \ + ++i; \ + } \ } while (0) EXPAND(0x1); EXPAND(0x2); @@ -676,6 +706,8 @@ void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, const void *key) EXPAND(0x1b); EXPAND(0x36); #undef EXPAND +Done: + assert(i == ctx->rounds); } void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) @@ -683,7 +715,14 @@ void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) ptls_clear_memory(ctx, sizeof(*ctx)); } -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t max_size) +void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src) +{ + __m128i v = _mm_loadu_si128(src); + v = aesecb_encrypt(ctx, v); + _mm_storeu_si128(dst, v); +} + +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t max_size) { ptls_fusion_aesgcm_context_t *ctx; size_t ghash_cnt = (max_size + 15) / 16 + 2; // round-up by block size, add to handle worst split of the size between AAD and @@ -692,7 +731,7 @@ ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t max if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) return NULL; - ptls_fusion_aesecb_init(&ctx->ecb, key); + ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size); ctx->ghash_cnt = ghash_cnt; ctx->ghash[0].H = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128()); @@ 
-754,7 +793,7 @@ static int aes128ctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void * ctx->super.do_dispose = ctr_dispose; ctx->super.do_init = ctr_init; ctx->super.do_transform = ctr_transform; - ptls_fusion_aesecb_init(&ctx->fusion, key); + ptls_fusion_aesecb_init(&ctx->fusion, 1, key, PTLS_AES128_KEY_SIZE); ctx->is_ready = 0; return 0; @@ -830,7 +869,8 @@ static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *ke ctx->super.do_encrypt = aead_do_encrypt; ctx->super.do_decrypt = aead_do_decrypt; - ctx->aesgcm = ptls_fusion_aesgcm_new(key, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ + ctx->aesgcm = ptls_fusion_aesgcm_new(key, PTLS_AES128_KEY_SIZE, + 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ return 0; } diff --git a/t/fusion.c b/t/fusion.c index 34dff0daa..f9ebd2d49 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -55,11 +55,26 @@ int main(int argc, char **argv) static const uint8_t zero[16384] = {}, one[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + { + ptls_fusion_aesecb_context_t ecb; + uint8_t encrypted[16]; + + ptls_fusion_aesecb_init(&ecb, 1, zero, 16); + ptls_fusion_aesecb_encrypt(&ecb, encrypted, "hello world!!!!!"); + ptls_fusion_aesecb_dispose(&ecb); + ok(strcmp(tostr(encrypted, 16), "172afecb50b5f1237814b2f7cb51d0f7") == 0); + + ptls_fusion_aesecb_init(&ecb, 1, zero, 32); + ptls_fusion_aesecb_encrypt(&ecb, encrypted, "hello world!!!!!"); + ptls_fusion_aesecb_dispose(&ecb); + ok(strcmp(tostr(encrypted, 16), "2a033f0627b3554aa4fe5786550736ff") == 0); + } + { static const uint8_t expected[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78, 0x97, 0x3f, 0xbc, 0xa6, 0x54, 0x77, 0xbf, 0x47, 0x85, 0xb0, 0xd5, 0x61, 0xf7, 0xe3, 0xfd, 0x6c}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, 5 + 16); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, 
PTLS_AES128_KEY_SIZE, 5 + 16); uint8_t encrypted[sizeof(expected)], decrypted[sizeof(expected) - 16]; ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, _mm_setzero_si128(), "hello", 5, NULL); @@ -75,7 +90,7 @@ int main(int argc, char **argv) { /* test capacity */ static const uint8_t expected[17] = {0x5b, 0x27, 0x21, 0x5e, 0xd8, 0x1a, 0x70, 0x2e, 0x39, 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, 2); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, 2); uint8_t encrypted[17], decrypted[1] = {0x55}; ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, _mm_setzero_si128(), "a", 1, NULL); ok(memcmp(expected, encrypted, 17) == 0); @@ -85,7 +100,7 @@ int main(int argc, char **argv) } { - ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_new(zero, sizeof(zero)); + ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, sizeof(zero)); ptls_aead_supplementary_encryption_t *supp = NULL; for (int i = 0; i < 2; ++i) { diff --git a/t/fusionbench.c b/t/fusionbench.c index b7a7d3b02..0599fefe0 100644 --- a/t/fusionbench.c +++ b/t/fusionbench.c @@ -53,7 +53,7 @@ int main(int argc, char **argv) if (supp != NULL) supp->input = textlen >= 2 ? 
text + 2 : text + textlen; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(key, sizeof(aad) + textlen); + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(key, sizeof(key), sizeof(aad) + textlen); if (!decrypt) { for (int i = 0; i < count; ++i) From 77f1b8b5bc1b239f750cc837718db8d26238bf59 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 11:06:49 +0900 Subject: [PATCH 48/60] organize tests --- t/fusion.c | 188 ++++++++++++++++++++++++++++------------------------- 1 file changed, 98 insertions(+), 90 deletions(-) diff --git a/t/fusion.c b/t/fusion.c index f9ebd2d49..8adc21bae 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -46,70 +46,90 @@ static const char *tostr(const void *_p, size_t len) return buf; } -int main(int argc, char **argv) -{ - if (!ptls_fusion_is_supported_by_cpu()) { - note("CPU does have the necessary features (avx2, aes, pclmul)\n"); - return done_testing(); - } - - static const uint8_t zero[16384] = {}, one[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const uint8_t zero[16384] = {}; - { - ptls_fusion_aesecb_context_t ecb; - uint8_t encrypted[16]; - - ptls_fusion_aesecb_init(&ecb, 1, zero, 16); - ptls_fusion_aesecb_encrypt(&ecb, encrypted, "hello world!!!!!"); - ptls_fusion_aesecb_dispose(&ecb); - ok(strcmp(tostr(encrypted, 16), "172afecb50b5f1237814b2f7cb51d0f7") == 0); - - ptls_fusion_aesecb_init(&ecb, 1, zero, 32); - ptls_fusion_aesecb_encrypt(&ecb, encrypted, "hello world!!!!!"); - ptls_fusion_aesecb_dispose(&ecb); - ok(strcmp(tostr(encrypted, 16), "2a033f0627b3554aa4fe5786550736ff") == 0); - } +static void ecb(void) +{ + ptls_fusion_aesecb_context_t ecb; + uint8_t encrypted[16]; + + ptls_fusion_aesecb_init(&ecb, 1, zero, 16); + ptls_fusion_aesecb_encrypt(&ecb, encrypted, "hello world!!!!!"); + ptls_fusion_aesecb_dispose(&ecb); + ok(strcmp(tostr(encrypted, 16), "172afecb50b5f1237814b2f7cb51d0f7") == 0); + + ptls_fusion_aesecb_init(&ecb, 1, zero, 32); + ptls_fusion_aesecb_encrypt(&ecb, 
encrypted, "hello world!!!!!"); + ptls_fusion_aesecb_dispose(&ecb); + ok(strcmp(tostr(encrypted, 16), "2a033f0627b3554aa4fe5786550736ff") == 0); +} +static void gcm_basic(void) +{ { static const uint8_t expected[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78, 0x97, 0x3f, 0xbc, 0xa6, 0x54, 0x77, 0xbf, 0x47, 0x85, 0xb0, 0xd5, 0x61, 0xf7, 0xe3, 0xfd, 0x6c}; ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, 5 + 16); uint8_t encrypted[sizeof(expected)], decrypted[sizeof(expected) - 16]; - ptls_fusion_aesgcm_encrypt(ctx, encrypted, zero, 16, _mm_setzero_si128(), "hello", 5, NULL); ok(memcmp(expected, encrypted, sizeof(expected)) == 0); - memset(decrypted, 0x55, sizeof(decrypted)); ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 16, _mm_setzero_si128(), "hello", 5, expected + 16)); ok(memcmp(decrypted, zero, sizeof(decrypted)) == 0); - ptls_fusion_aesgcm_free(ctx); } - { /* test capacity */ - static const uint8_t expected[17] = {0x5b, 0x27, 0x21, 0x5e, 0xd8, 0x1a, 0x70, 0x2e, 0x39, - 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, 2); - uint8_t encrypted[17], decrypted[1] = {0x55}; - ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, _mm_setzero_si128(), "a", 1, NULL); - ok(memcmp(expected, encrypted, 17) == 0); - ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, _mm_setzero_si128(), "a", 1, expected + 1)); - ok('X' == decrypted[0]); - ptls_fusion_aesgcm_free(ctx); + { + static const uint8_t key[16] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, + aad[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + iv[] = {20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, + plaintext[] = + "hello world\nhello world\nhello world\nhello world\nhello world\nhello world\nhello world\n"; + static const uint8_t 
expected[] = {0xd3, 0xa8, 0x1d, 0x96, 0x4c, 0x9b, 0x02, 0xd7, 0x9a, 0xb0, 0x41, 0x07, 0x4c, 0x8c, 0xe2, + 0xe0, 0x2e, 0x83, 0x54, 0x52, 0x45, 0xcb, 0xd4, 0x68, 0xc8, 0x43, 0x45, 0xca, 0x91, 0xfb, + 0xa3, 0x7a, 0x67, 0xed, 0xe8, 0xd7, 0x5e, 0xe2, 0x33, 0xd1, 0x3e, 0xbf, 0x50, 0xc2, 0x4b, + 0x86, 0x83, 0x55, 0x11, 0xbb, 0x17, 0x4f, 0xf5, 0x78, 0xb8, 0x65, 0xeb, 0x9a, 0x2b, 0x8f, + 0x77, 0x08, 0xa9, 0x60, 0x17, 0x73, 0xc5, 0x07, 0xf3, 0x04, 0xc9, 0x3f, 0x67, 0x4d, 0x12, + 0xa1, 0x02, 0x93, 0xc2, 0x3c, 0xd3, 0xf8, 0x59, 0x33, 0xd5, 0x01, 0xc3, 0xbb, 0xaa, 0xe6, + 0x3f, 0xbb, 0x23, 0x66, 0x94, 0x26, 0x28, 0x43, 0xa5, 0xfd, 0x2f}; + ptls_aead_context_t *aead = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 0, key, iv); + uint8_t encrypted[sizeof(plaintext) + 16], decrypted[sizeof(plaintext)]; + ptls_aead_encrypt(aead, encrypted, plaintext, sizeof(plaintext), 0, aad, sizeof(aad)); + ok(memcmp(expected, encrypted, sizeof(plaintext)) == 0); + ok(memcmp(expected + sizeof(plaintext), encrypted + sizeof(plaintext), 16) == 0); + ok(ptls_aead_decrypt(aead, decrypted, encrypted, sizeof(encrypted), 0, aad, sizeof(aad)) == sizeof(plaintext)); + ok(memcmp(decrypted, plaintext, sizeof(plaintext)) == 0); + ptls_aead_free(aead); } +} - { - ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, sizeof(zero)); - ptls_aead_supplementary_encryption_t *supp = NULL; +static void gcm_capacity(void) +{ + static const uint8_t expected[17] = {0x5b, 0x27, 0x21, 0x5e, 0xd8, 0x1a, 0x70, 0x2e, 0x39, + 0x41, 0xc8, 0x05, 0x77, 0xd5, 0x2f, 0xcb, 0x57}; + ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, 2); + uint8_t encrypted[17], decrypted[1] = {0x55}; + ptls_fusion_aesgcm_encrypt(ctx, encrypted, "X", 1, _mm_setzero_si128(), "a", 1, NULL); + ok(memcmp(expected, encrypted, 17) == 0); + ok(ptls_fusion_aesgcm_decrypt(ctx, decrypted, expected, 1, _mm_setzero_si128(), "a", 1, expected + 1)); + ok('X' == decrypted[0]); + 
ptls_fusion_aesgcm_free(ctx); +} + +static void gcm_test_vectors(void) +{ + static const uint8_t one[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + ptls_fusion_aesgcm_context_t *aead = ptls_fusion_aesgcm_new(zero, PTLS_AES128_KEY_SIZE, sizeof(zero)); + ptls_aead_supplementary_encryption_t *supp = NULL; - for (int i = 0; i < 2; ++i) { - uint8_t encrypted[sizeof(zero) + 16], decrypted[sizeof(zero)]; + for (int i = 0; i < 2; ++i) { + uint8_t encrypted[sizeof(zero) + 16], decrypted[sizeof(zero)]; #define DOIT(aad, aadlen, ptlen, expected_tag, expected_supp) \ do { \ memset(encrypted, 0xcc, sizeof(encrypted)); \ ptls_fusion_aesgcm_encrypt(aead, encrypted, zero, ptlen, _mm_setzero_si128(), aad, aadlen, supp); \ - printf("%s\n", tostr(encrypted + ptlen, 16)); \ ok(strcmp(tostr(encrypted + ptlen, 16), expected_tag) == 0); \ if (supp != NULL) \ ok(strcmp(tostr(supp->output, sizeof(supp->output)), expected_supp) == 0); \ @@ -118,65 +138,53 @@ int main(int argc, char **argv) ok(memcmp(decrypted, zero, ptlen) == 0); \ } while (0) - DOIT(zero, 13, 17, "1b4e515384e8aa5bb781ee12549a2ccf", "4576f18ef3ae9dfd37cf72c4592da874"); - DOIT(zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 64, "66165d39739c50c90727e7d49127146b", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 96, "6b9c468e24ed96010687f3880a044d42", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 
13, 17, "1b4e515384e8aa5bb781ee12549a2ccf", "4576f18ef3ae9dfd37cf72c4592da874"); + DOIT(zero, 13, 32, "84030586f55adf8ac3c145913c6fd0f8", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 64, "66165d39739c50c90727e7d49127146b", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 65, "eb3b75e1d4431e1bb67da46f6a1a0edd", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 79, "8f4a96c7390c26bb15b68865e6a861b9", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 80, "5cc2554857b19e7a9e18d015feac61fd", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 81, "5a65f0d4db36c981bf7babd11691fe78", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 95, "6a8a51152efe928999a610d8a7b1df9d", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 96, "6b9c468e24ed96010687f3880a044d42", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 97, "1b4eb785b884a7d4fdebaff81c1c12e8", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 22, 1328, "0507baaece8d573774c94e8103821316", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 21, 1329, "dd70d59030eadb6313e778046540a253", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 20, 1330, "f1b456b955afde7603188af0124a32ef", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 22, 1328, "0507baaece8d573774c94e8103821316", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 21, 1329, "dd70d59030eadb6313e778046540a253", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 20, 1330, "f1b456b955afde7603188af0124a32ef", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 11, 1339, "9827f0b34252160d0365ffaa9364bedc", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 13, 1337, "a22deec51250a7eb1f4384dea5f2e890", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 12, 1338, "42102b0a499b2efa89702ece4b0c5789", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 11, 
1339, "9827f0b34252160d0365ffaa9364bedc", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 0, 80, "98885a3a22bd4742fe7b72172193b163", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 0, 80, "98885a3a22bd4742fe7b72172193b163", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 0, 96, "afd649fc51e14f3966e4518ad53b9ddc", "a062016e90dcc316d061fde5424cf34f"); - DOIT(zero, 20, 85, "afe8b727057c804a0525c2914ef856b0", "a062016e90dcc316d061fde5424cf34f"); + DOIT(zero, 20, 85, "afe8b727057c804a0525c2914ef856b0", "a062016e90dcc316d061fde5424cf34f"); #undef DOIT - supp = malloc(sizeof(*supp)); - supp->ctx = ptls_cipher_new(&ptls_fusion_aes128ctr, 1, one); - supp->input = encrypted + 2; - } - - ptls_cipher_free(supp->ctx); - free(supp); - ptls_fusion_aesgcm_free(aead); + supp = malloc(sizeof(*supp)); + supp->ctx = ptls_cipher_new(&ptls_fusion_aes128ctr, 1, one); + supp->input = encrypted + 2; } - { - static const uint8_t key[16] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, - 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, - aad[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, - iv[] = {20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, - plaintext[] = - "hello world\nhello world\nhello world\nhello world\nhello world\nhello world\nhello world\n"; - static const uint8_t expected[] = {0xd3, 0xa8, 0x1d, 0x96, 0x4c, 0x9b, 0x02, 0xd7, 0x9a, 0xb0, 0x41, 0x07, 0x4c, 0x8c, 0xe2, - 0xe0, 0x2e, 0x83, 0x54, 0x52, 0x45, 0xcb, 0xd4, 0x68, 0xc8, 0x43, 0x45, 0xca, 0x91, 0xfb, - 0xa3, 0x7a, 0x67, 0xed, 0xe8, 0xd7, 0x5e, 0xe2, 0x33, 0xd1, 0x3e, 0xbf, 0x50, 0xc2, 0x4b, - 0x86, 0x83, 0x55, 0x11, 0xbb, 0x17, 0x4f, 0xf5, 0x78, 0xb8, 0x65, 0xeb, 0x9a, 0x2b, 0x8f, - 0x77, 0x08, 0xa9, 0x60, 0x17, 0x73, 0xc5, 0x07, 0xf3, 0x04, 0xc9, 0x3f, 0x67, 0x4d, 0x12, - 0xa1, 0x02, 0x93, 0xc2, 0x3c, 0xd3, 0xf8, 0x59, 0x33, 0xd5, 0x01, 0xc3, 0xbb, 0xaa, 0xe6, - 0x3f, 0xbb, 0x23, 0x66, 0x94, 
0x26, 0x28, 0x43, 0xa5, 0xfd, 0x2f}; - ptls_aead_context_t *aead = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 0, key, iv); - uint8_t encrypted[sizeof(plaintext) + 16], decrypted[sizeof(plaintext)]; - ptls_aead_encrypt(aead, encrypted, plaintext, sizeof(plaintext), 0, aad, sizeof(aad)); - ok(memcmp(expected, encrypted, sizeof(plaintext)) == 0); - ok(memcmp(expected + sizeof(plaintext), encrypted + sizeof(plaintext), 16) == 0); - ok(ptls_aead_decrypt(aead, decrypted, encrypted, sizeof(encrypted), 0, aad, sizeof(aad)) == sizeof(plaintext)); - ok(memcmp(decrypted, plaintext, sizeof(plaintext)) == 0); - ptls_aead_free(aead); + ptls_cipher_free(supp->ctx); + free(supp); + ptls_fusion_aesgcm_free(aead); +} + +int main(int argc, char **argv) +{ + if (!ptls_fusion_is_supported_by_cpu()) { + note("CPU does have the necessary features (avx2, aes, pclmul)\n"); + return done_testing(); } + subtest("ecb", ecb); + subtest("gcm-basic", gcm_basic); + subtest("gcm-capacity", gcm_capacity); + subtest("gcm-test-vectors", gcm_test_vectors); + return done_testing(); } From b531bae89d80ed0832e97ef4714959457d43b959 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 11:26:28 +0900 Subject: [PATCH 49/60] run AEAD test vectors using minicrypto --- CMakeLists.txt | 1 + picotls.xcodeproj/project.pbxproj | 10 +++++++ t/fusion.c | 45 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc3cb0045..48c273139 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ IF ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND lib/picotls.c lib/fusion.c t/fusion.c) + TARGET_LINK_LIBRARIES(test-fusion.t picotls-minicrypto) SET_TARGET_PROPERTIES(test-fusion.t PROPERTIES COMPILE_FLAGS "-mavx2 -maes -mpclmul") ADD_DEPENDENCIES(test-fusion.t generate-picotls-probes) SET(TEST_EXES ${TEST_EXES} test-fusion.t) diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index 03bb2238d..dcc38cbe5 100644 --- 
a/picotls.xcodeproj/project.pbxproj +++ b/picotls.xcodeproj/project.pbxproj @@ -91,6 +91,7 @@ 10EACB1A1DCEC2A300CA0341 /* libpicotls-core.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 106530DA1D9B3E6F005B2C60 /* libpicotls-core.a */; }; E949EF282073629300511ECA /* minicrypto-pem.c in Sources */ = {isa = PBXBuildFile; fileRef = E949EF272073629300511ECA /* minicrypto-pem.c */; }; E95E95382290456B00215ACD /* picotls-probes.d in Sources */ = {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; + E973651E246E37300039AA49 /* libpicotls-minicrypto.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 10EACB171DCEAF0F00CA0341 /* libpicotls-minicrypto.a */; }; E97577012212405300D1EF74 /* ffx.h in Headers */ = {isa = PBXBuildFile; fileRef = E97577002212405300D1EF74 /* ffx.h */; }; E97577032212405D00D1EF74 /* ffx.c in Sources */ = {isa = PBXBuildFile; fileRef = E97577022212405D00D1EF74 /* ffx.c */; }; E97577042212407900D1EF74 /* ffx.c in Sources */ = {isa = PBXBuildFile; fileRef = E97577022212405D00D1EF74 /* ffx.c */; }; @@ -378,6 +379,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + E973651E246E37300039AA49 /* libpicotls-minicrypto.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -451,6 +453,7 @@ E992F79920E99A080008154D /* src */, 106530C41D9B1A0E005B2C60 /* t */, 106530B31D9985E0005B2C60 /* Products */, + E973651D246E37300039AA49 /* Frameworks */, ); sourceTree = ""; }; @@ -570,6 +573,13 @@ path = cmake; sourceTree = ""; }; + E973651D246E37300039AA49 /* Frameworks */ = { + isa = PBXGroup; + children = ( + ); + name = Frameworks; + sourceTree = ""; + }; E992F79920E99A080008154D /* src */ = { isa = PBXGroup; children = ( diff --git a/t/fusion.c b/t/fusion.c index 8adc21bae..5694ee09d 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -23,6 +23,7 @@ #include #include #include "picotls/fusion.h" +#include "picotls/minicrypto.h" #include "../deps/picotest/picotest.h" static const char *tostr(const void 
*_p, size_t len) @@ -174,6 +175,49 @@ static void gcm_test_vectors(void) ptls_fusion_aesgcm_free(aead); } +static void test_generated(void) +{ + ptls_cipher_context_t *rand = ptls_cipher_new(&ptls_minicrypto_aes128ctr, 1, zero); + ptls_cipher_init(rand, zero); + + for (int i = 0; i < 10000; ++i) { + /* generate input using RNG */ + uint8_t key[32], iv[12], aadlen, textlen; + uint64_t seq; + ptls_cipher_encrypt(rand, key, zero, sizeof(key)); + ptls_cipher_encrypt(rand, iv, zero, sizeof(iv)); + ptls_cipher_encrypt(rand, &aadlen, zero, sizeof(aadlen)); + ptls_cipher_encrypt(rand, &textlen, zero, sizeof(textlen)); + ptls_cipher_encrypt(rand, &seq, zero, sizeof(seq)); + uint8_t aad[aadlen], text[textlen]; + ptls_cipher_encrypt(rand, aad, zero, sizeof(aad)); + ptls_cipher_encrypt(rand, text, zero, sizeof(text)); + + uint8_t encrypted[textlen + 16], decrypted[textlen]; + memset(encrypted, 0x55, sizeof(encrypted)); + memset(decrypted, 0xcc, sizeof(decrypted)); + + { /* check using fusion */ + ptls_aead_context_t *fusion = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 1, key, iv); + ptls_aead_encrypt(fusion, encrypted, text, textlen, seq, aad, aadlen); + ok(ptls_aead_decrypt(fusion, decrypted, encrypted, textlen + 16, seq, aad, aadlen) == textlen); + ok(memcmp(decrypted, text, textlen) == 0); + ptls_aead_free(fusion); + } + + memset(decrypted, 0xcc, sizeof(decrypted)); + + { /* check that the encrypted text can be decrypted by OpenSSL */ + ptls_aead_context_t *mc = ptls_aead_new_direct(&ptls_minicrypto_aes128gcm, 0, key, iv); + ok(ptls_aead_decrypt(mc, decrypted, encrypted, textlen + 16, seq, aad, aadlen) == textlen); + ok(memcmp(decrypted, text, textlen) == 0); + ptls_aead_free(mc); + } + } + + ptls_cipher_free(rand); +} + int main(int argc, char **argv) { if (!ptls_fusion_is_supported_by_cpu()) { @@ -185,6 +229,7 @@ int main(int argc, char **argv) subtest("gcm-basic", gcm_basic); subtest("gcm-capacity", gcm_capacity); subtest("gcm-test-vectors", gcm_test_vectors); + 
subtest("generated", test_generated); return done_testing(); } From 6b84978c09938af66d26189c0cfe868ef3fb84dc Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Fri, 15 May 2020 11:44:21 +0900 Subject: [PATCH 50/60] expose picotls identifiers for fusion-aes256, add test --- include/picotls/fusion.h | 4 ++-- lib/fusion.c | 44 +++++++++++++++++++++++++++++++++++----- t/fusion.c | 21 +++++++++++++++---- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index 85cdc0db0..c11de99c8 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -80,8 +80,8 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr, const void *aad, size_t aadlen, const void *tag); -extern ptls_cipher_algorithm_t ptls_fusion_aes128ctr; -extern ptls_aead_algorithm_t ptls_fusion_aes128gcm; +extern ptls_cipher_algorithm_t ptls_fusion_aes128ctr, ptls_fusion_aes256ctr; +extern ptls_aead_algorithm_t ptls_fusion_aes128gcm, ptls_fusion_aes256gcm; /** * Returns a boolean indicating if fusion can be used. 
diff --git a/lib/fusion.c b/lib/fusion.c index 8be2bcb88..adff45e3d 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -786,19 +786,29 @@ static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void } } -static int aes128ctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key) +static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size) { struct ctr_context *ctx = (struct ctr_context *)_ctx; ctx->super.do_dispose = ctr_dispose; ctx->super.do_init = ctr_init; ctx->super.do_transform = ctr_transform; - ptls_fusion_aesecb_init(&ctx->fusion, 1, key, PTLS_AES128_KEY_SIZE); + ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size); ctx->is_ready = 0; return 0; } +static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key) +{ + return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE); +} + +static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key) +{ + return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE); +} + static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; @@ -855,7 +865,7 @@ static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const voi return enclen; } -static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv) +static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size) { struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; @@ -869,18 +879,34 @@ static int aes128gcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *ke ctx->super.do_encrypt = aead_do_encrypt; ctx->super.do_decrypt = aead_do_decrypt; - ctx->aesgcm = ptls_fusion_aesgcm_new(key, PTLS_AES128_KEY_SIZE, - 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ + ctx->aesgcm = + ptls_fusion_aesgcm_new(key, key_size, 1500); /* FIXME use realloc with exponential 
back-off to support arbitrary size */ return 0; } +static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) +{ + return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE); +} + +static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) +{ + return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE); +} + ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR", PTLS_AES128_KEY_SIZE, 1, // block size PTLS_AES_IV_SIZE, sizeof(struct ctr_context), aes128ctr_setup}; +ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR", + PTLS_AES256_KEY_SIZE, + 1, // block size + PTLS_AES_IV_SIZE, + sizeof(struct ctr_context), + aes256ctr_setup}; ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", &ptls_fusion_aes128ctr, NULL, // &ptls_fusion_aes128ecb, @@ -889,6 +915,14 @@ ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", PTLS_AESGCM_TAG_SIZE, sizeof(struct aesgcm_context), aes128gcm_setup}; +ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM", + &ptls_fusion_aes256ctr, + NULL, // &ptls_fusion_aes256ecb, + PTLS_AES256_KEY_SIZE, + PTLS_AESGCM_IV_SIZE, + PTLS_AESGCM_TAG_SIZE, + sizeof(struct aesgcm_context), + aes256gcm_setup}; int ptls_fusion_is_supported_by_cpu(void) { diff --git a/t/fusion.c b/t/fusion.c index 5694ee09d..7b7a2ddda 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -175,7 +175,7 @@ static void gcm_test_vectors(void) ptls_fusion_aesgcm_free(aead); } -static void test_generated(void) +static void test_generated(int aes256) { ptls_cipher_context_t *rand = ptls_cipher_new(&ptls_minicrypto_aes128ctr, 1, zero); ptls_cipher_init(rand, zero); @@ -198,7 +198,8 @@ static void test_generated(void) memset(decrypted, 0xcc, sizeof(decrypted)); { /* check using fusion */ - ptls_aead_context_t *fusion = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 1, key, iv); + ptls_aead_context_t *fusion = + ptls_aead_new_direct(aes256 ? 
&ptls_fusion_aes256gcm : &ptls_fusion_aes128gcm, 1, key, iv); ptls_aead_encrypt(fusion, encrypted, text, textlen, seq, aad, aadlen); ok(ptls_aead_decrypt(fusion, decrypted, encrypted, textlen + 16, seq, aad, aadlen) == textlen); ok(memcmp(decrypted, text, textlen) == 0); @@ -208,7 +209,8 @@ static void test_generated(void) memset(decrypted, 0xcc, sizeof(decrypted)); { /* check that the encrypted text can be decrypted by OpenSSL */ - ptls_aead_context_t *mc = ptls_aead_new_direct(&ptls_minicrypto_aes128gcm, 0, key, iv); + ptls_aead_context_t *mc = + ptls_aead_new_direct(aes256 ? &ptls_minicrypto_aes256gcm : &ptls_minicrypto_aes128gcm, 0, key, iv); ok(ptls_aead_decrypt(mc, decrypted, encrypted, textlen + 16, seq, aad, aadlen) == textlen); ok(memcmp(decrypted, text, textlen) == 0); ptls_aead_free(mc); @@ -218,6 +220,16 @@ static void test_generated(void) ptls_cipher_free(rand); } +static void test_generated_aes128(void) +{ + test_generated(0); +} + +static void test_generated_aes256(void) +{ + test_generated(1); +} + int main(int argc, char **argv) { if (!ptls_fusion_is_supported_by_cpu()) { @@ -229,7 +241,8 @@ int main(int argc, char **argv) subtest("gcm-basic", gcm_basic); subtest("gcm-capacity", gcm_capacity); subtest("gcm-test-vectors", gcm_test_vectors); - subtest("generated", test_generated); + subtest("generated-128", test_generated_aes128); + subtest("generated-256", test_generated_aes256); return done_testing(); } From 93dbbda9e090f85ea50326d5554d145c3663420d Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 18 May 2020 13:32:23 +0900 Subject: [PATCH 51/60] lessen the output (for travis) --- t/fusion.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/t/fusion.c b/t/fusion.c index 7b7a2ddda..7df7e3a5e 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -179,8 +179,9 @@ static void test_generated(int aes256) { ptls_cipher_context_t *rand = ptls_cipher_new(&ptls_minicrypto_aes128ctr, 1, zero); ptls_cipher_init(rand, zero); + 
int i; - for (int i = 0; i < 10000; ++i) { + for (i = 0; i < 10000; ++i) { /* generate input using RNG */ uint8_t key[32], iv[12], aadlen, textlen; uint64_t seq; @@ -201,8 +202,10 @@ static void test_generated(int aes256) ptls_aead_context_t *fusion = ptls_aead_new_direct(aes256 ? &ptls_fusion_aes256gcm : &ptls_fusion_aes128gcm, 1, key, iv); ptls_aead_encrypt(fusion, encrypted, text, textlen, seq, aad, aadlen); - ok(ptls_aead_decrypt(fusion, decrypted, encrypted, textlen + 16, seq, aad, aadlen) == textlen); - ok(memcmp(decrypted, text, textlen) == 0); + if (ptls_aead_decrypt(fusion, decrypted, encrypted, textlen + 16, seq, aad, aadlen) != textlen) + goto Fail; + if (memcmp(decrypted, text, textlen) != 0) + goto Fail; ptls_aead_free(fusion); } @@ -211,13 +214,21 @@ static void test_generated(int aes256) { /* check that the encrypted text can be decrypted by OpenSSL */ ptls_aead_context_t *mc = ptls_aead_new_direct(aes256 ? &ptls_minicrypto_aes256gcm : &ptls_minicrypto_aes128gcm, 0, key, iv); - ok(ptls_aead_decrypt(mc, decrypted, encrypted, textlen + 16, seq, aad, aadlen) == textlen); - ok(memcmp(decrypted, text, textlen) == 0); + if (ptls_aead_decrypt(mc, decrypted, encrypted, textlen + 16, seq, aad, aadlen) != textlen) + goto Fail; + if (memcmp(decrypted, text, textlen) != 0) + goto Fail; ptls_aead_free(mc); } } + ok(1); ptls_cipher_free(rand); + return; + +Fail: + note("mismatch at index=%d", i); + ok(0); } static void test_generated_aes128(void) From 7fd7c8431b7cb781304b57964dfac4aa6926ebd2 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 18 May 2020 14:04:42 +0900 Subject: [PATCH 52/60] auto-expand --- include/picotls/fusion.h | 8 ++++-- lib/fusion.c | 62 ++++++++++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/include/picotls/fusion.h b/include/picotls/fusion.h index c11de99c8..332dd93ab 100644 --- a/include/picotls/fusion.h +++ b/include/picotls/fusion.h @@ -47,9 +47,13 @@ void 
ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, co /** * Creates an AES-GCM context. * @param key the AES key (128 bits) - * @param max_size maximum size of the record (i.e. AAD + encrypted payload) + * @param capacity maximum size of AEAD record (i.e. AAD + encrypted payload) */ -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t max_size); +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity); +/** + * Updates the capacity. + */ +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity); /** * Destroys an AES-GCM context. */ diff --git a/lib/fusion.c b/lib/fusion.c index adff45e3d..1bd40b93d 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -49,6 +49,7 @@ struct ptls_fusion_aesgcm_context { ptls_fusion_aesecb_context_t ecb; + size_t capacity; size_t ghash_cnt; struct ptls_fusion_aesgcm_ghash_precompute { __m128i H; @@ -722,29 +723,62 @@ void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, co _mm_storeu_si128(dst, v); } -ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t max_size) +/** + * returns the number of ghash entries that is required to handle an AEAD block of given size + */ +static size_t aesgcm_calc_ghash_cnt(size_t capacity) +{ + // round-up by block size, add to handle worst split of the size between AAD and payload, plus context to hash AC + return (capacity + 15) / 16 + 2; +} + +static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx) +{ + if (ctx->ghash_cnt != 0) + ctx->ghash[ctx->ghash_cnt].H = gfmul(ctx->ghash[ctx->ghash_cnt - 1].H, ctx->ghash[0].H); + + __m128i r = _mm_shuffle_epi32(ctx->ghash[ctx->ghash_cnt].H, 78); + r = _mm_xor_si128(r, ctx->ghash[ctx->ghash_cnt].H); + ctx->ghash[ctx->ghash_cnt].r = r; + + ++ctx->ghash_cnt; +} + +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void 
*key, size_t key_size, size_t capacity) { ptls_fusion_aesgcm_context_t *ctx; - size_t ghash_cnt = (max_size + 15) / 16 + 2; // round-up by block size, add to handle worst split of the size between AAD and - // payload, plus context to hash AC + size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity); if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) return NULL; ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size); - ctx->ghash_cnt = ghash_cnt; + ctx->capacity = capacity; + ctx->ghash[0].H = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128()); ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8); - ctx->ghash[0].H = transformH(ctx->ghash[0].H); - for (int i = 1; i < ghash_cnt; ++i) - ctx->ghash[i].H = gfmul(ctx->ghash[i - 1].H, ctx->ghash[0].H); - for (int i = 0; i < ghash_cnt; ++i) { - __m128i r = _mm_shuffle_epi32(ctx->ghash[i].H, 78); - r = _mm_xor_si128(r, ctx->ghash[i].H); - ctx->ghash[i].r = r; - } + ctx->ghash_cnt = 0; + while (ctx->ghash_cnt < ghash_cnt) + setup_one_ghash_entry(ctx); + + return ctx; +} + +ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity) +{ + size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity); + + if (ghash_cnt <= ctx->ghash_cnt) + return ctx; + + if ((ctx = realloc(ctx, sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL) + return NULL; + + ctx->capacity = capacity; + while (ghash_cnt < ctx->ghash_cnt) + setup_one_ghash_entry(ctx); return ctx; } @@ -847,6 +881,8 @@ void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const vo { struct aesgcm_context *ctx = (void *)_ctx; + if (inlen + aadlen > ctx->aesgcm->capacity) + ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen); ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp); } @@ -859,6 +895,8 @@ static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const voi return SIZE_MAX; size_t enclen 
= inlen - 16; + if (enclen + aadlen > ctx->aesgcm->capacity) + ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aadlen); if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen, (const uint8_t *)input + enclen)) return SIZE_MAX; From f950d652f9240f46b11291e225ddd071a99ef97d Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 18 May 2020 14:04:42 +0900 Subject: [PATCH 53/60] remove obsolete FIXME --- lib/fusion.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 1bd40b93d..eaa45e191 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -917,8 +917,7 @@ static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, ctx->super.do_encrypt = aead_do_encrypt; ctx->super.do_decrypt = aead_do_decrypt; - ctx->aesgcm = - ptls_fusion_aesgcm_new(key, key_size, 1500); /* FIXME use realloc with exponential back-off to support arbitrary size */ + ctx->aesgcm = ptls_fusion_aesgcm_new(key, key_size, 1500 /* assume ordinary packet size */); return 0; } From ea21c50c79aeefbbf9a64a83eb0fb886ca1fc3aa Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 18 May 2020 16:25:53 +0900 Subject: [PATCH 54/60] reduce redundancy --- lib/fusion.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index eaa45e191..a4535fdc2 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -349,17 +349,13 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, if (srclen != 0) { #define APPLY(i) \ do { \ - if (srclen >= 16) { \ + if (PTLS_LIKELY(srclen >= 16)) { \ _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \ srclen -= 16; \ + } else if (PTLS_LIKELY(srclen != 0)) { \ + bits0 = bits##i; \ + goto ApplyRemainder; \ } else { \ - if (srclen != 0) { \ - /* While it is possible to use _mm_storeu_si128 here, as there is space to store GCM tag, writing 
byte-per-byte \ - * seems to be faster on 9th gen Core. */ \ - storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits##i)); \ - dst = (__m128i *)((uint8_t *)dst + srclen); \ - srclen = 0; \ - } \ goto ApplyEnd; \ } \ } while (0) @@ -369,8 +365,13 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, APPLY(3); APPLY(4); APPLY(5); - ApplyEnd:; #undef APPLY + goto ApplyEnd; + ApplyRemainder: + storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits0)); + dst = (__m128i *)((uint8_t *)dst + srclen); + srclen = 0; + ApplyEnd:; } } @@ -613,12 +614,7 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \ src_aeslen -= 16; \ } else { \ - if (src_aeslen == 16) { \ - _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits##i)); \ - } else if (src_aeslen != 0) { \ - storen(dst, src_aeslen, _mm_xor_si128(loadn(src_aes, src_aeslen), bits##i)); \ - } \ - src_aeslen = 0; \ + bits0 = bits##i; \ goto Finish; \ } APPLY(0); @@ -634,6 +630,12 @@ int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } Finish: + if (src_aeslen == 16) { + _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0)); + } else if (src_aeslen != 0) { + storen(dst, src_aeslen, _mm_xor_si128(loadn(src_aes, src_aeslen), bits0)); + } + assert((state & STATE_IS_FIRST_RUN) == 0); /* the only case where AES operation is complete and GHASH is not is when the application of AC is remaining */ From d8dc699d6793876a71be31b5dc07bb41c3bca104 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Tue, 19 May 2020 14:02:21 +0900 Subject: [PATCH 55/60] run GHASH of AAD and first AES permutation in parallel --- lib/fusion.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index a4535fdc2..514e03d8a 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ 
-290,9 +290,6 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, __m128i gdatabuf[6]; __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), bswap8); - const __m128i *gdata; // points to the elements fed into GHASH - size_t gdata_cnt; - // src and dst are updated after the chunk is processed const __m128i *src = input; __m128i *dst = output; @@ -314,16 +311,40 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, ctr = _mm_insert_epi32(ctr, 1, 0); ek0 = _mm_shuffle_epi8(ctr, bswap8); - { /* prepare the first bit stream */ - size_t i; - AESECB6_INIT(); - for (i = 1; i < ctx->ecb.rounds; ++i) - AESECB6_UPDATE(i); - AESECB6_FINAL(i); + /* start preparing AES */ + AESECB6_INIT(); + AESECB6_UPDATE(1); + + /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */ + const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH + size_t gdata_cnt = 0; + if (PTLS_LIKELY(aadlen != 0)) { + while (gdata_cnt < 6) { + if (PTLS_LIKELY(aadlen < 16)) { + if (aadlen != 0) { + gdatabuf[gdata_cnt++] = loadn(aad, aadlen); + aadlen = 0; + } + goto MainLoop; + } + gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); + aadlen -= 16; + } } /* the main loop */ +MainLoop: while (1) { + /* run AES and multiplication in parallel */ + size_t i; + for (i = 2; i < gdata_cnt + 2; ++i) { + AESECB6_UPDATE(i); + gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); + } + for (; i < ctx->ecb.rounds; ++i) + AESECB6_UPDATE(i); + AESECB6_FINAL(i); + /* apply the bit stream to src and write to dest */ if (PTLS_LIKELY(srclen >= 6 * 16)) { #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i)) @@ -418,16 +439,6 @@ void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, } gdata = gdatabuf; } - - /* run AES and multiplication in parallel */ - size_t i; - for (i = 2; i 
<= 7; ++i) { - AESECB6_UPDATE(i); - gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); - } - for (; i < ctx->ecb.rounds; ++i) - AESECB6_UPDATE(i); - AESECB6_FINAL(i); } Finish: From 122dd009c2e9251ad0dc81ebf8b34996f23e2b8f Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sun, 17 May 2020 17:50:47 +0900 Subject: [PATCH 56/60] add test for loadn --- CMakeLists.txt | 1 - picotls.xcodeproj/project.pbxproj | 2 -- t/fusion.c | 23 +++++++++++++++++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48c273139..0b9429bd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,7 +142,6 @@ IF ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND ADD_EXECUTABLE(test-fusion.t deps/picotest/picotest.c lib/picotls.c - lib/fusion.c t/fusion.c) TARGET_LINK_LIBRARIES(test-fusion.t picotls-minicrypto) SET_TARGET_PROPERTIES(test-fusion.t PROPERTIES COMPILE_FLAGS "-mavx2 -maes -mpclmul") diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index dcc38cbe5..220041929 100644 --- a/picotls.xcodeproj/project.pbxproj +++ b/picotls.xcodeproj/project.pbxproj @@ -117,7 +117,6 @@ E9B43DE324619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE424619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE524619E1600824E51 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; - E9B43DE724652D2000824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DBF24619D1700824E51 /* fusion.c */; }; E9B43E0C24689E8900824E51 /* fusionbench.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43E0B24689E8900824E51 /* fusionbench.c */; }; E9B43E0D24689EDA00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DBF24619D1700824E51 /* fusion.c */; }; E9B43E0E24689F9500824E51 /* picotls-probes.d in Sources */ 
= {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; @@ -1009,7 +1008,6 @@ files = ( E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */, E9B43E1024689FE700824E51 /* picotls.c in Sources */, - E9B43DE724652D2000824E51 /* fusion.c in Sources */, E9B43DE424619D7E00824E51 /* fusion.c in Sources */, E9B43DC824619D5100824E51 /* picotest.c in Sources */, ); diff --git a/t/fusion.c b/t/fusion.c index 7df7e3a5e..c51e88e00 100644 --- a/t/fusion.c +++ b/t/fusion.c @@ -25,6 +25,7 @@ #include "picotls/fusion.h" #include "picotls/minicrypto.h" #include "../deps/picotest/picotest.h" +#include "../lib/fusion.c" static const char *tostr(const void *_p, size_t len) { @@ -47,9 +48,26 @@ static const char *tostr(const void *_p, size_t len) return buf; } +static void test_loadn(void) +{ + uint8_t buf[8192] = {}; + + for (size_t off = 0; off < 8192 - 15; ++off) { + uint8_t *src = buf + off; + memcpy(src, "hello world12345", 16); + __m128i v = loadn(src, 11); + if (memcmp(&v, "hello world\0\0\0\0\0", 16) != 0) { + ok(!"fail"); + return; + } + memset(src, 0, 11); + } + ok(!!"success"); +} + static const uint8_t zero[16384] = {}; -static void ecb(void) +static void test_ecb(void) { ptls_fusion_aesecb_context_t ecb; uint8_t encrypted[16]; @@ -248,7 +266,8 @@ int main(int argc, char **argv) return done_testing(); } - subtest("ecb", ecb); + subtest("loadn", test_loadn); + subtest("ecb", test_ecb); subtest("gcm-basic", gcm_basic); subtest("gcm-capacity", gcm_capacity); subtest("gcm-test-vectors", gcm_test_vectors); From eeff1640b7ce1052d3ada19aa1b06518bcde7957 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 19 May 2020 14:46:39 +0900 Subject: [PATCH 57/60] use pshufb when avoiding cross-page-boundary load --- lib/fusion.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/fusion.c b/lib/fusion.c index 514e03d8a..43645c9f6 100644 --- a/lib/fusion.c +++ b/lib/fusion.c @@ -195,20 +195,27 @@ 
static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i return v; } -static inline __m128i loadn(const void *_p, size_t l) +static const uint8_t loadn_mask[31] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; +static const uint8_t loadn_shuffle[31] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // first 16 bytes map to byte offsets + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // latter 15 bytes map to zero + +static inline __m128i loadn(const void *p, size_t l) { - /* FIXME is this optimal? */ - if (PTLS_LIKELY(((uintptr_t)_p % 4096) <= 4080)) { - static const uint8_t mask[31] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - return _mm_and_si128(_mm_loadu_si128(_p), _mm_loadu_si128((__m128i *)(mask + 16 - l))); + __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 16 - l)); + uintptr_t mod4k = (uintptr_t)p % 4096; + + if (PTLS_LIKELY(mod4k <= 4080) || mod4k + l > 4096) { + v = _mm_loadu_si128(p); } else { - const uint8_t *p = _p; - uint8_t buf[16] = {}; - for (size_t i = 0; i != l; ++i) - buf[i] = p[i]; - return *(__m128i *)buf; + uintptr_t shift = (uintptr_t)p & 15; + __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift)); + v = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern); } + v = _mm_and_si128(v, mask); + return v; } static inline void storen(void *_p, size_t l, __m128i v) From ae2aedaa152f9a012948a474ee67a0454290a510 Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sun, 14 Jun 2020 15:13:18 +0900 Subject: [PATCH 58/60] at the internal API-level, preserve the capability of setting IV --- lib/fusion.c | 2 ++ lib/openssl.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/fusion.c b/lib/fusion.c index 43645c9f6..3af2bd175 100644 --- 
a/lib/fusion.c +++ b/lib/fusion.c @@ -929,6 +929,8 @@ static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, ctx->static_iv = loadn(iv, PTLS_AESGCM_IV_SIZE); ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, bswap8); + if (key == NULL) + return 0; ctx->super.dispose_crypto = aesgcm_dispose_crypto; ctx->super.do_encrypt_init = aead_do_encrypt_init; diff --git a/lib/openssl.c b/lib/openssl.c index 4bf92aecf..d907346cf 100644 --- a/lib/openssl.c +++ b/lib/openssl.c @@ -859,6 +859,10 @@ static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void * struct aead_crypto_context_t *ctx = (struct aead_crypto_context_t *)_ctx; int ret; + memcpy(ctx->static_iv, iv, ctx->super.algo->iv_size); + if (key == NULL) + return 0; + ctx->super.dispose_crypto = aead_dispose_crypto; if (is_enc) { ctx->super.do_encrypt_init = aead_do_encrypt_init; @@ -893,7 +897,6 @@ static int aead_setup_crypto(ptls_aead_context_t *_ctx, int is_enc, const void * ret = PTLS_ERROR_LIBRARY; goto Error; } - memcpy(ctx->static_iv, iv, ctx->super.algo->iv_size); return 0; From 5e8d4e3f3aa3963507f9651935da727be895ca0f Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Sun, 14 Jun 2020 15:30:52 +0900 Subject: [PATCH 59/60] t/fusion.c not used by picotls-core --- picotls.xcodeproj/project.pbxproj | 2 -- 1 file changed, 2 deletions(-) diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index 220041929..20bae446a 100644 --- a/picotls.xcodeproj/project.pbxproj +++ b/picotls.xcodeproj/project.pbxproj @@ -114,7 +114,6 @@ E99B75E51F5CE64E00CF503E /* pembase64.c in Sources */ = {isa = PBXBuildFile; fileRef = E99B75DF1F5CDDB500CF503E /* pembase64.c */; }; E9B43DC224619D5100824E51 /* picotls-probes.d in Sources */ = {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; E9B43DC824619D5100824E51 /* picotest.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E31D9B4021005B2C60 /* picotest.c */; }; - 
E9B43DE324619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE424619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE524619E1600824E51 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; E9B43E0C24689E8900824E51 /* fusionbench.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43E0B24689E8900824E51 /* fusionbench.c */; }; @@ -947,7 +946,6 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - E9B43DE324619D7E00824E51 /* fusion.c in Sources */, E99B75E01F5CDDB500CF503E /* asn1.c in Sources */, E99B75E11F5CDDB500CF503E /* pembase64.c in Sources */, E95E95382290456B00215ACD /* picotls-probes.d in Sources */, From 2ab530c5517615fd033cd5a2bd4753b2c1b95dfd Mon Sep 17 00:00:00 2001 From: Kazuho Oku Date: Mon, 15 Jun 2020 08:20:45 +0900 Subject: [PATCH 60/60] move fusionbench.c out from picotls; it's now available at https://gist.github.com/kazuho/1ba0bc3a447f5ab7008abdbf9bea9e6b --- picotls.xcodeproj/project.pbxproj | 82 ------------------------------- t/fusionbench.c | 75 ---------------------------- 2 files changed, 157 deletions(-) delete mode 100644 t/fusionbench.c diff --git a/picotls.xcodeproj/project.pbxproj b/picotls.xcodeproj/project.pbxproj index 20bae446a..3fe64e924 100644 --- a/picotls.xcodeproj/project.pbxproj +++ b/picotls.xcodeproj/project.pbxproj @@ -116,10 +116,6 @@ E9B43DC824619D5100824E51 /* picotest.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530E31D9B4021005B2C60 /* picotest.c */; }; E9B43DE424619D7E00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DE224619D7E00824E51 /* fusion.c */; }; E9B43DE524619E1600824E51 /* minicrypto.c in Sources */ = {isa = PBXBuildFile; fileRef = 1059003D1DC8D4E300FB4085 /* minicrypto.c */; }; - E9B43E0C24689E8900824E51 /* fusionbench.c in Sources */ = {isa = PBXBuildFile; 
fileRef = E9B43E0B24689E8900824E51 /* fusionbench.c */; }; - E9B43E0D24689EDA00824E51 /* fusion.c in Sources */ = {isa = PBXBuildFile; fileRef = E9B43DBF24619D1700824E51 /* fusion.c */; }; - E9B43E0E24689F9500824E51 /* picotls-probes.d in Sources */ = {isa = PBXBuildFile; fileRef = E95EBCC0227B71170022C32D /* picotls-probes.d */; }; - E9B43E0F24689FAC00824E51 /* picotls.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530BF1D998641005B2C60 /* picotls.c */; }; E9B43E1024689FE700824E51 /* picotls.c in Sources */ = {isa = PBXBuildFile; fileRef = 106530BF1D998641005B2C60 /* picotls.c */; }; E9BC76CF1EF3A35E00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; E9BC76D21EF3A36A00EB7A09 /* chacha20.c in Sources */ = {isa = PBXBuildFile; fileRef = E9BC76C61EF3A2F700EB7A09 /* chacha20.c */; }; @@ -223,15 +219,6 @@ ); runOnlyForDeploymentPostprocessing = 1; }; - E9B43E0624689E5900824E51 /* CopyFiles */ = { - isa = PBXCopyFilesBuildPhase; - buildActionMask = 2147483647; - dstPath = /usr/share/man/man1/; - dstSubfolderSpec = 0; - files = ( - ); - runOnlyForDeploymentPostprocessing = 1; - }; /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ @@ -300,8 +287,6 @@ E9B43DE124619D5100824E51 /* test-fusion */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "test-fusion"; sourceTree = BUILT_PRODUCTS_DIR; }; E9B43DE224619D7E00824E51 /* fusion.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = fusion.c; sourceTree = ""; }; E9B43DE62461A06800824E51 /* fusion.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = fusion.h; sourceTree = ""; }; - E9B43E0A24689E5900824E51 /* fusionbench */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = fusionbench; sourceTree = BUILT_PRODUCTS_DIR; }; - E9B43E0B24689E8900824E51 /* fusionbench.c */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fusionbench.c; sourceTree = ""; }; E9BC76C61EF3A2F700EB7A09 /* chacha20.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = chacha20.c; path = src/chacha20.c; sourceTree = ""; }; E9BC76CC1EF3A31000EB7A09 /* salsa20.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = salsa20.h; path = src/salsa20.h; sourceTree = ""; }; E9BC76D61EF3C1C200EB7A09 /* poly1305.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = poly1305.c; path = src/poly1305.c; sourceTree = ""; }; @@ -381,13 +366,6 @@ ); runOnlyForDeploymentPostprocessing = 0; }; - E9B43E0524689E5900824E51 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ @@ -466,7 +444,6 @@ 10EACB171DCEAF0F00CA0341 /* libpicotls-minicrypto.a */, E992F7A920E99A7C0008154D /* picotls-esni */, E9B43DE124619D5100824E51 /* test-fusion */, - E9B43E0A24689E5900824E51 /* fusionbench */, ); name = Products; sourceTree = ""; @@ -504,7 +481,6 @@ 106530FE1DAD8A3C005B2C60 /* cli.c */, E97577072213148800D1EF74 /* e2e.t */, E9B43DE224619D7E00824E51 /* fusion.c */, - E9B43E0B24689E8900824E51 /* fusionbench.c */, 106530E91D9B7C13005B2C60 /* picotls.c */, 1059003D1DC8D4E300FB4085 /* minicrypto.c */, 106530C51D9B1A98005B2C60 /* openssl.c */, @@ -805,23 +781,6 @@ productReference = E9B43DE124619D5100824E51 /* test-fusion */; productType = "com.apple.product-type.tool"; }; - E9B43DE824689E5900824E51 /* fusionbench */ = { - isa = PBXNativeTarget; - buildConfigurationList = E9B43E0724689E5900824E51 /* Build configuration list for PBXNativeTarget "fusionbench" */; - buildPhases = ( - E9B43DE924689E5900824E51 /* Sources */, - E9B43E0524689E5900824E51 /* Frameworks */, - E9B43E0624689E5900824E51 
/* CopyFiles */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = fusionbench; - productName = "test-crypto-openssl"; - productReference = E9B43E0A24689E5900824E51 /* fusionbench */; - productType = "com.apple.product-type.tool"; - }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ @@ -860,7 +819,6 @@ 105900411DC8D57000FB4085 /* test-minicrypto */, E992F79B20E99A7C0008154D /* picotls-esni */, E9B43DC024619D5100824E51 /* test-fusion */, - E9B43DE824689E5900824E51 /* fusionbench */, ); }; /* End PBXProject section */ @@ -1011,17 +969,6 @@ ); runOnlyForDeploymentPostprocessing = 0; }; - E9B43DE924689E5900824E51 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - E9B43E0E24689F9500824E51 /* picotls-probes.d in Sources */, - E9B43E0F24689FAC00824E51 /* picotls.c in Sources */, - E9B43E0D24689EDA00824E51 /* fusion.c in Sources */, - E9B43E0C24689E8900824E51 /* fusionbench.c in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; /* End PBXSourcesBuildPhase section */ /* Begin PBXTargetDependency section */ @@ -1355,26 +1302,6 @@ }; name = Release; }; - E9B43E0824689E5900824E51 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)"; - OTHER_CFLAGS = "-march=native"; - OTHER_LDFLAGS = ""; - PRODUCT_NAME = "$(TARGET_NAME)"; - }; - name = Debug; - }; - E9B43E0924689E5900824E51 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)"; - OTHER_CFLAGS = "-march=native"; - OTHER_LDFLAGS = ""; - PRODUCT_NAME = "$(TARGET_NAME)"; - }; - name = Release; - }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ @@ -1459,15 +1386,6 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - E9B43E0724689E5900824E51 /* Build configuration list for PBXNativeTarget "fusionbench" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 
E9B43E0824689E5900824E51 /* Debug */, - E9B43E0924689E5900824E51 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; /* End XCConfigurationList section */ }; rootObject = 106530AA1D9985E0005B2C60 /* Project object */; diff --git a/t/fusionbench.c b/t/fusionbench.c deleted file mode 100644 index 0599fefe0..000000000 --- a/t/fusionbench.c +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include -#include -#include "picotls/fusion.h" - -int main(int argc, char **argv) -{ - static const uint8_t key[16] = {}, aad[13] = {}; - size_t textlen = 16384; - ptls_aead_supplementary_encryption_t *supp = NULL; - int ch, decrypt = 0, count = 1000000; - - while ((ch = getopt(argc, argv, "b:dn:sh")) != -1) { - switch (ch) { - case 'b': - if (sscanf(optarg, "%zu", &textlen) != 1) { - fprintf(stderr, "failed to parse the number of bytes given by `-b`\n"); - exit(1); - } - break; - case 'd': - decrypt = 1; - break; - case 'n': - if (sscanf(optarg, "%d", &count) != 1) { - fprintf(stderr, "failed to parse the number given by `-n`\n"); - exit(1); - } - break; - case 's': { - static const uint8_t k[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - supp = malloc(sizeof(*supp)); - supp->ctx = ptls_cipher_new(&ptls_fusion_aes128ctr, 1, k); - } break; - default: - printf("Usage: %s -b -s\n" - "Options:\n" - " -b specifies the size of the AEAD payload\n" - " -d test decryption\n" - " -n number of iterations\n" - " -s if set, runs the benchmark with supplemental vector\n", - argv[0]); - return 0; - } - } - argc -= optind; - argv += optind; - - uint8_t *text = malloc(textlen + 16); - memset(text, 0, textlen + 16); - if (supp != NULL) - supp->input = textlen >= 2 ? 
text + 2 : text + textlen; - - ptls_fusion_aesgcm_context_t *ctx = ptls_fusion_aesgcm_new(key, sizeof(key), sizeof(aad) + textlen); - - if (!decrypt) { - for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_encrypt(ctx, text, text, textlen, _mm_setzero_si128(), aad, sizeof(aad), supp); - } else { - uint8_t tag[16] = {}; - for (int i = 0; i < count; ++i) - ptls_fusion_aesgcm_decrypt(ctx, text, text, textlen, _mm_setzero_si128(), aad, sizeof(aad), &tag); - } - - for (int i = 0; i < 16; ++i) - printf("%02x", text[i]); - printf("\n"); - for (int i = 0; i < 16; ++i) - printf("%02x", text[textlen + i]); - printf("\n"); - - return 0; -}