diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index fe32c6c968d..47326731afe 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -183,9 +183,12 @@ if(ARCH STREQUAL "aarch64")
       test/trampoline-armv8.${ASM_EXT}
       cipher_extra/chacha20_poly1305_armv8.${ASM_EXT}
-      fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-128.S
-      fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-192.S
-      fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-256.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-128.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-192.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-256.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S
     )
 endif()
diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S
new file mode 100644
index 00000000000..06ef696de17
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S
@@ -0,0 +1,828 @@
+// Copyright (c) 2022, ARM Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+// Author: Hanno Becker
+//
+// This file was derived from the assembly generated from aesv8-gcm-armv8.pl,
+// written by Fangming Fang for the OpenSSL project,
+// and derived from https://github.com/ARM-software/AArch64cryptolib, original
+// author Samuel Lee .
+//
+// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing
+// the logic of the computation. It is meant as the input to manual audits /
+// formal verification, as well as automated micro-optimization such as done
+// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy).
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#include
+
+#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__)
+#if defined(__ELF__)
+#include
+#include
+.arch armv8-a+crypto
+.text
+.globl aes_gcm_dec_kernel_slothy_base_128
+.hidden aes_gcm_dec_kernel_slothy_base_128
+.type aes_gcm_dec_kernel_slothy_base_128,%function
+#elif defined(__APPLE__)
+#if defined(BORINGSSL_PREFIX)
+#include
+#endif
+#include
+.text
+.globl _aes_gcm_dec_kernel_slothy_base_128
+.private_extern _aes_gcm_dec_kernel_slothy_base_128
+#else
+#error Unknown configuration
+#endif
+
+#if __ARM_MAX_ARCH__ >= 8
+
+.align 4
+_aes_gcm_dec_kernel_slothy_base_128:
+aes_gcm_dec_kernel_slothy_base_128:
+        AARCH64_SIGN_LINK_REGISTER
+        stp     x29, x30, [sp, #-128]!
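+        // Arguments, as used by the code below (same register assignment as the
+        // aes_gcm_dec_kernel this file was derived from):
+        //   x0  in     - ciphertext pointer
+        //   x1  len    - input length in bits
+        //   x2  out    - plaintext pointer
+        //   x3  Xi     - current GHASH accumulator, read here and written back on exit
+        //   x4  ivec   - counter block; its 32-bit counter word is updated on exit
+        //   x5  key    - AES key schedule (number of rounds at offset 240)
+        //   x6  Htable - hash keys h1..h4 and the folded values h2k|h1k, h4k|h3k
+        // On return x0 holds the input length in bytes (kept in x15 below).
+        // The 128-byte frame reserved above holds the AAPCS64 callee-saved
+        // registers x19-x24 and d8-d15, saved below.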
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 
- round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + ldr q16, [x6, #16] // load h2k | h1k + ldr q17, [x6, #64] // load h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR 
block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR 
block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES 
block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - 
round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // 
GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + 
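+        // Tail handling: x5 now holds the number of bytes left to process (1..64;
+        // the setup above guarantees at least one byte is left for the tail). The
+        // compares against #48/#32/#16 below select how many full 16-byte blocks
+        // remain; each full block is decrypted, stored, and folded into the GHASH
+        // accumulators v11/v9/v10, with the running tag fed in once via v8 and then
+        // suppressed. The last, possibly partial, block is masked in
+        // Ldec_blocks_less_than_1 and merged with the existing output bytes so they
+        // are not overwritten.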
mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block 
- high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S new file mode 100644 index 00000000000..89468b97e96 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S @@ -0,0 +1,828 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_192 +.hidden aes_gcm_dec_kernel_slothy_base_192 +.type aes_gcm_dec_kernel_slothy_base_192,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_192 +.private_extern _aes_gcm_dec_kernel_slothy_base_192 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +.align 4 +_aes_gcm_dec_kernel_slothy_base_192: +aes_gcm_dec_kernel_slothy_base_192: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr 
q14, [x6, #48] // load h3l | h3h + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES 
block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + ldr q16, [x6, #16] // load h2k | h1k + ldr q17, [x6, #64] // load h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + 
aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + 
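+        // GHASH math used throughout this loop: each ciphertext block is multiplied
+        // by the matching power of H with three pmull/pmull2 products (low, high,
+        // and a Karatsuba-style mid term built from the XORed 64-bit halves),
+        // accumulated in v11/v9/v10. The MODULO steps then fold the 256-bit product
+        // back to 128 bits using the constant 0xc2 shifted into the top byte,
+        // i.e. reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in
+        // bit-reflected form.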
eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // 
LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + 
eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // 
AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store 
result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, 
v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S new file mode 100644 index 00000000000..d3008cd8667 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S @@ -0,0 +1,828 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_256 +.hidden aes_gcm_dec_kernel_slothy_base_256 +.type aes_gcm_dec_kernel_slothy_base_256,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_256 +.private_extern _aes_gcm_dec_kernel_slothy_base_256 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +.align 4 +_aes_gcm_dec_kernel_slothy_base_256: +aes_gcm_dec_kernel_slothy_base_256: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
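+ // The stp above reserved 128 bytes of stack and saved x29/x30;
+ // callee-saved x19-x24 and d8-d15 are stored below before the key schedule and counter are loaded.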
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 
- round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + ldr q16, [x6, #16] // load h2k | h1k + ldr q17, [x6, #64] // load h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR 
block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR 
block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES 
block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - 
round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // 
GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + 
mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block 
- high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
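+// The GNU-stack note marks the stack as non-executable on ELF targets.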
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_basic.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-128.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_basic.S rename to crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-128.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_basic.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-192.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_basic.S rename to crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-192.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_basic.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-256.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_basic.S rename to crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-256.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_basic.S similarity index 79% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_basic.S index d23c2e41840..377083c7000 100644 --- a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_dual_acc.S +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_basic.S @@ -39,17 +39,17 @@ #include .arch armv8-a+crypto .text -.globl aes_gcm_enc_kernel_slothy_base_192 -.hidden aes_gcm_enc_kernel_slothy_base_192 -.type aes_gcm_enc_kernel_slothy_base_192,%function +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function #elif defined(__APPLE__) #if defined(BORINGSSL_PREFIX) #include #endif #include .text -.globl _aes_gcm_enc_kernel_slothy_base_192 -.private_extern _aes_gcm_enc_kernel_slothy_base_192 +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 #else #error Unknown configuration #endif @@ -74,17 +74,11 @@ full_blocks .req x7 remainder .req x9 unroll .req x10 -aes_st0 .req v0 -aes_st0_q .req q0 +aes_st .req v0 +aes_st_q .req q0 -aes_st1 .req v2 -aes_st1_q .req q2 - -res0 .req v0 -res0_q .req q0 - -res1 .req v2 -res1_q .req q2 +res .req v0 +res_q .req q0 ghash_hi .req v9 ghash_lo .req v8 @@ -144,9 +138,6 @@ rk8q .req q26 rk9q .req q27 rk10q .req q28 -rk11q .req q15 -rk12q .req q16 - rk0 .req v18 rk1 .req v19 rk2 .req v20 @@ -159,9 +150,6 @@ rk8 .req v26 rk9 .req v27 rk10 .req v28 -rk11 .req v15 -rk12 .req v16 - plain .req v29 plain_q .req q29 @@ -217,10 +205,14 @@ tag_q .req q11 ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +// Derive number of iterations of unrolled loop and single-block loop .macro prepare_loop_counts mov unroll, #UNROLL + // Number of AES Blocks (16b each) lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each msub remainder, count, unroll, full_blocks .endm @@ -231,17 +223,20 @@ tag_q .req q11 .macro load_iv ldr rtmp_ctr_q, [ivec] - mov constant_temp, #0x100000000 // set up counter increment + // set up counter increment + mov constant_temp, #0x100000000 movi 
rctr_inc.16b, #0x0 fmov rctr_inc.d[1], constant_temp rev32 rtmp_ctr.16b, rtmp_ctr.16b .endm +// Increase AES counter .macro aes_ctr_inc add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s .endm +// Increase AES counter and initialize new AES state .macro next_ctr_init_aes aes_st rev32 \aes_st\().16b, rtmp_ctr.16b aes_ctr_inc @@ -283,14 +278,13 @@ tag_q .req q11 .endm .macro aesr_final aes_st, plain, out - aese \aes_st\().16b, rk11.16b - eor3 \out\().16b, \plain\().16b, rk12.16b, \aes_st\().16b + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b .endm .macro aes_full_block aes_st, input, output next_ctr_init_aes \aes_st - aesr_0_8 \aes_st\(), rk - aesr_9_10 \aes_st\(), rk + aesr_0_8 \aes_st\(), rk aesr_final \aes_st, \input, \output .endm @@ -310,8 +304,6 @@ tag_q .req q11 load_round_key 8 load_round_key 9 load_round_key 10 - load_round_key 11 - load_round_key 12 .endm /********************************************************************/ @@ -408,52 +400,6 @@ tag_q .req q11 /* Macros for GHASH udpate */ /********************************************************************/ -.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - eor \inputa\().16b, \inputa\().16b, tag.16b - - // Low product - pmull ghash_lo.1q, \inputa\().1d, \Ha\().1d - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_hi.1q, \inputa\().2d, \Ha\().2d - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - -.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - - // Low product - pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d - eor ghash_mid.16b, ghash_mid.16b, \inputa\().16b - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - .macro ghash_init_0 input, Hk, Hk_mid, tag rev64 \input\().16b, \input\().16b eor \input\().16b, \input\().16b, \tag\().16b @@ -541,19 +487,19 @@ tag_q .req q11 /********************************************************************/ .align 4 -_aes_gcm_enc_kernel_slothy_base_192: -aes_gcm_enc_kernel_slothy_base_192: +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,_BORINGSSL_function_hit@PAGE add x9, x9, _BORINGSSL_function_hit@PAGEOFF mov w10, #1 - strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel #endif - AARCH64_SIGN_LINK_REGISTER + 
AARCH64_VALID_CALL_TARGET sub sp, sp, #STACK_SIZE -Lenc_preamble_start: +Ldec_preamble_start: save_gprs save_vregs @@ -566,33 +512,37 @@ Lenc_preamble_start: prepare_loop_counts prepare_ghash -Lenc_preamble_end: +Ldec_preamble_end: cbz count, Lloop_unrolled_end Lloop_unrolled_start: + ldr plain_q, [input], #(4*16) - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 plain, Ht4, Ht34, tag ldr plain_q, [input, #(-3*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] - load_htable_34 - ghash_init_pair res0, res1, Ht4, Ht3, Ht34 + ghash_acc_0 plain, Ht3, Ht34 ldr plain_q, [input, #(-2*16)] - aes_full_block aes_st0, plain, res0 - str res0_q, [output, #(-2*16)] - - ldr plain_q, [input, #(-1*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] load_htable_12 - ghash_acc_pair res0, res1, Ht2, Ht1, Ht12 + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] + ghash_acc_0 plain, Ht1, Ht12 ghash_finalize tag sub count, count, #1 @@ -605,10 +555,10 @@ Lloop_unrolled_end: Lloop_1x_start: ldr plain_q, [input], #16 - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #16 - ghash_init_0 res0, Ht1, Ht12, tag + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 plain, Ht1, Ht12, tag ghash_finalize tag sub remainder, remainder, #1 @@ -627,10 +577,9 @@ Lloop_1x_end: restore_vregs restore_gprs -Lenc_postamble_end: +Ldec_postamble_end: add sp, sp, #STACK_SIZE - AARCH64_VALIDATE_LINK_REGISTER ret #endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_keep_htable.S new file mode 100644 index 00000000000..250722b10ff --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_keep_htable.S @@ -0,0 +1,587 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. 
It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st0 .req v0 +aes_st0_q .req q0 +aes_st1 .req v1 +aes_st1_q .req q1 +aes_st2 .req v2 +aes_st2_q .req q2 +aes_st3 .req v3 +aes_st3_q .req q3 + +res0 .req v0 +res0_q .req q0 +res1 .req v1 +res1_q .req q1 +res2 .req v2 +res2_q .req q2 +res3 .req v3 +res3_q .req q3 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +Ht5q .req Ht3q +Ht6q .req Ht4q +Ht56q .req Ht34q + +Ht5 .req Ht3 +Ht6 .req Ht4 +Ht56 .req Ht34 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro 
restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + mov constant_temp, #0x100000000 // set up counter increment + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +.macro next_ctr_init_aes_st aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_x4 data0, data1, data2, data3, key + aesr \data0\(), \key\() + aesr \data1\(), \key\() + aesr \data2\(), \key\() + aesr \data3\(), \key\() +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_0_8_x4 data0, data1, data2, data3, key + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()0.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()1.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()2.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()3.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()4.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()5.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()6.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()7.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro load_round_key i + ldr rk\()\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + 
ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro prepare_loop_counts + mov unroll, #UNROLL + lsr full_blocks, byte_len, #4 + udiv count, full_blocks, unroll 
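+ // Tail-loop iterations: msub computes full_blocks - count*unroll, i.e. full_blocks mod UNROLL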
+ msub remainder, count, unroll, full_blocks +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_SIGN_LINK_REGISTER + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + + load_htable_34 + load_htable_12 + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + next_ctr_init_aes_st aes_st0 + next_ctr_init_aes_st aes_st1 + next_ctr_init_aes_st aes_st2 + next_ctr_init_aes_st aes_st3 + + aesr_0_8 aes_st0, rk + aesr_0_8 aes_st1, rk + aesr_0_8 aes_st2, rk + aesr_0_8 aes_st3, rk + + ldr plain_q, [input], #(4*16) + aesr_final aes_st0, plain, res0 + str res0_q, [output], #(4*16) + + ghash_init_1 plain, Ht4, Ht34, tag + + ldr plain_q, [input, #(-3*16)] + aesr_final aes_st1, plain, res1 + str res1_q, [output, #(-3*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input, #(-2*16)] + aesr_final aes_st2, plain, res2 + str res2_q, [output, #(-2*16)] + + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aesr_final aes_st3, plain, res3 + str res3_q, [output, #(-1*16)] + + ghash_acc_0 plain, Ht1, Ht12 + ghash_finalize tag + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + next_ctr_init_aes_st aes_st0 + aesr_0_8 aes_st0, rk + + ldr plain_q, [input], #16 + aesr_final aes_st0, plain, res0 + str res0_q, [output], #16 + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem2.S new file mode 100644 index 00000000000..2e352527c7b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem2.S @@ -0,0 +1,598 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + 
STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:writes=stack_0 + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:writes=stack_1 + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:writes=stack_2 + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:writes=stack_3 + + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + rev ctr_tmp_w, ctr + str ctr_tmp_w, [sp, #(STACK_BASE_AES_ST + \loc*16 + 12)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc + add ctr, ctr, #1 +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese 
\aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + 
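+ // Karatsuba-style cross term: (input_hi ^ input_lo) multiplied by Hk_mid, which is expected
+ // to hold the XOR of the two 64-bit halves of the corresponding H power. The surplus
+ // hi*hi and lo*lo contributions are cancelled in ghash_finalize (ghash_mid ^= ghash_lo ^ ghash_hi).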
mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 plain, Ht4, Ht34, tag + + ldr plain_q, [input, #(-3*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(-3*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input, #(-2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(-1*16)] + + ghash_acc_0 plain, Ht1, Ht12 + + ghash_finalize tag + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + 
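+ // Note: only the 32-bit big-endian counter word at bytes 12..15 of the IV block was updated
+ // above; the scalar store replaces the commented-out full-vector write-back of rtmp_ctr.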
restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag.S new file mode 100644 index 00000000000..5516f11a5d9 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag.S @@ -0,0 +1,650 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
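+//
+// Variant note (late tag): compared to the preceding x4 decryption variants in this patch, the
+// four blocks of each unrolled iteration are processed in reverse order and the running tag is
+// only folded into the GHASH accumulation of the last block handled (the one multiplied by H^4)
+// via the *_with_tag macros, which shortens the dependency chain on the tag. Counter blocks are
+// built in scalar registers and staged through the stack (see next_ctr_init_aes) rather than
+// kept in a NEON counter register.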
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, 
#(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro 
load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, 
ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + load_htable_12 + + ldr plain_q, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + ghash_init_0 plain, Ht1, Ht12 + + ldr plain_q, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 plain, Ht2, Ht12 + + load_htable_34 + + ldr plain_q, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(4*16) + + ghash_acc_with_tag_1 plain, Ht4, Ht34, tag + ghash_finalize tag + + add ctr, ctr, #UNROLL + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, 
Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_with_tag_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + add ctr, ctr, #1 + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S new file mode 100644 index 00000000000..b8b2e0f480b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S @@ -0,0 +1,633 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
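+//
+// Variant note (late tag, keep htable): same block schedule as the late-tag kernel, but
+// H^1..H^4 and the combined entries Ht12/Ht34 are given dedicated vector registers (q12-q17),
+// so the H-table is loaded once in the preamble instead of being reloaded inside the loops.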
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x16 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of 
iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro 
load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, 
\input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + + load_htable_12 + load_htable_34 + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + ldr plain_q, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + ghash_init_0 plain, Ht1, Ht12 + + ldr plain_q, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(4*16) + + ghash_acc_with_tag_1 plain, Ht4, Ht34, tag + ghash_finalize tag + + add ctr, ctr, #UNROLL + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_with_tag_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + add ctr, ctr, #1 + 
sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_basic.S similarity index 81% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_basic.S index 4a74ddb91e8..d0d57944901 100644 --- a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc_keep_htable.S +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_basic.S @@ -74,17 +74,11 @@ full_blocks .req x7 remainder .req x9 unroll .req x10 -aes_st0 .req v0 -aes_st0_q .req q0 +aes_st .req v0 +aes_st_q .req q0 -aes_st1 .req v2 -aes_st1_q .req q2 - -res0 .req v0 -res0_q .req q0 - -res1 .req v2 -res1_q .req q2 +res .req v0 +res_q .req q0 ghash_hi .req v9 ghash_lo .req v8 @@ -108,13 +102,29 @@ Ht1 .req v12 Ht2 .req v13 Ht12 .req v14 -Ht3q .req q15 -Ht4q .req q16 -Ht34q .req q17 +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q -Ht3 .req v15 -Ht4 .req v16 -Ht34 .req v17 +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 rk0q .req q18 rk1q .req q19 @@ -195,10 +205,14 @@ tag_q .req q11 ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +// Derive number of iterations of unrolled loop and single-block loop .macro prepare_loop_counts mov unroll, #UNROLL + // Number of AES Blocks (16b each) lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each msub remainder, count, unroll, full_blocks .endm @@ -209,17 +223,20 @@ tag_q .req q11 .macro load_iv ldr rtmp_ctr_q, [ivec] - mov constant_temp, #0x100000000 // set up counter increment + // set up counter increment + mov constant_temp, #0x100000000 movi rctr_inc.16b, #0x0 fmov rctr_inc.d[1], constant_temp rev32 rtmp_ctr.16b, rtmp_ctr.16b .endm +// Increase AES counter .macro aes_ctr_inc add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s .endm +// Increase AES counter and initialize new AES state .macro next_ctr_init_aes aes_st rev32 \aes_st\().16b, rtmp_ctr.16b aes_ctr_inc @@ -272,7 +289,7 @@ tag_q .req q11 .endm .macro load_round_key i - ldr rk\()\i\()q, [key, #((\i)*16)] + ldr rk\i\()q, [key, #((\i)*16)] .endm .macro load_round_keys @@ -311,6 +328,22 @@ tag_q .req q11 ldr \dst_q, [Htable, #80] .endm +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + 
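+// The offsets above and below follow a repeating 48-byte pattern per pair of powers:
+// H^(2i-1) at (i-1)*48, the combined entry for the pair at (i-1)*48 + 16, and H^(2i) at
+// (i-1)*48 + 32. The combined entries (H12, H34, H56, H78) are assumed to hold the XORed
+// 64-bit halves used for the Karatsuba-style middle products in the GHASH macros.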
.macro load_h12 dst, dst_q ldr \dst_q, [Htable, #16] .endm @@ -319,13 +352,24 @@ tag_q .req q11 ldr \dst_q, [Htable, #64] .endm +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + .macro load_full_htable load_h1 Ht1, Ht1q load_h2 Ht2, Ht2q load_h3 Ht3, Ht3q load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q load_h12 Ht12, Ht12q load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q .endm .macro load_htable_12 @@ -340,56 +384,22 @@ tag_q .req q11 load_h34 Ht34, Ht34q .endm -/********************************************************************/ -/* Macros for GHASH udpate */ -/********************************************************************/ - -.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - eor \inputa\().16b, \inputa\().16b, tag.16b - - // Low product - pmull ghash_lo.1q, \inputa\().1d, \Ha\().1d - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_hi.1q, \inputa\().2d, \Ha\().2d - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q .endm -.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - - // Low product - pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d - eor ghash_mid.16b, ghash_mid.16b, \inputa\().16b - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q .endm +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + .macro ghash_init_0 input, Hk, Hk_mid, tag rev64 \input\().16b, \input\().16b eor \input\().16b, \input\().16b, \tag\().16b @@ -486,7 +496,7 @@ aes_gcm_enc_kernel_slothy_base_128: strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel #endif - AARCH64_SIGN_LINK_REGISTER + AARCH64_VALID_CALL_TARGET sub sp, sp, #STACK_SIZE Lenc_preamble_start: @@ -499,9 +509,6 @@ Lenc_preamble_start: load_tag load_iv - load_htable_34 - load_htable_12 - prepare_loop_counts prepare_ghash @@ -511,24 +518,30 @@ Lenc_preamble_end: Lloop_unrolled_start: ldr plain_q, [input], #(4*16) - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + 
ghash_init_1 res, Ht4, Ht34, tag ldr plain_q, [input, #(-3*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] - ghash_init_pair res0, res1, Ht4, Ht3, Ht34 + ghash_acc_0 res, Ht3, Ht34 ldr plain_q, [input, #(-2*16)] - aes_full_block aes_st0, plain, res0 - str res0_q, [output, #(-2*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 res, Ht2, Ht12 ldr plain_q, [input, #(-1*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] - ghash_acc_pair res0, res1, Ht2, Ht1, Ht12 + ghash_acc_0 res, Ht1, Ht12 ghash_finalize tag @@ -536,13 +549,15 @@ Lloop_unrolled_start: cbnz count, Lloop_unrolled_start Lloop_unrolled_end: + load_htable_12 + cbz remainder, Lloop_1x_end Lloop_1x_start: ldr plain_q, [input], #16 - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #16 - ghash_init_0 res0, Ht1, Ht12, tag + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 res, Ht1, Ht12, tag ghash_finalize tag @@ -565,7 +580,6 @@ Lloop_1x_end: Lenc_postamble_end: add sp, sp, #STACK_SIZE - AARCH64_VALIDATE_LINK_REGISTER ret #endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-128.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-128.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_ilp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_ilp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_late_tag.S similarity index 100% rename from 
crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_full.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_full.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_full.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_full.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag.S similarity index 100% rename from 
crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_basic.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_basic.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_ilp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_ilp.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_basic.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_basic.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc_late_tag.S new file mode 100644 index 00000000000..f78f8146a43 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc_late_tag.S @@ -0,0 +1,716 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
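+//
+// Register/argument usage, as encoded by the .req aliases below (a summary,
+// not an authoritative interface description): x0 = input, x1 = length in
+// bits, x2 = output, x3 = pointer to the running GHASH tag, x4 = IV/counter
+// block, x5 = AES round keys, x6 = table of precomputed GHASH H powers. On
+// return, x0 holds the number of bytes processed.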
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel_slothy_base_128 +.hidden aes_gcm_enc_kernel_slothy_base_128 +.type aes_gcm_enc_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_enc_kernel_slothy_base_128 +.private_extern _aes_gcm_enc_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st0 .req v0 +aes_st0_q .req q0 +aes_st1 .req v1 +aes_st1_q .req q1 +aes_st2 .req v2 +aes_st2_q .req q2 +aes_st3 .req v3 +aes_st3_q .req q3 +aes_st4 .req v20 +aes_st4_q .req q20 +aes_st5 .req v21 +aes_st5_q .req q21 +aes_st6 .req v22 +aes_st6_q .req q22 +aes_st7 .req v23 +aes_st7_q .req q23 + +res0 .req v4 +res0_q .req q4 +res1 .req v5 +res1_q .req q5 +res2 .req v6 +res2_q .req q6 +res3 .req v24 +res3_q .req q24 +res4 .req v25 +res4_q .req q25 +res5 .req v26 +res5_q .req q26 +res6 .req v27 +res6_q .req q27 +res7 .req v28 +res7_q .req q28 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 + +ghash_hi0 .req ghash_hi +ghash_lo0 .req ghash_lo +ghash_mid0 .req ghash_mid + +ghash_hi1 .req v15 +ghash_lo1 .req v16 +ghash_mid1 .req v17 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q18 //q20 +rk3q .req q19 //q21 +rk4q .req q18 //q22 +rk5q .req q19 //q23 +rk6q .req q18 //q24 +rk7q .req q19 //q25 +rk8q .req q18 //q26 +rk9q .req q19 //q27 +rk10q .req q18 //q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v18 //v20 +rk3 .req v19 //v21 +rk4 .req v18 //v22 +rk5 .req v19 //v23 +rk6 .req v18 //v24 +rk7 .req v19 //v25 +rk8 .req v18 //v26 +rk9 .req v19 //v27 +rk10 .req v18 //v28 + +plain0 .req res0 +plain0_q .req res0_q +plain1 .req res1 +plain1_q .req res1_q +plain2 .req res2 +plain2_q .req res2_q +plain3 .req res3 +plain3_q .req res3_q +plain4 .req res4 +plain4_q .req res4_q +plain5 .req res5 +plain5_q .req res5_q +plain6 .req res6 +plain6_q .req res6_q +plain7 .req res7 +plain7_q .req res7_q + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 8 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, 
d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro prepare_loop_counts + mov unroll, #UNROLL + lsr full_blocks, byte_len, #4 + udiv count, full_blocks, unroll + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + mov constant_temp, #0x100000000 // set up counter increment + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +.macro next_ctr_init_aes aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +.macro next_ctr_init_aes_x8 aes_st + next_ctr_init_aes \aes_st\()0 + next_ctr_init_aes \aes_st\()1 + next_ctr_init_aes \aes_st\()2 + next_ctr_init_aes \aes_st\()3 + next_ctr_init_aes \aes_st\()4 + next_ctr_init_aes \aes_st\()5 + next_ctr_init_aes \aes_st\()6 + next_ctr_init_aes \aes_st\()7 +.endm + +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + + +.macro aesr_x4 st0, st1, st2, st3, rk // @slothy:no-unfold + aesr \st0, \rk + aesr \st1, \rk + aesr \st2, \rk + aesr \st3, \rk +.endm + +.macro aese_x4 st0, st1, st2, st3, rk // @slothy:no-unfold + aese \st0, \rk + aese \st1, \rk + aese \st2, \rk + aese \st3, \rk +.endm + +.macro aesr_x8 i + load_round_key \i + aesr_x4 aes_st0.16b, aes_st1.16b, aes_st2.16b, aes_st3.16b, rk\i\().16b + aesr_x4 aes_st4.16b, aes_st5.16b, aes_st6.16b, aes_st7.16b, rk\i\().16b +.endm + +.macro aese_x8 i + load_round_key \i + aese_x4 aes_st0.16b, aes_st1.16b, aes_st2.16b, aes_st3.16b, rk\i\().16b + aese_x4 aes_st4.16b, aes_st5.16b, aes_st6.16b, aes_st7.16b, rk\i\().16b +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \plain\().16b, rk10.16b, \aes_st\().16b +.endm + +// Load i-th round key +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be 
synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid, i + // Low product + pmull ghash_lo\i\().1q, \inputa\().1d, \Ha\().1d + pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d + eor ghash_lo\i\().16b, ghash_lo\i\().16b, ghash_tmp.16b + // High product + pmull2 ghash_hi\i\().1q, \inputa\().2d, \Ha\().2d + pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d + eor ghash_hi\i\().16b, ghash_hi\i\().16b, ghash_tmp.16b + // Middle product + trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d + trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d + eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b + pmull2 ghash_mid\i\().1q, ghash_tmp.2d, \Hk_mid\().2d + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid\i\().16b, ghash_mid\i\().16b, ghash_tmp.16b +.endm + +.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid, i + // Low product + pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d + eor ghash_lo\i\().16b, ghash_lo\i\().16b, ghash_tmp.16b + pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d + eor ghash_lo\i\().16b, ghash_lo\i\().16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d + eor ghash_hi\i\().16b, ghash_hi\i\().16b, ghash_tmp.16b + pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d + eor ghash_hi\i\().16b, ghash_hi\i\().16b, ghash_tmp.16b + // Middle product + trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d + trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d + eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b + pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid\i\().16b, ghash_mid\i\().16b, \inputa\().16b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid\i\().16b, ghash_mid\i\().16b, ghash_tmp.16b +.endm + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov 
ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro store_0_8 res + stp \res\()0_q, \res\()1_q, [output], #(8*16) + stp \res\()2_q, \res\()3_q, [output, #(-6*16)] + stp \res\()4_q, \res\()5_q, [output, #(-4*16)] + stp \res\()6_q, \res\()7_q, [output, #(-2*16)] + rev64 \res\()0.16b, \res\()0.16b + rev64 \res\()1.16b, \res\()1.16b + rev64 \res\()2.16b, \res\()2.16b + rev64 \res\()3.16b, \res\()3.16b + rev64 \res\()4.16b, \res\()4.16b + rev64 \res\()5.16b, \res\()5.16b + rev64 \res\()6.16b, \res\()6.16b + rev64 \res\()7.16b, \res\()7.16b +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_enc_kernel_slothy_base_128: +aes_gcm_enc_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel +#endif + + AARCH64_SIGN_LINK_REGISTER + sub sp, sp, #STACK_SIZE + +Lenc_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Lenc_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + next_ctr_init_aes aes_st0 // @slothy:pre=true + next_ctr_init_aes aes_st1 // @slothy:pre=true + next_ctr_init_aes aes_st2 // @slothy:pre=true + next_ctr_init_aes aes_st3 // @slothy:pre=true + next_ctr_init_aes aes_st4 // @slothy:pre=true + next_ctr_init_aes aes_st5 // @slothy:pre=true + next_ctr_init_aes aes_st6 // @slothy:pre=true + next_ctr_init_aes aes_st7 + + aesr_x8 0 // @slothy:core=true + aesr_x8 1 // @slothy:core=true + aesr_x8 2 // @slothy:core=true + aesr_x8 3 // @slothy:core=true + aesr_x8 4 // @slothy:core=true + aesr_x8 5 // @slothy:core=true + aesr_x8 6 // @slothy:core=true + aesr_x8 7 // @slothy:core=true + aesr_x8 8 // @slothy:core=true + aese_x8 9 // @slothy:core=true + + load_round_key 10 + ldp plain0_q, plain1_q, [input], #(8*16) + eor3 res0.16b, plain0.16b, rk10.16b, aes_st0.16b // @slothy:core=true + eor3 res1.16b, plain1.16b, rk10.16b, aes_st1.16b // @slothy:core=true + ldp plain2_q, plain3_q, [input, #(-6*16)] + eor3 
res2.16b, plain2.16b, rk10.16b, aes_st2.16b // @slothy:core=true + eor3 res3.16b, plain3.16b, rk10.16b, aes_st3.16b // @slothy:core=true + ldp plain4_q, plain5_q, [input, #(-4*16)] + eor3 res4.16b, plain4.16b, rk10.16b, aes_st4.16b // @slothy:core=true + eor3 res5.16b, plain5.16b, rk10.16b, aes_st5.16b // @slothy:core=true + ldp plain6_q, plain7_q, [input, #(-2*16)] + eor3 res6.16b, plain6.16b, rk10.16b, aes_st6.16b // @slothy:core=true + eor3 res7.16b, plain7.16b, rk10.16b, aes_st7.16b // @slothy:core=true + store_0_8 res + + eor res0.16b, res0.16b, tag.16b + + load_htable_78 + ghash_init_pair res0, res1, Ht8, Ht7, Ht78, 0 // @slothy:post=true + load_htable_56 + ghash_init_pair res2, res3, Ht6, Ht5, Ht56, 1 // @slothy:post=true + load_htable_34 + ghash_acc_pair res4, res5, Ht4, Ht3, Ht34, 0 // @slothy:post=true + load_htable_12 + ghash_acc_pair res6, res7, Ht2, Ht1, Ht12, 1 // @slothy:post=true + + eor ghash_lo.16b, ghash_lo0.16b, ghash_lo1.16b + eor ghash_hi.16b, ghash_hi0.16b, ghash_hi1.16b + eor ghash_mid.16b, ghash_mid0.16b, ghash_mid1.16b + + ghash_finalize tag // @slothy:post=true + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + next_ctr_init_aes aes_st0 + + load_round_key 0 + aesr aes_st0.16b, rk0.16b + + load_round_key 1 + aesr aes_st0.16b, rk1.16b + + load_round_key 2 + aesr aes_st0.16b, rk2.16b + + load_round_key 3 + aesr aes_st0.16b, rk3.16b + + load_round_key 4 + aesr aes_st0.16b, rk4.16b + + load_round_key 5 + aesr aes_st0.16b, rk5.16b + + load_round_key 6 + aesr aes_st0.16b, rk6.16b + + load_round_key 7 + aesr aes_st0.16b, rk7.16b + + load_round_key 8 + aesr aes_st0.16b, rk8.16b + + load_round_key 9 + aese aes_st0.16b, rk9.16b + + load_round_key 10 + ldr plain0_q, [input], #16 + eor3 res0.16b, plain0.16b, rk10.16b, aes_st0.16b + str res0_q, [output], #16 + + ghash_init_0 res0, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Lenc_postamble_end: + add sp, sp, #STACK_SIZE + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_manual_eor3.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_manual_eor3.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_manual_eor3.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_manual_eor3.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S new file mode 100644 index 00000000000..6e891d62a9c --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S @@ -0,0 +1,752 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
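+//
+// Variant notes (a reading of the code below, matching the file name, not a
+// specification): the IV and counter are kept in scalar registers and fresh
+// AES counter blocks are materialized through per-block stack slots; the
+// final round key (rk10) is held in scalar registers and XORed into the
+// plaintext on the scalar side; the running tag is folded in only at the
+// last GHASH accumulation of each unrolled iteration ("late tag"); and the
+// unrolled loop is manually rotated so that one iteration's GHASH
+// finalization sits next to the first AES blocks of the next iteration.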
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel_slothy_base_128 +.hidden aes_gcm_enc_kernel_slothy_base_128 +.type aes_gcm_enc_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_enc_kernel_slothy_base_128 +.private_extern _aes_gcm_enc_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x16 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +Ht5q .req q15 +Ht6q .req q16 +Ht56q .req q17 + +Ht5 .req v15 +Ht6 .req v16 +Ht56 .req v17 + +Ht7q .req q15 +Ht8q .req q16 +Ht78q .req q17 + +Ht7 .req v15 +Ht8 .req v16 +Ht78 .req v17 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk10_lo .req x20 +rk10_hi .req x21 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +plain_lo .req x22 +plain_hi .req x23 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 8 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, 
x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out, loc + aese \aes_st\().16b, rk9.16b + eor \plain\()_lo, \plain\()_lo, rk10_lo + eor \plain\()_hi, \plain\()_hi, rk10_hi + stp \plain\()_lo, \plain\()_hi, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \plain\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc + eor \out\().16b, \plain\().16b, \aes_st\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output, \loc +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_key_scalar i + ldp rk\i\()_lo, rk\i\()_hi, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key_scalar 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, 
[Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull 
ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_enc_kernel_slothy_base_128: +aes_gcm_enc_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Lenc_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Lenc_preamble_end: + + cbz count, Lloop_unrolled_end + + ldp plain_lo, plain_hi, [input, #(7*16)] + aes_full_block aes_st, plain, res, 7 + str res_q, [output, #(7*16)] + + load_htable_12 + ghash_init_0 res, Ht1, Ht12 + + ldp plain_lo, plain_hi, [input, #(6*16)] + aes_full_block aes_st, plain, res, 6 + str res_q, [output, #(6*16)] + + ghash_acc_1 res, Ht2, Ht12 + + ldp plain_lo, plain_hi, [input, #(5*16)] + aes_full_block 
aes_st, plain, res, 5 + str res_q, [output, #(5*16)] + + load_htable_34 + ghash_acc_0 res, Ht3, Ht34 + + sub count, count, #1 + cbz count, Lloop_unrolled_core_end +Lloop_unrolled_start: + + ldp plain_lo, plain_hi, [input, #(4*16)] + aes_full_block aes_st, plain, res, 4 + str res_q, [output, #(4*16)] + + ghash_acc_1 res, Ht4, Ht34 + + ldp plain_lo, plain_hi, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + load_htable_56 + ghash_acc_0 res, Ht5, Ht56 + + ldp plain_lo, plain_hi, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 res, Ht6, Ht56 + + ldp plain_lo, plain_hi, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + load_htable_78 + ghash_acc_0 res, Ht7, Ht78 + + ldp plain_lo, plain_hi, [input], #(8*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(8*16) + + ghash_acc_with_tag_1 res, Ht8, Ht78, tag + + ghash_finalize tag + + add ctr, ctr, #UNROLL + + ldp plain_lo, plain_hi, [input, #(7*16)] + aes_full_block aes_st, plain, res, 7 + str res_q, [output, #(7*16)] + + load_htable_12 + ghash_init_0 res, Ht1, Ht12 + + ldp plain_lo, plain_hi, [input, #(6*16)] + aes_full_block aes_st, plain, res, 6 + str res_q, [output, #(6*16)] + + ghash_acc_1 res, Ht2, Ht12 + + ldp plain_lo, plain_hi, [input, #(5*16)] + aes_full_block aes_st, plain, res, 5 + str res_q, [output, #(5*16)] + + load_htable_34 + ghash_acc_0 res, Ht3, Ht34 + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_core_end: + + ldp plain_lo, plain_hi, [input, #(4*16)] + aes_full_block aes_st, plain, res, 4 + str res_q, [output, #(4*16)] + + ghash_acc_1 res, Ht4, Ht34 + + ldp plain_lo, plain_hi, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + load_htable_56 + ghash_acc_0 res, Ht5, Ht56 + + ldp plain_lo, plain_hi, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 res, Ht6, Ht56 + + ldp plain_lo, plain_hi, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + load_htable_78 + ghash_acc_0 res, Ht7, Ht78 + + ldp plain_lo, plain_hi, [input], #(8*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(8*16) + + ghash_acc_with_tag_1 res, Ht8, Ht78, tag + + ghash_finalize tag // @slothy:late + + add ctr, ctr, #UNROLL + +Lloop_unrolled_end: + + load_htable_12 + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldp plain_lo, plain_hi, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + ghash_init_with_tag_0 res, Ht1, Ht12, tag + + ghash_finalize tag + + add ctr, ctr, #1 + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Lenc_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_basic.S new file mode 100644 index 00000000000..6d450e4d22b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_basic.S @@ -0,0 +1,548 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel_slothy_base_192 +.hidden aes_gcm_enc_kernel_slothy_base_192 +.type aes_gcm_enc_kernel_slothy_base_192,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_enc_kernel_slothy_base_192 +.private_extern _aes_gcm_enc_kernel_slothy_base_192 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st .req v0 +aes_st_q .req q0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk11q .req q15 +rk12q .req q16 +rk13q .req q17 +rk14q .req q2 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 
+rk9 .req v27 +rk10 .req v28 + +rk11 .req v15 +rk12 .req v16 +rk13 .req v17 +rk14 .req v2 + +plain .req v29 +plain_q .req q29 + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + // set up counter increment + mov constant_temp, #0x100000000 + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +// Increase AES counter +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +// Increase AES counter and initialize new AES state +.macro next_ctr_init_aes aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + 
eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk11.16b + eor3 \out\().16b, \plain\().16b, rk12.16b, \aes_st\().16b +.endm + +.macro aes_full_block aes_st, input, output + next_ctr_init_aes \aes_st + aesr_0_8 \aes_st\(), rk + aesr_9_10 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 + load_round_key 11 + load_round_key 12 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + 
ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_enc_kernel_slothy_base_192: +aes_gcm_enc_kernel_slothy_base_192: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Lenc_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Lenc_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 res, Ht4, Ht34, tag + + ldr plain_q, [input, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] + + ghash_acc_0 res, Ht3, Ht34 + + ldr plain_q, [input, #(-2*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 res, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] + + ghash_acc_0 res, Ht1, Ht12 + + ghash_finalize tag + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 res, Ht1, Ht12, tag + + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Lenc_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-192.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-192.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_basic.S similarity index 78% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_basic.S index 1a67fb97175..14274282d61 100644 --- a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_dual_acc.S +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_basic.S @@ -74,17 +74,11 @@ full_blocks .req x7 remainder .req x9 unroll .req x10 -aes_st0 .req v0 -aes_st0_q .req q0 +aes_st .req v0 +aes_st_q .req q0 -aes_st1 .req v2 -aes_st1_q .req q2 - -res0 .req v0 -res0_q .req q0 - -res1 .req v2 -res1_q .req q2 +res .req v0 +res_q .req q0 ghash_hi .req v9 ghash_lo .req v8 @@ -116,22 +110,6 @@ Ht3 .req Ht1 Ht4 .req Ht2 Ht34 .req Ht12 -Ht5q .req Ht1q -Ht6q .req Ht2q -Ht56q .req Ht12q - -Ht5 .req Ht1 -Ht6 .req Ht2 -Ht56 .req Ht12 - -Ht7q .req Ht1q -Ht8q .req Ht2q -Ht78q .req Ht12q - -Ht7 .req Ht1 -Ht8 .req Ht2 -Ht78 .req Ht12 - rk0q .req q18 rk1q .req q19 rk2q .req q20 @@ -147,7 +125,7 @@ rk10q .req q28 rk11q .req q15 rk12q .req q16 rk13q .req q17 -rk14q .req q1 +rk14q .req q2 rk0 .req v18 rk1 .req v19 @@ -164,7 +142,7 @@ rk10 .req v28 rk11 .req v15 rk12 .req v16 rk13 .req v17 -rk14 .req v1 +rk14 .req v2 plain .req v29 plain_q .req q29 @@ -221,10 +199,14 @@ tag_q .req q11 ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +// Derive number of iterations of unrolled loop and single-block loop .macro prepare_loop_counts mov unroll, #UNROLL + // Number of AES Blocks (16b each) lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each msub remainder, count, unroll, full_blocks .endm @@ -235,17 +217,20 @@ tag_q .req q11 .macro load_iv ldr rtmp_ctr_q, [ivec] - mov constant_temp, #0x100000000 // set up counter increment + // set up counter increment + mov constant_temp, #0x100000000 movi rctr_inc.16b, #0x0 fmov rctr_inc.d[1], 
constant_temp rev32 rtmp_ctr.16b, rtmp_ctr.16b .endm +// Increase AES counter .macro aes_ctr_inc add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s .endm +// Increase AES counter and initialize new AES state .macro next_ctr_init_aes aes_st rev32 \aes_st\().16b, rtmp_ctr.16b aes_ctr_inc @@ -295,7 +280,7 @@ tag_q .req q11 next_ctr_init_aes \aes_st aesr_0_8 \aes_st\(), rk aesr_9_10 \aes_st\(), rk - aesr_11_12 \aes_st\(), rk + aesr_11_12 \aes_st\(), rk aesr_final \aes_st, \input, \output .endm @@ -343,22 +328,6 @@ tag_q .req q11 ldr \dst_q, [Htable, #80] .endm -.macro load_h5 dst, dst_q - ldr \dst_q, [Htable, #96] -.endm - -.macro load_h6 dst, dst_q - ldr \dst_q, [Htable, #128] -.endm - -.macro load_h7 dst, dst_q - ldr \dst_q, [Htable, #144] -.endm - -.macro load_h8 dst, dst_q - ldr \dst_q, [Htable, #176] -.endm - .macro load_h12 dst, dst_q ldr \dst_q, [Htable, #16] .endm @@ -367,24 +336,13 @@ tag_q .req q11 ldr \dst_q, [Htable, #64] .endm -.macro load_h56 dst, dst_q - ldr \dst_q, [Htable, #112] -.endm - -.macro load_h78 dst, dst_q - ldr \dst_q, [Htable, #160] -.endm - .macro load_full_htable load_h1 Ht1, Ht1q load_h2 Ht2, Ht2q load_h3 Ht3, Ht3q load_h4 Ht4, Ht4q - load_h5 Ht5, Ht5q - load_h6 Ht6, Ht6q load_h12 Ht12, Ht12q load_h34 Ht34, Ht34q - load_h56 Ht56, Ht56q .endm .macro load_htable_12 @@ -399,68 +357,10 @@ tag_q .req q11 load_h34 Ht34, Ht34q .endm -.macro load_htable_56 - load_h5 Ht5, Ht5q - load_h6 Ht6, Ht6q - load_h56 Ht56, Ht56q -.endm - -.macro load_htable_78 - load_h7 Ht7, Ht7q - load_h8 Ht8, Ht8q - load_h78 Ht78, Ht78q -.endm - /********************************************************************/ /* Macros for GHASH udpate */ /********************************************************************/ -.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - eor \inputa\().16b, \inputa\().16b, tag.16b - - // Low product - pmull ghash_lo.1q, \inputa\().1d, \Ha\().1d - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_hi.1q, \inputa\().2d, \Ha\().2d - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - -.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - - // Low product - pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d - eor ghash_mid.16b, ghash_mid.16b, \inputa\().16b - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - .macro ghash_init_0 input, Hk, Hk_mid, tag rev64 
\input\().16b, \input\().16b eor \input\().16b, \input\().16b, \tag\().16b @@ -557,7 +457,7 @@ aes_gcm_enc_kernel_slothy_base_256: strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel #endif - AARCH64_SIGN_LINK_REGISTER + AARCH64_VALID_CALL_TARGET sub sp, sp, #STACK_SIZE Lenc_preamble_start: @@ -578,29 +478,31 @@ Lenc_preamble_end: cbz count, Lloop_unrolled_end Lloop_unrolled_start: - load_round_key 14 - ldr plain_q, [input], #(4*16) - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 res, Ht4, Ht34, tag ldr plain_q, [input, #(-3*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] - load_htable_34 - ghash_init_pair res0, res1, Ht4, Ht3, Ht34 + ghash_acc_0 res, Ht3, Ht34 ldr plain_q, [input, #(-2*16)] - aes_full_block aes_st0, plain, res0 - str res0_q, [output, #(-2*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 res, Ht2, Ht12 ldr plain_q, [input, #(-1*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] - load_htable_12 - ghash_acc_pair res0, res1, Ht2, Ht1, Ht12 + ghash_acc_0 res, Ht1, Ht12 ghash_finalize tag @@ -613,12 +515,10 @@ Lloop_unrolled_end: cbz remainder, Lloop_1x_end Lloop_1x_start: - load_round_key 14 - ldr plain_q, [input], #16 - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #16 - ghash_init_0 res0, Ht1, Ht12, tag + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 res, Ht1, Ht12, tag ghash_finalize tag @@ -641,7 +541,6 @@ Lloop_1x_end: Lenc_postamble_end: add sp, sp, #STACK_SIZE - AARCH64_VALIDATE_LINK_REGISTER ret #endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-256.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-256.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_basic.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_basic.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_ilp.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_ilp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable_rotate.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_full.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_full.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_full.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_full.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv.S rename 
to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag.S similarity index 100% rename from 
crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_ilp_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_ilp_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_ilp_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_ilp_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_basic.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_basic.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S 
b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/optimize.sh b/crypto/fipsmodule/modes/asm/slothy/optimize.sh index e41823c6a2d..cfbb6dfabae 100755 --- a/crypto/fipsmodule/modes/asm/slothy/optimize.sh +++ b/crypto/fipsmodule/modes/asm/slothy/optimize.sh @@ -24,6 +24,17 @@ if [ "$SZ" = "" ]; then echo "No keysize specified -- defaulting to 128 bit" fi +if [ "$ENC" = "" ]; then + echo "Environment variable ENC not set. Defaulting to ENC=1 (encryption)." + ENC=1 +fi + +if [ "$ENC" = "1" ]; then + ENCDEC="enc" +else + ENCDEC="dec" +fi + if [ "$AWS_LC_BASE" = "" ]; then # Oof... bit gross AWS_LC_BASE=$(dirname $(dirname $(dirname $(dirname $(dirname $(pwd)))))) @@ -32,9 +43,9 @@ fi BUILD_DIR=build_release -CLEAN_STEM=aesv8-gcm-armv8-base-${SZ} -OPT_STEM=aesv8-gcm-armv8-opt-${SZ} -TMP_STEM=aesv8-gcm-armv8-tmp-${SZ} +CLEAN_STEM=aesv8-gcm-armv8-${ENCDEC}-base-${SZ} +OPT_STEM=aesv8-gcm-armv8-${ENCDEC}-opt-${SZ} +TMP_STEM=aesv8-gcm-armv8-${ENCDEC}-tmp-${SZ} UARCH=${UARCH:=N1} if [ $UARCH = "N1" ]; then @@ -46,46 +57,23 @@ else exit 1 fi -if [ $SZ = "128" ]; then - VARIANTS_ALL=" - x4_basic - x4_late_tag - x4_ilp - x4_dual_acc - x4_dual_acc_keep_htable - x4_keep_htable - x4_keep_htable_rotate - x4_reload_round_keys_partial - x4_reload_round_keys_full - x4_scalar_iv - x4_scalar_iv_mem - x4_scalar_iv_mem_late_tag - x4_scalar_iv_mem_late_tag_keep_htable - x6_basic - x8_basic - x6_ilp - x8_ilp - x8_ilp_dual_acc - x8_ilp_rotate - x8_ilp_rotate_dual_acc - x8_ilp_rotate_manual_eor3 - x8_reload - x8_reload_ldp_stp - x8_reload_ldp_stp_dual_acc - x8_reload_ldp_stp_simpler - x8_reload_ldp_stp_simpler_manual_rotate - " -elif [ $SZ = "192" ]; then - VARIANTS_ALL=" - x4_basic - x4_reload_round_keys_partial - " -else - VARIANTS_ALL=" - x4_basic - x4_reload_round_keys_partial - " -fi +list_variants() { + SZ=$1 + UNROLL=$2 + DIR=$3 + VARIANTS=$((ls -1 ./${DIR}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ') 2>/dev/null || echo "") + echo $VARIANTS +} + +VARIANTS_ALL="" +for UNROLL in x4 x6 x8 +do + for V in $(list_variants $SZ $UNROLL "clean/${ENCDEC}"); + do + VARIANTS_ALL="$VARIANTS_ALL + ${UNROLL}_$V" + done +done VERBOSE=${VERBOSE:=0} TIMEOUT=${TIMEOUT:=1200} # 20min timeout by default @@ -181,8 +169,8 @@ optimize_generic() { optimize_variant() { echo "Optimizing variant $1 ..." - INFILE=$CLEAN_DIR/${CLEAN_STEM}_$1.S - OUTFILE=$OPT_DIR/${OPT_STEM}_$1.S + INFILE=$CLEAN_DIR/${ENCDEC}/${CLEAN_STEM}_$1.S + OUTFILE=$OPT_DIR/${ENCDEC}/${OPT_STEM}_$1.S TMP0=$TMP_DIR/${TMP_STEM}_$1_0.S TMP1=$TMP_DIR/${TMP_STEM}_$1_1.S TMP2=$TMP_DIR/${TMP_STEM}_$1_2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/stats.sh b/crypto/fipsmodule/modes/asm/slothy/stats.sh index b283b6d26fb..27b2b5b5d98 100755 --- a/crypto/fipsmodule/modes/asm/slothy/stats.sh +++ b/crypto/fipsmodule/modes/asm/slothy/stats.sh @@ -36,13 +36,25 @@ if [ "$UNROLL_ARG" = "" ]; then echo "No unrolling level specified UNROLL_ARG -- defaulting to ${UNROLL_ARG}" fi +if [ "$ENC" = "" ]; then + echo "Environment variable ENC not set. Defaulting to ENC=1 (encryption)." 
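+    # ENC=1 selects the encryption kernels; any other value selects the decryption kernels.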
+ ENC=1 +fi + +if [ "$ENC" = "1" ]; then + ENCDEC="enc" +else + ENCDEC="dec" +fi + + LOOP_LABEL="Lloop_unrolled_start:" list_variants() { SZ=$1 UNROLL=$2 DIR=$3 - VARIANTS=$(ls -1 ./${DIR}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ' ) + VARIANTS=$(ls -1 ./${DIR}/${ENCDEC}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ' ) echo $VARIANTS } @@ -53,7 +65,7 @@ get_benchmark_for() { UNROLL=$2 DIR=$3 VARIANT=$4 - ID="${DIR}/${SZ}_${UNROLL}_${VARIANT}" + ID="${DIR}/${ENCDEC}/${SZ}_${UNROLL}_${VARIANT}" cat $BENCHMARKS \ | grep "Testing variant: ${ID}" -A 10 \ | grep "MB/s" \ @@ -78,8 +90,8 @@ get_slothy_stats_for() { else TY="base" fi - FILEBASE="aesv8-gcm-armv8" - FILE="${DIR}/${FILEBASE}-${TY}-${SZ}_${UNROLL}_${VARIANT}.S" + FILEBASE="aesv8-gcm-armv8-enc" + FILE="${DIR}/${ENCDEC}/${FILEBASE}-${TY}-${SZ}_${UNROLL}_${VARIANT}.S" cat $FILE \ | grep "${LOOP_LABEL}" -A 10 \ | sed -n 's/[^0-9]*\([0-9][0-9.]*\).*/\1/p' \ @@ -91,7 +103,7 @@ get_stats_for() { UNROLL=$2 DIR=$3 VARIANT=$4 - ID="${DIR}/${SZ}_${UNROLL}_${VARIANT}" + ID="${DIR}/${ENCDEC}/${SZ}_${UNROLL}_${VARIANT}" BENCH=$(get_benchmark_for $1 $2 $3 $4) if [ "$DIR" = "opt" ]; then SLOTHY=$(get_slothy_stats_for $1 $2 $3 $4) diff --git a/crypto/fipsmodule/modes/asm/slothy/test.sh b/crypto/fipsmodule/modes/asm/slothy/test.sh index fa5408e9f14..fce132b022a 100755 --- a/crypto/fipsmodule/modes/asm/slothy/test.sh +++ b/crypto/fipsmodule/modes/asm/slothy/test.sh @@ -3,7 +3,7 @@ # Build and test AES-GCM variants # # Usage: -# > [BENCH=0/1] [AWS_LC_BASE=PATH] [BUILD_DIR=DIRNAME] [VERBOSE=0/1] [OPT=0/1] test.sh [variant] +# > [ENC=0/1] [BENCH=0/1] [AWS_LC_BASE=PATH] [BUILD_DIR=DIRNAME] [VERBOSE=0/1] [OPT=0/1] test.sh [variant] # # This script tests that the assembly files in clean/ or opt/ can be used as drop-in # replacements for the default aesv8-gcm-armv8-base-{128,192,256} @@ -34,6 +34,17 @@ else OPT_STR="opt" fi +if [ "$ENC" = "" ]; then + echo "Environment variable ENC not set. Defaulting to ENC=1 (encryption)." + ENC=1 +fi + +if [ "$ENC" = "1" ]; then + ENCDEC="enc" +else + ENCDEC="dec" +fi + if [ "$VERBOSE" = "" ]; then VERBOSE=0 echo "Environment variable VERBOSE not set. Defaulting to VERBOSE=0 (silent mode)." @@ -48,16 +59,17 @@ TIMEOUT=5 # Run tests for 5 seconds -- they often hang upon a bug KEEP_GOING=${KEEP_GOING:=0} ASM_DIR=../ -AES_SLOTHY_ASM=aesv8-gcm-armv8-slothy-${SZ}.S if [ "$OPT" = "0" ]; then - DIR=./clean - FILE_STEM=aesv8-gcm-armv8-base-${SZ} + DIR=./clean/${ENCDEC} + FILE_STEM=aesv8-gcm-armv8-${ENCDEC}-base-${SZ} else - DIR=./opt - FILE_STEM=aesv8-gcm-armv8-opt-${SZ} + DIR=./opt/${ENCDEC} + FILE_STEM=aesv8-gcm-armv8-${ENCDEC}-opt-${SZ} fi +AES_SLOTHY_ASM=aesv8-gcm-armv8-${ENCDEC}-slothy-${SZ}.S + set_variant() { cp $DIR/${FILE_STEM}_$1.S $ASM_DIR/$AES_SLOTHY_ASM } @@ -84,7 +96,7 @@ bench_variant() { } do_variant() { - echo "* Testing variant: ${OPT_STR}/${SZ}_$1" + echo "* Testing variant: ${OPT_STR}/${ENCDEC}/${SZ}_$1" printf " - Copy... 
" set_variant $1 printf "OK!\n" @@ -115,50 +127,26 @@ do_variant() { fi } +list_variants() { + SZ=$1 + UNROLL=$2 + DIR=$3 + VARIANTS=$((ls -1 ./${DIR}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ') 2>/dev/null || echo "") + echo $VARIANTS +} -if [ $SZ = "128" ]; then - VARIANTS=" - x4_basic - x4_late_tag - x4_ilp - x4_dual_acc - x4_dual_acc_keep_htable - x4_keep_htable - x4_keep_htable_rotate - x4_reload_round_keys_partial - x4_reload_round_keys_full - x4_scalar_iv - x4_scalar_iv_mem - x4_scalar_iv_mem_late_tag - x4_scalar_iv_mem_late_tag_keep_htable - x6_basic - x8_basic - x6_ilp - x8_ilp - x8_ilp_dual_acc - x8_ilp_rotate - x8_ilp_rotate_dual_acc - x8_ilp_rotate_manual_eor3 - x8_reload - x8_reload_ldp_stp - x8_reload_ldp_stp_dual_acc - x8_reload_ldp_stp_simpler - x8_reload_ldp_stp_simpler_manual_rotate - " -elif [ $SZ = "192" ]; then - VARIANTS=" - x4_basic - x4_reload_round_keys_partial - " -else - VARIANTS=" - x4_basic - x4_reload_round_keys_partial - " -fi +VARIANTS="" +for UNROLL in x4 x6 x8 +do + for V in $(list_variants $SZ $UNROLL $DIR); + do + VARIANTS="$VARIANTS + ${UNROLL}_$V" + done +done if [ "$1" = "--help" ]; then - echo "Usage: [VERBOSE=0/1] [OPT=0/1] test.sh [variant]" + echo "Usage: [ENC=0/1] [BENCH=0/1] [AWS_LC_BASE=PATH] [BUILD_DIR=DIRNAME] [VERBOSE=0/1] [OPT=0/1] test.sh [variant]" echo "Valid values for 'variant' are:" for var in $VARIANTS; do echo "* $var" diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c index 64717bf4cfe..756aa995633 100644 --- a/crypto/fipsmodule/modes/gcm.c +++ b/crypto/fipsmodule/modes/gcm.c @@ -161,9 +161,9 @@ static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, // in the case of the EVP API. // In the case of the AEAD API, it can be used for all input lengths // but we are not identifying which API calls the code below. - #define USE_SLOTHY_AES_GCM_128 1 + #define USE_SLOTHY_AES_GCM_ENC_128 -#if defined(USE_SLOTHY_AES_GCM_128) +#if defined(USE_SLOTHY_AES_GCM_ENC_128) if (key->rounds == 10) { aes_gcm_enc_kernel_slothy_base_128(in, len_blocks * 8, out, Xi, ivec, key, Htable); } @@ -212,6 +212,20 @@ static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, // in the case of the EVP API. // In the case of the AEAD API, it can be used for all input lengths // but we are not identifying which API calls the code below. 
+ #define USE_SLOTHY_AES_GCM_DEC_128 + +#if defined(USE_SLOTHY_AES_GCM_DEC_128) + if (key->rounds == 10) { + aes_gcm_dec_kernel_slothy_base_128(in, len_blocks * 8, out, Xi, ivec, key, Htable); + } + else if (key->rounds == 12) { + aes_gcm_dec_kernel_slothy_base_192(in, len_blocks * 8, out, Xi, ivec, key, Htable); + } + else if (key->rounds == 14) { + aes_gcm_dec_kernel_slothy_base_256(in, len_blocks * 8, out, Xi, ivec, key, Htable); + } + else +#endif if (CRYPTO_is_ARMv8_GCM_8x_capable() && len >= 256) { switch(key->rounds) { case 10: diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h index 0f91fc335db..2380376d072 100644 --- a/crypto/fipsmodule/modes/internal.h +++ b/crypto/fipsmodule/modes/internal.h @@ -371,6 +371,19 @@ void aes_gcm_enc_kernel_slothy_base_256(const uint8_t *in, uint64_t in_bits, voi void *Xi, uint8_t *ivec, const AES_KEY *key, const u128 Htable[16]); +void aes_gcm_dec_kernel_slothy_base_128(const uint8_t *in, uint64_t in_bits, void *out, + void *Xi, uint8_t *ivec, const AES_KEY *key, + const u128 Htable[16]); + +void aes_gcm_dec_kernel_slothy_base_192(const uint8_t *in, uint64_t in_bits, void *out, + void *Xi, uint8_t *ivec, const AES_KEY *key, + const u128 Htable[16]); + +void aes_gcm_dec_kernel_slothy_base_256(const uint8_t *in, uint64_t in_bits, void *out, + void *Xi, uint8_t *ivec, const AES_KEY *key, + const u128 Htable[16]); + + // These functions are defined in aesv8-gcm-armv8-unroll8.pl. // They take input length in BITS and return number of BYTES processed. size_t aesv8_gcm_8x_enc_128(const uint8_t *in, size_t bit_len, uint8_t *out,