From 9cf92ce4c3af6532b7dec8c5f03c75e277d75f71 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Tue, 9 Jul 2024 15:03:41 +0100 Subject: [PATCH] Add SLOTHY-optimized AES-GCM-128 decryption kernels --- .../aesv8-gcm-armv8-dec-opt-128_x4_basic.S | 1080 +++++++++++ ...sv8-gcm-armv8-dec-opt-128_x4_keep_htable.S | 1046 +++++++++++ ...-gcm-armv8-dec-opt-128_x4_scalar_iv_mem2.S | 1517 ++++++++++++++++ ...v8-dec-opt-128_x4_scalar_iv_mem_late_tag.S | 1602 +++++++++++++++++ ...28_x4_scalar_iv_mem_late_tag_keep_htable.S | 1545 ++++++++++++++++ 5 files changed, 6790 insertions(+) create mode 100644 crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_basic.S create mode 100644 crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_keep_htable.S create mode 100644 crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem2.S create mode 100644 crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag.S create mode 100644 crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_basic.S new file mode 100644 index 00000000000..2be7cea2abf --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_basic.S @@ -0,0 +1,1080 @@ +// Copyright (c) 2022, ARM Inc. + +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. + +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// Author: Hanno Becker + +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee. + +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +// Loop bookkeeping. NOTE: count is x1, the same register as len_bits, so +// len_bits has to be read before count is first written. +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +// aes_st and res are two names for the same register (v0) +aes_st .req v0 +aes_st_q .req q0 + +res .req v0 +res_q .req q0 + +// GHASH accumulators for the high, low and middle partial products +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 
+ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +// Scratch registers used by ghash_finalize (modulo_tmp0 is v0, the same +// register as aes_st/res) +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +// Ht3..Ht8 and Ht34/Ht56/Ht78 are further names for the Ht1/Ht2/Ht12 +// registers (v12/v13/v14): loading one group replaces the previous one. +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +// Round keys rk0..rk10 live in v18..v28 +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +// rctr_inc: per-lane counter increment, rtmp_ctr: running counter (see load_iv) +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +// NOTE: tag is v11, the same register as ghash_tmp +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +// Stack frame: six 16-byte slots for x19..x30 at offset 0, followed by four +// 16-byte slots for d8..d15. +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +// Store d8..d15 (the callee-saved low halves of v8..v15 under AAPCS64) to the +// vector area of the frame. Expects sp to already point at the frame. +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +// Reload d8..d15 from the slots written by save_vregs. +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +// Store x19..x30 to the GPR area of the frame. +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, 
#(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Reload x19..x30 from the slots written by save_gprs. +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive the iteration counts of the unrolled loop and the single-block loop. +// In: byte_len. +// Out: full_blocks = byte_len / 16, count = full_blocks / UNROLL, +// remainder = full_blocks - count * UNROLL. Also overwrites unroll. +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES blocks (16 bytes each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +// Load the 16-byte counter block from [ivec] into rtmp_ctr, stored with the +// bytes of every 32-bit word reversed, and build rctr_inc. +// Overwrites constant_temp. +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + // Set up the counter increment: the upper 64 bits of rctr_inc become + // 0x100000000, i.e. the topmost 32-bit lane is 1 and all other lanes are 0 + mov constant_temp, #0x100000000 + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +// Advance the AES counter: lane-wise 32-bit add of rctr_inc to rtmp_ctr +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +// Initialize a new AES state from the current counter (rev32 undoes the +// per-word byte reversal applied in load_iv), then advance the counter +.macro next_ctr_init_aes aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +// A single AES round: AESE followed by AESMC on the same register. +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESE+AESMC +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + +// Nine AES rounds on data, using the registers named <key>0 .. <key>8 +// (pass key=rk to use rk0..rk8) +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, 
\key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +// Two further full rounds with <key>9 and <key>10. +// NOTE(review): aes_full_block below does not use this macro; it finishes +// with aesr_final instead. +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +// Two further full rounds with <key>11 and <key>12. +// NOTE(review): the visible alias list only defines rk0..rk10, so expanding +// this with key=rk would not assemble here; presumably kept for parity with +// the larger-key variants. TODO confirm it is unused in this file. +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Three-way XOR: out = inA ^ inB ^ inC. +// Destructs inA (it is left holding inA ^ inB). +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +// Finish a block: AESE with rk9 (no AESMC), then XOR with rk10 and with plain; +// the result goes to out. aes_st is overwritten. +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +// Process one block: take the next counter value into aes_st, run the nine +// rounds with rk0..rk8, then aesr_final with input, writing output. +.macro aes_full_block aes_st, input, output + next_ctr_init_aes \aes_st + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +// Load round key number i (16 bytes at key + 16*i) into rk<i> +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +// Load rk0..rk10 +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation +// +// Byte offsets from Htable used by the macros below: +// h1: 0 h12: 16 h2: 32 h3: 48 h34: 64 h4: 80 +// h5: 96 h56: 112 h6: 128 h7: 144 h78: 160 h8: 176 +// Each macro loads 16 bytes into dst_q; the dst argument is not referenced. + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] 
+.endm + +// NOTE(review): Ht3..Ht6 and Ht34/Ht56 are .req aliases of Ht1/Ht2/Ht12 in +// this file, so every later load below overwrites an earlier one; after +// expansion only the h5, h6 and h56 entries remain in registers. +// TODO confirm whether this macro is still meant to be used. +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +// The four macros below each load one group of three table entries +// (two H entries and the matching combined entry) into the shared +// Ht registers, replacing whatever group was loaded before. +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH update */ +/********************************************************************/ + +// Start a new accumulation: byte-reverse each 64-bit half of input, XOR in +// tag, then write (not accumulate) ghash_lo, ghash_hi and ghash_mid. +// The middle product uses the low 64 bits of Hk_mid. +// Modifies input and ghash_tmp. +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +// Same as ghash_init_0, except that the middle product uses the upper +// 64 bits of Hk_mid (pmull2). +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +// Accumulate another block: byte-reverse each 64-bit half of input and XOR +// its low, high and middle products into ghash_lo, ghash_hi and ghash_mid. +// The middle product uses the low 64 bits of Hk_mid. +// Modifies input and ghash_tmp. +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + 
eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +// Same as ghash_acc_0, except that the middle product uses the upper +// 64 bits of Hk_mid (pmull2). +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +// Combine ghash_lo, ghash_mid and ghash_hi and reduce them with the constant +// in ghash_mod (see prepare_ghash). The result is written to tag with its two +// 64-bit halves swapped (final ext #8). +// Overwrites modulo_tmp0, modulo_tmp1, ghash_lo, ghash_mid and ghash_hi. +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +// Load 16 bytes from [tag_ptr] into tag and byte-reverse each 64-bit half +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction: + // low 64 bits of ghash_mod = 0xc2 << 56 + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +// Both spellings of the entry symbol (with and without leading underscore) +// are defined on the same address. +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + 
+Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end + // Instructions: 22 + // Expected cycles: 11 + // Expected IPC: 2.00 + // + // Cycle bound: 11.0 + // IPC bound: 2.00 + // + // Wall time: 0.13s + // User time: 0.13s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q5, [x0], #(4*16) // *............................. + ldr q0, [x0, #-48] // *............................. + ldr q16, [x0, #-32] // .*............................ + ldr q4, [x0, #-16] // .*............................ + ldr q15, [x6, #48] // ..*........................... + ldr q2, [x6, #32] // ..*........................... + ldr q8, [x6, #80] // ...*.......................... + ldr q9, [x6] // ...*.......................... + rev64 v1.16B, v5.16B // ....*......................... + rev64 v12.16B, v0.16B // ....*......................... + rev64 v10.16B, v16.16B // .....*........................ + rev64 v13.16B, v4.16B // .....*........................ + eor v3.16B, v1.16B, v11.16B // ......*....................... + pmull v17.1q, v12.1d, v15.1d // ......*....................... + pmull2 v29.1q, v12.2d, v15.2d // .......*...................... + mov d15, v12.d[1] // .......*...................... + pmull2 v1.1q, v3.2d, v8.2d // ........*..................... + ldr q6, [x6, #64] // ........*..................... + eor v15.8B, v15.8B, v12.8B // .........*.................... + pmull v14.1q, v10.1d, v2.1d // .........*.................... + eor v12.16B, v1.16B, v29.16B // ..........*................... + pmull2 v29.1q, v10.2d, v2.2d // ..........*................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q8, [x6, #80] // ...*........................... + // ldr q14, [x6, #32] // ..*............................ 
+ // ldr q5, [x0], #(4*16) // *.............................. + // ldr q1, [x6, #48] // ..*............................ + // rev64 v13.16B, v5.16B // ....*.......................... + // ldr q0, [x0, #-48] // *.............................. + // rev64 v2.16B, v0.16B // ....*.......................... + // mov d10, v2.d[1] // .......*....................... + // ldr q6, [x6, #64] // ........*...................... + // pmull v17.1q, v2.1d, v1.1d // ......*........................ + // eor v3.16B, v13.16B, v11.16B // ......*........................ + // ldr q16, [x0, #-32] // .*............................. + // pmull2 v11.1q, v2.2d, v1.2d // .......*....................... + // pmull2 v13.1q, v3.2d, v8.2d // ........*...................... + // ldr q4, [x0, #-16] // .*............................. + // eor v15.8B, v10.8B, v2.8B // .........*..................... + // rev64 v10.16B, v16.16B // .....*......................... + // ldr q9, [x6] // ...*........................... + // eor v12.16B, v13.16B, v11.16B // ..........*.................... + // pmull2 v29.1q, v10.2d, v14.2d // ..........*.................... + // rev64 v13.16B, v4.16B // .....*......................... + // pmull v14.1q, v10.1d, v14.1d // .........*..................... + + sub count, count, #1 +cbz count, Lloop_unrolled_start_postamble +Lloop_unrolled_start: + // Instructions: 115 + // Expected cycles: 58 + // Expected IPC: 1.98 + // + // Cycle bound: 54.0 + // IPC bound: 2.13 + // + // Wall time: 3601.75s + // User time: 3601.75s + // + // ------------------- cycle (expected) --------------------> + // 0 25 50 + // |------------------------|------------------------|------- + pmull v1.1q, v3.1d, v8.1d // *......................................................... + eor v2.16B, v12.16B, v29.16B // *......................................................... + rev32 v12.16B, v31.16B // .*........................................................ 
+ pmull v8.1q, v13.1d, v9.1d // .*........................................................ + pmull2 v29.1q, v13.2d, v9.2d // ..*....................................................... + add v9.4S, v31.4S, v30.4S // ..*....................................................... + aesr v12.16b, v18.16b // ...*...................................................... + eor v11.16B, v1.16B, v17.16B // ...*...................................................... + pmull v1.1q, v15.1d, v6.1d // ....*..................................................... + rev32 v31.16B, v9.16B // ....*..................................................... + aesr v12.16b, v19.16b // .....*.................................................... + ext v15.16B, v3.16B, v3.16B, #8 // .....*.................................................... + eor v14.16B, v11.16B, v14.16B // ......*................................................... + aesr v31.16b, v18.16b // ......*................................................... + eor v3.16B, v15.16B, v3.16B // .......*.................................................. + aesr v12.16b, v20.16b // .......*.................................................. + add v11.4S, v9.4S, v30.4S // ........*................................................. + aesr v31.16b, v19.16b // ........*................................................. + eor v29.16B, v2.16B, v29.16B // .........*................................................ + pmull2 v15.1q, v3.2d, v6.2d // .........*................................................ + aesr v12.16b, v21.16b // ..........*............................................... + mov d3, v13.d[1] // ..........*............................................... + eor v1.16B, v15.16B, v1.16B // ...........*.............................................. + aesr v31.16b, v20.16b // ...........*.............................................. + aesr v12.16b, v22.16b // ............*............................................. 
+ ext v9.16B, v10.16B, v10.16B, #8 // ............*............................................. + aesr v31.16b, v21.16b // .............*............................................ + eor v3.8B, v3.8B, v13.8B // .............*............................................ + eor v10.16B, v9.16B, v10.16B // ..............*........................................... + aesr v12.16b, v23.16b // ..............*........................................... + rev32 v15.16B, v11.16B // ...............*.......................................... + aesr v31.16b, v22.16b // ...............*.......................................... + aesr v12.16b, v24.16b // ................*......................................... + add v11.4S, v11.4S, v30.4S // ................*......................................... + eor v17.16B, v14.16B, v8.16B // .................*........................................ + aesr v31.16b, v23.16b // .................*........................................ + aesr v12.16b, v25.16b // ..................*....................................... + rev32 v9.16B, v11.16B // ..................*....................................... + aesr v31.16b, v24.16b // ...................*...................................... + ldr q13, [x6, #16] // ...................*...................................... + aesr v12.16b, v26.16b // ....................*..................................... + ldr q8, [x6, #80] // ....................e..................................... + ldr q14, [x6, #32] // .....................e.................................... + aesr v31.16b, v25.16b // .....................*.................................... + aese v12.16b, v27.16b // ......................*................................... + pmull2 v6.1q, v10.2d, v13.2d // .......................*.................................. + eor v10.16B, v17.16B, v29.16B // .......................*.................................. 
+ eor v12.16B, v12.16B, v28.16B // ........................*................................. + aesr v15.16b, v18.16b // ........................*................................. + eor v2.16B, v1.16B, v6.16B // .........................*................................ + aesr v31.16b, v26.16b // .........................*................................ + eor v1.16B, v12.16B, v5.16B // ..........................*............................... + pmull v12.1q, v29.1d, v7.1d // ..........................*............................... + ext v6.16B, v29.16B, v29.16B, #8 // ...........................*.............................. + aesr v9.16b, v18.16b // ...........................*.............................. + ldr q5, [x0], #(4*16) // ............................e............................. + aesr v15.16b, v19.16b // ............................*............................. + aesr v9.16b, v19.16b // .............................*............................ + str q1, [x2], #(4*16) // .............................*............................ + aese v31.16b, v27.16b // ..............................*........................... + ldr q1, [x6, #48] // ..............................e........................... + pmull v29.1q, v3.1d, v13.1d // ...............................*.......................... + eor v3.16B, v6.16B, v12.16B // ...............................*.......................... + rev64 v13.16B, v5.16B // ................................e......................... + aesr v15.16b, v20.16b // ................................*......................... + eor v6.16B, v31.16B, v28.16B // .................................*........................ + aesr v9.16b, v20.16b // .................................*........................ + eor v12.16B, v2.16B, v29.16B // ..................................*....................... + aesr v15.16b, v21.16b // ..................................*....................... 
+ eor v29.16B, v6.16B, v0.16B // ...................................*...................... + aesr v9.16b, v21.16b // ...................................*...................... + ldr q0, [x0, #-48] // ....................................e..................... + aesr v15.16b, v22.16b // ....................................*..................... + eor v12.16B, v12.16B, v10.16B // .....................................*.................... + aesr v9.16b, v22.16b // .....................................*.................... + add v31.4S, v11.4S, v30.4S // ......................................*................... + aesr v15.16b, v23.16b // ......................................*................... + eor v3.16B, v12.16B, v3.16B // .......................................*.................. + aesr v9.16b, v23.16b // .......................................*.................. + aesr v15.16b, v24.16b // ........................................*................. + rev64 v2.16B, v0.16B // ........................................e................. + ext v11.16B, v3.16B, v3.16B, #8 // .........................................*................ + pmull v3.1q, v3.1d, v7.1d // .........................................*................ + mov d10, v2.d[1] // ..........................................e............... + aesr v15.16b, v25.16b // ..........................................*............... + eor v17.16B, v17.16B, v3.16B // ...........................................*.............. + aesr v9.16b, v24.16b // ...........................................*.............. + str q29, [x2, #-48] // ............................................*............. + aesr v15.16b, v26.16b // ............................................*............. + eor v11.16B, v17.16B, v11.16B // .............................................*............ + aesr v9.16b, v25.16b // .............................................*............ 
+ aese v15.16b, v27.16b // ..............................................*........... + ldr q6, [x6, #64] // ..............................................e........... + ext v11.16B, v11.16B, v11.16B, #8 // ...............................................*.......... + aesr v9.16b, v26.16b // ...............................................*.......... + pmull v17.1q, v2.1d, v1.1d // ................................................e......... + eor v15.16B, v15.16B, v28.16B // ................................................*......... + eor v3.16B, v13.16B, v11.16B // .................................................e........ + aese v9.16b, v27.16b // .................................................*........ + eor v15.16B, v15.16B, v16.16B // ..................................................*....... + ldr q16, [x0, #-32] // ..................................................e....... + pmull2 v11.1q, v2.2d, v1.2d // ...................................................e...... + eor v12.16B, v9.16B, v28.16B // ...................................................*...... + str q15, [x2, #-32] // ....................................................*..... + pmull2 v13.1q, v3.2d, v8.2d // ....................................................e..... + eor v12.16B, v12.16B, v4.16B // .....................................................*.... + ldr q4, [x0, #-16] // .....................................................e.... + eor v15.8B, v10.8B, v2.8B // ......................................................e... + rev64 v10.16B, v16.16B // ......................................................e... + str q12, [x2, #-16] // .......................................................*.. + ldr q9, [x6] // .......................................................e.. + eor v12.16B, v13.16B, v11.16B // ........................................................e. + pmull2 v29.1q, v10.2d, v14.2d // ........................................................e. 
+ rev64 v13.16B, v4.16B // .........................................................e + pmull v14.1q, v10.1d, v14.1d // .........................................................e + + // ------------------------------------- cycle (expected) --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------ + // ldr q29, [x0], #(4*16) // ........e.............................'...........................~........................... + // rev32 v0.16b, v31.16b // ......................................'*...................................................... + // add v31.4s, v31.4s, v30.4s // ......................................'.*..................................................... + // aesr v0.16b, v18.16b // ......................................'..*.................................................... + // aesr v0.16b, v19.16b // ......................................'....*.................................................. + // aesr v0.16b, v20.16b // ......................................'......*................................................ + // aesr v0.16b, v21.16b // ......................................'.........*............................................. + // aesr v0.16b, v22.16b // ......................................'...........*........................................... + // aesr v0.16b, v23.16b // ......................................'.............*......................................... + // aesr v0.16b, v24.16b // ......................................'...............*....................................... + // aesr v0.16b, v25.16b // ......................................'.................*..................................... + // aesr v0.16b, v26.16b // ~.....................................'...................*................................... 
+ // aese v0.16b, v27.16b // ..~...................................'.....................*................................. + // eor v0.16b, v0.16b, v28.16b // ....~.................................'.......................*............................... + // eor v0.16b, v0.16b, v29.16b // ......~...............................'.........................*............................. + // str q0, [x2], #(4*16) // .........~............................'............................*.......................... + // ldr q12, [x6, #48] // ..........e...........................'.............................~......................... + // ldr q13, [x6, #80] // e.....................................'...................~................................... + // ldr q14, [x6, #64] // ..........................e...........'.............................................~......... + // rev64 v29.16b, v29.16b // ............e.........................'...............................~....................... + // eor v29.16b, v29.16b, v11.16b // .............................e........'................................................~...... + // pmull v8.1q, v29.1d, v13.1d // ......................................*....................................................... + // pmull2 v9.1q, v29.2d, v13.2d // ................................e.....'...................................................~... + // ext v11.16b, v29.16b, v29.16b, #8 // ......................................'....*.................................................. + // eor v11.16b, v11.16b, v29.16b // ......................................'......*................................................ + // pmull2 v10.1q, v11.2d, v14.2d // ......................................'........*.............................................. + // ldr q29, [x0, #(-3*16)] // ................e.....................'...................................~................... 
+ // rev32 v0.16b, v31.16b // ......................................'...*................................................... + // add v31.4s, v31.4s, v30.4s // ......................................'.......*............................................... + // aesr v0.16b, v18.16b // ......................................'.....*................................................. + // aesr v0.16b, v19.16b // ......................................'.......*............................................... + // aesr v0.16b, v20.16b // ......................................'..........*............................................ + // aesr v0.16b, v21.16b // ......................................'............*.......................................... + // aesr v0.16b, v22.16b // ......................................'..............*........................................ + // aesr v0.16b, v23.16b // ......................................'................*...................................... + // aesr v0.16b, v24.16b // ......................................'..................*.................................... + // aesr v0.16b, v25.16b // .~....................................'....................*.................................. + // aesr v0.16b, v26.16b // .....~................................'........................*.............................. + // aese v0.16b, v27.16b // ..........~...........................'.............................*......................... + // eor v0.16b, v0.16b, v28.16b // .............~........................'................................*...................... + // eor v0.16b, v0.16b, v29.16b // ...............~......................'..................................*.................... + // str q0, [x2, #(-3*16)] // ........................~.............'...........................................*........... 
+ // rev64 v29.16b, v29.16b // ....................e.................'.......................................~............... + // pmull v11.1q, v29.1d, v12.1d // ............................e.........'...............................................~....... + // eor v8.16b, v8.16b, v11.16b // ......................................'..*.................................................... + // pmull2 v11.1q, v29.2d, v12.2d // ...............................e......'..................................................~.... + // eor v9.16b, v9.16b, v11.16b // ....................................e.'....................................................... + // mov d11, v29.d[1] // ......................e...............'.........................................~............. + // eor v11.8b, v11.8b, v29.8b // ..................................e...'.....................................................~. + // pmull v11.1q, v11.1d, v14.1d // ......................................'...*................................................... + // eor v10.16b, v10.16b, v11.16b // ......................................'..........*............................................ + // ldr q29, [x0, #(-2*16)] // ..............................e.......'.................................................~..... + // rev32 v0.16b, v31.16b // ......................................'..............*........................................ + // add v31.4s, v31.4s, v30.4s // ......................................'...............*....................................... + // aesr v0.16b, v18.16b // ....~.................................'.......................*............................... + // aesr v0.16b, v19.16b // ........~.............................'...........................*........................... + // aesr v0.16b, v20.16b // ............~.........................'...............................*....................... 
+ // aesr v0.16b, v21.16b // ..............~.......................'.................................*..................... + // aesr v0.16b, v22.16b // ................~.....................'...................................*................... + // aesr v0.16b, v23.16b // ..................~...................'.....................................*................. + // aesr v0.16b, v24.16b // ....................~.................'.......................................*............... + // aesr v0.16b, v25.16b // ......................~...............'.........................................*............. + // aesr v0.16b, v26.16b // ........................~.............'...........................................*........... + // aese v0.16b, v27.16b // ..........................~...........'.............................................*......... + // eor v0.16b, v0.16b, v28.16b // ............................~.........'...............................................*....... + // eor v0.16b, v0.16b, v29.16b // ..............................~.......'.................................................*..... + // str q0, [x2, #(-2*16)] // ................................~.....'...................................................*... + // ldr q12, [x6] // ...................................e..'....................................................... + // ldr q13, [x6, #32] // .e....................................'....................~.................................. + // ldr q14, [x6, #16] // ......................................'..................*.................................... + // rev64 v29.16b, v29.16b // ..................................e...'.....................................................~. + // pmull v11.1q, v29.1d, v13.1d // .....................................e'....................................................... 
+ // eor v8.16b, v8.16b, v11.16b // ......................................'.....*................................................. + // pmull2 v11.1q, v29.2d, v13.2d // ....................................e.'....................................................... + // eor v9.16b, v9.16b, v11.16b // ......................................*....................................................... + // ext v11.16b, v29.16b, v29.16b, #8 // ......................................'...........*........................................... + // eor v11.16b, v11.16b, v29.16b // ......................................'.............*......................................... + // pmull2 v11.1q, v11.2d, v14.2d // ...~..................................'......................*................................ + // eor v10.16b, v10.16b, v11.16b // .....~................................'........................*.............................. + // ldr q29, [x0, #(-1*16)] // .................................e....'....................................................~.. + // rev32 v0.16b, v31.16b // ......................................'.................*..................................... + // add v31.4s, v31.4s, v30.4s // ..................~...................'.....................................*................. + // aesr v0.16b, v18.16b // .......~..............................'..........................*............................ + // aesr v0.16b, v19.16b // .........~............................'............................*.......................... + // aesr v0.16b, v20.16b // .............~........................'................................*...................... + // aesr v0.16b, v21.16b // ...............~......................'..................................*.................... + // aesr v0.16b, v22.16b // .................~....................'....................................*.................. 
+ // aesr v0.16b, v23.16b // ...................~..................'......................................*................ + // aesr v0.16b, v24.16b // .......................~..............'..........................................*............ + // aesr v0.16b, v25.16b // .........................~............'............................................*.......... + // aesr v0.16b, v26.16b // ...........................~..........'..............................................*........ + // aese v0.16b, v27.16b // .............................~........'................................................*...... + // eor v0.16b, v0.16b, v28.16b // ...............................~......'..................................................*.... + // eor v0.16b, v0.16b, v29.16b // .................................~....'....................................................*.. + // str q0, [x2, #(-1*16)] // ...................................~..'......................................................* + // rev64 v29.16b, v29.16b // .....................................e'....................................................... + // pmull v11.1q, v29.1d, v12.1d // ......................................'*...................................................... + // eor v8.16b, v8.16b, v11.16b // ......................................'................*...................................... + // pmull2 v11.1q, v29.2d, v12.2d // ......................................'.*..................................................... + // eor v9.16b, v9.16b, v11.16b // ......................................'........*.............................................. + // mov d11, v29.d[1] // ......................................'.........*............................................. + // eor v11.8b, v11.8b, v29.8b // ......................................'............*.......................................... 
+ // pmull v11.1q, v11.1d, v14.1d // ...........~..........................'..............................*........................ + // eor v10.16b, v10.16b, v11.16b // ..............~.......................'.................................*..................... + // eor v0.16b, v8.16b, v9.16b // ...~..................................'......................*................................ + // pmull v1.1q, v9.1d, v7.1d // ......~...............................'.........................*............................. + // ext v9.16b, v9.16b, v9.16b, #8 // .......~..............................'..........................*............................ + // eor v10.16b, v10.16b, v0.16b // .................~....................'....................................*.................. + // eor v1.16b, v9.16b, v1.16b // ...........~..........................'..............................*........................ + // eor v10.16b, v10.16b, v1.16b // ...................~..................'......................................*................ + // pmull v9.1q, v10.1d, v7.1d // .....................~................'........................................*.............. + // eor v8.16b, v8.16b, v9.16b // .......................~..............'..........................................*............ + // ext v10.16b, v10.16b, v10.16b, #8 // .....................~................'........................................*.............. + // eor v11.16b, v8.16b, v10.16b // .........................~............'............................................*.......... + // ext v11.16b, v11.16b, v11.16b, #8 // ...........................~..........'..............................................*........ 
+ + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_start_postamble:// end of loop kernel; the straight-line postamble below is SLOTHY output: its instruction order and register allocation are the schedule, so do not reorder it by hand. The commented-out listing that follows the code repeats the same operations in a different order and with different register names; each '*' marks the expected issue cycle of that instruction. + // Instructions: 93 + // Expected cycles: 55 (optimizer estimate, not a measurement) + // Expected IPC: 1.69 + // + // Cycle bound: 55.0 + // IPC bound: 1.69 + // + // Wall time: 2.91s (presumably the optimizer's own run time -- confirm) + // User time: 2.91s + // + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + pmull2 v1.1q, v13.2d, v9.2d // *...................................................... + eor v29.16B, v12.16B, v29.16B // *...................................................... + rev32 v2.16B, v31.16B // .*..................................................... + pmull v8.1q, v3.1d, v8.1d // .*..................................................... + pmull v12.1q, v15.1d, v6.1d // ..*.................................................... + add v15.4S, v31.4S, v30.4S // ..*.................................................... + aesr v2.16b, v18.16b // ...*................................................... + eor v8.16B, v8.16B, v17.16B // ...*................................................... + rev32 v31.16B, v15.16B // ....*.................................................. + pmull v17.1q, v13.1d, v9.1d // ....*.................................................. + ext v11.16B, v3.16B, v3.16B, #8 // .....*................................................. + aesr v2.16b, v19.16b // .....*................................................. + eor v8.16B, v8.16B, v14.16B // ......*................................................ + aesr v31.16b, v18.16b // ......*................................................ + eor v9.16B, v11.16B, v3.16B // .......*............................................... + aesr v2.16b, v20.16b // .......*............................................... + aesr v31.16b, v19.16b // ........*.............................................. + add v11.4S, v15.4S, v30.4S // ........*.............................................. 
+ eor v29.16B, v29.16B, v1.16B // .........*............................................. + pmull2 v9.1q, v9.2d, v6.2d // .........*............................................. + mov d6, v13.d[1] // ..........*............................................ + aesr v31.16b, v20.16b // ..........*............................................ + pmull v14.1q, v29.1d, v7.1d // ...........*........................................... + eor v3.16B, v9.16B, v12.16B // ...........*........................................... + aesr v31.16b, v21.16b // ............*.......................................... + ext v9.16B, v10.16B, v10.16B, #8 // ............*.......................................... + ldr q12, [x6, #16] // .............*......................................... (x6 is Htable; 16-byte entry at byte offset 16) + aesr v2.16b, v21.16b // .............*......................................... + aesr v31.16b, v22.16b // ..............*........................................ + eor v13.8B, v6.8B, v13.8B // ...............*....................................... + aesr v2.16b, v22.16b // ...............*....................................... + aesr v31.16b, v23.16b // ................*...................................... + eor v9.16B, v9.16B, v10.16B // ................*...................................... + rev32 v6.16B, v11.16B // .................*..................................... + pmull v15.1q, v13.1d, v12.1d // .................*..................................... + add v13.4S, v11.4S, v30.4S // ..................*.................................... + aesr v31.16b, v24.16b // ..................*.................................... + eor v10.16B, v8.16B, v17.16B // ...................*................................... + pmull2 v1.1q, v9.2d, v12.2d // ...................*................................... + rev32 v9.16B, v13.16B // ....................*.................................. + aesr v31.16b, v25.16b // ....................*.................................. 
+ aesr v2.16b, v23.16b // .....................*................................. + eor v17.16B, v10.16B, v29.16B // .....................*................................. + aesr v31.16b, v26.16b // ......................*................................ + eor v3.16B, v3.16B, v1.16B // ......................*................................ + aesr v6.16b, v18.16b // .......................*............................... + ext v8.16B, v29.16B, v29.16B, #8 // .......................*............................... + aese v31.16b, v27.16b // ........................*.............................. + aesr v6.16b, v19.16b // .........................*............................. + eor v14.16B, v8.16B, v14.16B // .........................*............................. + aesr v9.16b, v18.16b // ..........................*............................ + eor v31.16B, v31.16B, v28.16B // ..........................*............................ + eor v11.16B, v3.16B, v15.16B // ...........................*........................... + aesr v6.16b, v20.16b // ...........................*........................... + eor v8.16B, v31.16B, v0.16B // ............................*.......................... + aesr v9.16b, v19.16b // ............................*.......................... + eor v0.16B, v11.16B, v17.16B // .............................*......................... + aesr v2.16b, v24.16b // .............................*......................... + add v31.4S, v13.4S, v30.4S // ..............................*........................ + aesr v6.16b, v21.16b // ..............................*........................ + aesr v9.16b, v20.16b // ...............................*....................... + eor v14.16B, v0.16B, v14.16B // ...............................*....................... + aesr v6.16b, v22.16b // ................................*...................... + pmull v29.1q, v14.1d, v7.1d // .................................*..................... 
+ aesr v9.16b, v21.16b // ..................................*.................... + ext v14.16B, v14.16B, v14.16B, #8 // ..................................*.................... + aesr v6.16b, v23.16b // ...................................*................... + eor v12.16B, v10.16B, v29.16B // ...................................*................... + str q8, [x2, #16] // ....................................*.................. (x2 is output; stored before x2 is advanced, so this is byte offset 16 of the current 64-byte group) + aesr v9.16b, v22.16b // ....................................*.................. + eor v10.16B, v12.16B, v14.16B // .....................................*................. + aesr v2.16b, v25.16b // .....................................*................. + aesr v6.16b, v24.16b // ......................................*................ + aesr v9.16b, v23.16b // .......................................*............... + aesr v6.16b, v25.16b // ........................................*.............. + aesr v9.16b, v24.16b // .........................................*............. + aesr v6.16b, v26.16b // ..........................................*............ + aesr v9.16b, v25.16b // ...........................................*........... + aese v6.16b, v27.16b // ............................................*.......... + aesr v2.16b, v26.16b // .............................................*......... + ext v11.16B, v10.16B, v10.16B, #8 // .............................................*......... + aesr v9.16b, v26.16b // ..............................................*........ + eor v29.16B, v6.16B, v28.16B // ..............................................*........ + aese v2.16b, v27.16b // ...............................................*....... + eor v29.16B, v29.16B, v16.16B // ................................................*...... + aese v9.16b, v27.16b // ................................................*...... + eor v3.16B, v2.16B, v28.16B // .................................................*..... 
+ str q29, [x2, #32] // ..................................................*.... + eor v14.16B, v9.16B, v28.16B // ..................................................*.... + eor v12.16B, v3.16B, v5.16B // ...................................................*... + eor v29.16B, v14.16B, v4.16B // ....................................................*.. + str q12, [x2], #(4*16) // .....................................................*. (post-index: stores at offset 0, then x2 += 64) + str q29, [x2, #-16] // ......................................................* (x2 already advanced, so this is byte offset 48 of the same 64-byte group) + + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // pmull v1.1q, v3.1d, v8.1d // .*..................................................... + // eor v2.16B, v12.16B, v29.16B // *...................................................... + // rev32 v12.16B, v31.16B // .*..................................................... + // pmull v8.1q, v13.1d, v9.1d // ....*.................................................. + // pmull2 v29.1q, v13.2d, v9.2d // *...................................................... + // add v9.4S, v31.4S, v30.4S // ..*.................................................... + // aesr v12.16b, v18.16b // ...*................................................... + // eor v11.16B, v1.16B, v17.16B // ...*................................................... + // pmull v1.1q, v15.1d, v6.1d // ..*.................................................... + // rev32 v31.16B, v9.16B // ....*.................................................. + // aesr v12.16b, v19.16b // .....*................................................. + // ext v15.16B, v3.16B, v3.16B, #8 // .....*................................................. + // eor v14.16B, v11.16B, v14.16B // ......*................................................ + // aesr v31.16b, v18.16b // ......*................................................ 
+ // eor v3.16B, v15.16B, v3.16B // .......*............................................... + // aesr v12.16b, v20.16b // .......*............................................... + // add v11.4S, v9.4S, v30.4S // ........*.............................................. + // aesr v31.16b, v19.16b // ........*.............................................. + // eor v29.16B, v2.16B, v29.16B // .........*............................................. + // pmull2 v15.1q, v3.2d, v6.2d // .........*............................................. + // aesr v12.16b, v21.16b // .............*......................................... + // mov d3, v13.d[1] // ..........*............................................ + // eor v1.16B, v15.16B, v1.16B // ...........*........................................... + // aesr v31.16b, v20.16b // ..........*............................................ + // aesr v12.16b, v22.16b // ...............*....................................... + // ext v9.16B, v10.16B, v10.16B, #8 // ............*.......................................... + // aesr v31.16b, v21.16b // ............*.......................................... + // eor v3.8B, v3.8B, v13.8B // ...............*....................................... + // eor v10.16B, v9.16B, v10.16B // ................*...................................... + // aesr v12.16b, v23.16b // .....................*................................. + // rev32 v15.16B, v11.16B // .................*..................................... + // aesr v31.16b, v22.16b // ..............*........................................ + // aesr v12.16b, v24.16b // .............................*......................... + // add v11.4S, v11.4S, v30.4S // ..................*.................................... + // eor v17.16B, v14.16B, v8.16B // ...................*................................... + // aesr v31.16b, v23.16b // ................*...................................... 
+ // aesr v12.16b, v25.16b // .....................................*................. + // rev32 v9.16B, v11.16B // ....................*.................................. + // aesr v31.16b, v24.16b // ..................*.................................... + // ldr q13, [x6, #16] // .............*......................................... + // aesr v12.16b, v26.16b // .............................................*......... + // aesr v31.16b, v25.16b // ....................*.................................. + // aese v12.16b, v27.16b // ...............................................*....... + // pmull2 v6.1q, v10.2d, v13.2d // ...................*................................... + // eor v10.16B, v17.16B, v29.16B // .....................*................................. + // eor v12.16B, v12.16B, v28.16B // .................................................*..... + // aesr v15.16b, v18.16b // .......................*............................... + // eor v2.16B, v1.16B, v6.16B // ......................*................................ + // aesr v31.16b, v26.16b // ......................*................................ + // eor v1.16B, v12.16B, v5.16B // ...................................................*... + // pmull v12.1q, v29.1d, v7.1d // ...........*........................................... + // ext v6.16B, v29.16B, v29.16B, #8 // .......................*............................... + // aesr v9.16b, v18.16b // ..........................*............................ + // aesr v15.16b, v19.16b // .........................*............................. + // aesr v9.16b, v19.16b // ............................*.......................... + // str q1, [x2], #(4*16) // .....................................................*. + // aese v31.16b, v27.16b // ........................*.............................. + // pmull v29.1q, v3.1d, v13.1d // .................*..................................... 
+ // eor v3.16B, v6.16B, v12.16B // .........................*............................. + // aesr v15.16b, v20.16b // ...........................*........................... + // eor v6.16B, v31.16B, v28.16B // ..........................*............................ + // aesr v9.16b, v20.16b // ...............................*....................... + // eor v12.16B, v2.16B, v29.16B // ...........................*........................... + // aesr v15.16b, v21.16b // ..............................*........................ + // eor v29.16B, v6.16B, v0.16B // ............................*.......................... + // aesr v9.16b, v21.16b // ..................................*.................... + // aesr v15.16b, v22.16b // ................................*...................... + // eor v12.16B, v12.16B, v10.16B // .............................*......................... + // aesr v9.16b, v22.16b // ....................................*.................. + // add v31.4S, v11.4S, v30.4S // ..............................*........................ + // aesr v15.16b, v23.16b // ...................................*................... + // eor v3.16B, v12.16B, v3.16B // ...............................*....................... + // aesr v9.16b, v23.16b // .......................................*............... + // aesr v15.16b, v24.16b // ......................................*................ + // ext v11.16B, v3.16B, v3.16B, #8 // ..................................*.................... + // pmull v3.1q, v3.1d, v7.1d // .................................*..................... + // aesr v15.16b, v25.16b // ........................................*.............. + // eor v17.16B, v17.16B, v3.16B // ...................................*................... + // aesr v9.16b, v24.16b // .........................................*............. + // str q29, [x2, #-48] // ....................................*.................. 
+ // aesr v15.16b, v26.16b // ..........................................*............ + // eor v11.16B, v17.16B, v11.16B // .....................................*................. + // aesr v9.16b, v25.16b // ...........................................*........... + // aese v15.16b, v27.16b // ............................................*.......... + // ext v11.16B, v11.16B, v11.16B, #8 // .............................................*......... + // aesr v9.16b, v26.16b // ..............................................*........ + // eor v15.16B, v15.16B, v28.16B // ..............................................*........ + // aese v9.16b, v27.16b // ................................................*...... + // eor v15.16B, v15.16B, v16.16B // ................................................*...... + // eor v12.16B, v9.16B, v28.16B // ..................................................*.... + // str q15, [x2, #-32] // ..................................................*.... + // eor v12.16B, v12.16B, v4.16B // ....................................................*.. 
+ // str q12, [x2, #-16] // ......................................................* + + // (redundant "b Lloop_unrolled_start_end" dropped: the label is on the very next line, so execution falls through to it) +Lloop_unrolled_start_end: +Lloop_unrolled_end: + + load_htable_12 // macro defined outside this chunk; presumably (re)loads the H-table entries used by the 1x loop -- confirm + + cbz remainder, Lloop_1x_end // no leftover single blocks: skip the 1x loop +Lloop_1x_start: + + ldr plain_q, [input], #16 // fetch the next 16-byte input block, post-incrementing input + aes_full_block aes_st, plain, res + str res_q, [output], #16 // write the 16-byte result block, post-incrementing output + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start // one block per pass until remainder reaches 0 +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE // release the STACK_SIZE-byte frame the restore macros just read from + + ret + +#endif // __ARM_MAX_ARCH__ >= 8 +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif \ No newline at end of file diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_keep_htable.S new file mode 100644 index 00000000000..022a51fb464 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_keep_htable.S @@ -0,0 +1,1046 @@ +// Copyright (c) 2022, ARM Inc. + +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. + +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// Author: Hanno Becker + +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . + +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 // NOTE(review): same exported name as in the _basic.S variant added earlier in this patch; presumably only one variant is meant to be built at a time -- confirm +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments (symbolic names for the incoming registers x0-x6) +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 // x25 is among the registers spilled by save_gprs / reloaded by restore_gprs + +count .req x1 // same physical register as len_bits: writing count overwrites len_bits +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st0 .req v0 // v0-v3 are also named res0-res3 (and v0/v1 modulo_tmp0/modulo_tmp1) further down +aes_st0_q .req q0 +aes_st1 .req v1 +aes_st1_q .req q1 +aes_st2 .req v2 +aes_st2_q .req q2 +aes_st3 .req 
v3 +aes_st3_q .req q3 + +res0 .req v0 +res0_q .req q0 +res1 .req v1 +res1_q .req q1 +res2 .req v2 +res2_q .req q2 +res3 .req v3 +res3_q .req q3 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +Ht5q .req Ht3q +Ht6q .req Ht4q +Ht56q .req Ht34q + +Ht5 .req Ht3 +Ht6 .req Ht4 +Ht56 .req Ht34 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) /* six 16-byte slots, one per stp in save_gprs (x19..x30) */ +#define STACK_SIZE_VREGS (4*16) /* four 16-byte slots, one per stp in save_vregs (d8..d15) */ +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs /* stores only the d-views (low 64 bits) of v8-v15 */ + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, 
#(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + mov constant_temp, #0x100000000 // set up counter increment: the value 1 placed in the upper 32-bit half of a 64-bit word + movi rctr_inc.16b, #0x0 // clear every lane of the increment vector + fmov rctr_inc.d[1], constant_temp // rctr_inc.4s is now {0, 0, 0, 1} + + rev32 rtmp_ctr.16b, rtmp_ctr.16b // byte-swap each 32-bit word of the counter block just loaded +.endm + +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s // with rctr_inc as set up by load_iv, only 32-bit lane 3 changes (+1) +.endm + +.macro next_ctr_init_aes_st aes_st // \aes_st = current counter block, then advance the counter + rev32 \aes_st\().16b, rtmp_ctr.16b // undo the per-word byte swap applied by load_iv + aes_ctr_inc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse adjacent AESE+AESMC pairs +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_x4 data0, data1, data2, data3, key // one aesr with the same round key on four separate states + aesr \data0\(), \key\() + aesr \data1\(), \key\() + aesr \data2\(), \key\() + aesr \data3\(), \key\() +.endm + +.macro aesr_0_8 data, key // nine aesr rounds using \key\()0 .. \key\()8; \key is a register-name prefix + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_0_8_x4 data0, data1, data2, data3, key // four-state version of aesr_0_8 + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()0.16b + aesr_x4 \data0\().16b, 
\data1\().16b, \data2\().16b, \data3\().16b, \key\()1.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()2.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()3.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()4.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()5.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()6.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()7.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA: out = inA ^ inB ^ inC, built from two plain EORs; inA is overwritten with inA ^ inB +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out // final step: AESE with rk9 (no AESMC), then XOR with rk10 and \plain + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b // \aes_st is clobbered (it is eor3's inA) +.endm + +.macro load_round_key i // rk\i (q-view) = 16 bytes at key + 16*\i + ldr rk\()\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys // loads rk0..rk10 only. NOTE(review): aesr_11_12 refers to rk11/rk12, which have no .req alias in this part of the file, and aesr_9_10 applies AESMC after rk9 and rk10, unlike aesr_final; presumably both are unused in this 128-bit variant -- confirm + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation. +// The loaders below read 16 bytes at Htable byte offsets 0, 32, 48, 80 and 96; their \dst argument is unused, only \dst_q is written. + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm 
+ +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + ldr Ht56q, [Htable, #112] // loaded directly: no load_h56 macro is defined next to load_h12/load_h34. #112 is the slot between H5 (#96) and H6 (#128), mirroring H12 (#16) and H34 (#64); TODO confirm against the H-table generation +.endm + +/********************************************************************/ +/* Macros for GHASH update */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product (starts a new accumulation: ghash_lo/hi/mid are overwritten; input is modified in place) + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product: (high half of input ^ low half of input) times the low 64 bits of Hk_mid + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product (same as ghash_init_0, except that the middle product uses the high half of Hk_mid) + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product: (high half of input ^ low half of input) times the high 64 bits of Hk_mid + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product (XOR-accumulated into ghash_lo; ghash_tmp is scratch) + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] +
eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product (XOR-accumulated into ghash_lo; ghash_tmp is scratch; input is byte-reversed in place) + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product: (high half of input ^ low half of input) times the high 64 bits of Hk_mid + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag // folds ghash_hi/ghash_mid/ghash_lo with ghash_mod and writes the result to tag; clobbers ghash_hi, ghash_mid, ghash_lo, modulo_tmp0, modulo_tmp1 + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 // swap the two 64-bit halves of tag +.endm + +.macro prepare_loop_counts + mov unroll, #UNROLL + lsr full_blocks, byte_len, #4 // full_blocks = number of complete 16-byte blocks + udiv count, full_blocks, unroll // count = full_blocks / unroll; count is x1, the same register as len_bits, so len_bits is gone after this + msub remainder, count, unroll, full_blocks // remainder = full_blocks - count * unroll +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b // byte-reverse each 64-bit half +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction: 0xc2 ends up in the top byte of ghash_mod_d (presumably the D view of ghash_mod; confirm against the register aliases) + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9,
_BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_SIGN_LINK_REGISTER + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + + load_htable_34 + load_htable_12 + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end + // Instructions: 59 + // Expected cycles: 38 + // Expected IPC: 1.55 + // + // Cycle bound: 38.0 + // IPC bound: 1.55 + // + // Wall time: 0.45s + // User time: 0.45s + // + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + add v9.4S, v31.4S, v30.4S // *..................................... + rev32 v6.16B, v31.16B // *..................................... + ldr q5, [x0, #48] // .*.................................... + ldr q10, [x0], #(4*16) // .*.................................... + add v8.4S, v9.4S, v30.4S // ..*................................... + rev32 v2.16B, v9.16B // ..*................................... + aesr v6.16b, v18.16b // ...*.................................. + ldr q29, [x0, #-32] // ...*.................................. + rev32 v9.16B, v8.16B // ....*................................. + add v31.4S, v8.4S, v30.4S // ....*................................. + aesr v2.16b, v18.16b // .....*................................ + rev64 v8.16B, v10.16B // .....*................................ + aesr v9.16b, v18.16b // ......*............................... + rev32 v4.16B, v31.16B // ......*............................... + eor v8.16B, v8.16B, v11.16B // .......*.............................. + aesr v2.16b, v19.16b // .......*.............................. + aesr v4.16b, v18.16b // ........*............................. + rev64 v3.16B, v5.16B // ........*............................. + aesr v9.16b, v19.16b // .........*............................ 
+ rev64 v1.16B, v29.16B // .........*............................ + add v31.4S, v31.4S, v30.4S // ..........*........................... + aesr v4.16b, v19.16b // ..........*........................... + aesr v9.16b, v20.16b // ...........*.......................... + ldr q0, [x0, #-48] // ...........*.......................... + aesr v4.16b, v20.16b // ............*......................... + ext v11.16B, v1.16B, v1.16B, #8 // ............*......................... + aesr v9.16b, v21.16b // .............*........................ + aesr v4.16b, v21.16b // ..............*....................... + aesr v9.16b, v22.16b // ...............*...................... + aesr v4.16b, v22.16b // ................*..................... + aesr v9.16b, v23.16b // .................*.................... + aesr v4.16b, v23.16b // ..................*................... + aesr v2.16b, v20.16b // ...................*.................. + aesr v4.16b, v24.16b // ....................*................. + aesr v9.16b, v24.16b // .....................*................ + aesr v4.16b, v25.16b // ......................*............... + aesr v2.16b, v21.16b // .......................*.............. + aesr v4.16b, v26.16b // ........................*............. + aesr v9.16b, v25.16b // .........................*............ + aese v4.16b, v27.16b // ..........................*........... + aesr v2.16b, v22.16b // ...........................*.......... + aesr v9.16b, v26.16b // ............................*......... + eor v4.16B, v4.16B, v28.16B // ............................*......... + aesr v2.16b, v23.16b // .............................*........ + eor v4.16B, v4.16B, v5.16B // ..............................*....... + aese v9.16b, v27.16b // ..............................*....... + eor v5.16B, v11.16B, v1.16B // ...............................*...... + aesr v2.16b, v24.16b // ...............................*...... + aesr v6.16b, v19.16b // ................................*..... 
+ eor v9.16B, v9.16B, v28.16B // ................................*..... + aesr v2.16b, v25.16b // .................................*.... + ext v11.16B, v8.16B, v8.16B, #8 // .................................*.... + eor v9.16B, v9.16B, v29.16B // ..................................*... + aesr v6.16b, v20.16b // ..................................*... + str q4, [x2, #48] // ...................................*.. + aesr v2.16b, v26.16b // ...................................*.. + str q9, [x2, #32] // ....................................*. + pmull v4.1q, v8.1d, v16.1d // ....................................*. + aese v2.16b, v27.16b // .....................................* + + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + // add V.4S, v31.4S, v30.4S // *..................................... + // add V.4S, V.4S, v30.4S // ..*................................... + // rev32 V.16B, V.16B // ....*................................. + // add V.4S, V.4S, v30.4S // ....*................................. + // aesr V.16b, v18.16b // ......*............................... + // rev32 V.16B, V.16B // ......*............................... + // aesr V.16b, v19.16b // .........*............................ + // aesr V.16b, v18.16b // ........*............................. + // aesr V.16b, v20.16b // ...........*.......................... + // aesr V.16b, v19.16b // ..........*........................... + // rev32 v2.16B, V.16B // ..*................................... + // aesr V.16b, v21.16b // .............*........................ + // aesr V.16b, v20.16b // ............*......................... + // aesr v2.16b, v18.16b // .....*................................ + // aesr V.16b, v22.16b // ...............*...................... + // aesr V.16b, v21.16b // ..............*....................... + // aesr V.16b, v22.16b // ................*..................... + // aesr v2.16b, v19.16b // .......*.............................. 
+ // aesr V.16b, v23.16b // ..................*................... + // ldr Q, [x0, #48] // .*.................................... + // aesr V.16b, v23.16b // .................*.................... + // aesr V.16b, v24.16b // ....................*................. + // ldr Q, [x0, #32] // ...*.................................. + // aesr v2.16b, v20.16b // ...................*.................. + // rev64 v3.16B, V.16B // ........*............................. + // aesr V.16b, v25.16b // ......................*............... + // rev32 v6.16B, v31.16B // *..................................... + // aesr V.16b, v24.16b // .....................*................ + // aesr V.16b, v26.16b // ........................*............. + // rev64 v1.16B, V.16B // .........*............................ + // add v31.4S, V.4S, v30.4S // ..........*........................... + // aesr v2.16b, v21.16b // .......................*.............. + // aese V.16b, v27.16b // ..........................*........... + // ldr q10, [x0], #(4*16) // .*.................................... + // aesr v2.16b, v22.16b // ...........................*.......... + // ldr q0, [x0, #-48] // ...........*.......................... + // aesr V.16b, v25.16b // .........................*............ + // eor V.16B, V.16B, v28.16B // ............................*......... + // aesr v2.16b, v23.16b // .............................*........ + // eor V.16B, V.16B, V.16B // ..............................*....... + // aesr V.16b, v26.16b // ............................*......... + // aesr v2.16b, v24.16b // ...............................*...... + // rev64 V.16B, v10.16B // .....*................................ + // str Q, [x2, #48] // ...................................*.. + // aese V.16b, v27.16b // ..............................*....... + // aesr v6.16b, v18.16b // ...*.................................. + // ext V.16B, v1.16B, v1.16B, #8 // ............*......................... 
+ // aesr v2.16b, v25.16b // .................................*.... + // eor V.16B, V.16B, v28.16B // ................................*..... + // eor v8.16B, V.16B, v11.16B // .......*.............................. + // aesr v6.16b, v19.16b // ................................*..... + // eor V.16B, V.16B, V.16B // ..................................*... + // aesr v2.16b, v26.16b // ...................................*.. + // pmull v4.1q, v8.1d, v16.1d // ....................................*. + // ext v11.16B, v8.16B, v8.16B, #8 // .................................*.... + // str Q, [x2, #32] // ....................................*. + // aese v2.16b, v27.16b // .....................................* + // eor v5.16B, V.16B, v1.16B // ...............................*...... + // aesr v6.16b, v20.16b // ..................................*... + + sub count, count, #1 +cbz count, Lloop_unrolled_start_postamble +Lloop_unrolled_start: + // Instructions: 109 + // Expected cycles: 55 + // Expected IPC: 1.98 + // + // Cycle bound: 54.0 + // IPC bound: 2.02 + // + // Wall time: 3601.77s + // User time: 3601.77s + // + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + eor v9.16B, v2.16B, v28.16B // *...................................................... + pmull v2.1q, v1.1d, v13.1d // *...................................................... + eor v29.16B, v11.16B, v8.16B // .*..................................................... + rev64 v11.16B, v0.16B // .*..................................................... + eor v9.16B, v9.16B, v0.16B // ..*.................................................... + pmull2 v0.1q, v8.2d, v16.2d // ..*.................................................... + pmull2 v8.1q, v1.2d, v13.2d // ...*................................................... + mov d1, v11.d[1] // ...*................................................... 
+ str q9, [x2, #16] // ....*.................................................. + pmull v9.1q, v11.1d, v15.1d // ....*.................................................. + eor v1.8B, v1.8B, v11.8B // .....*................................................. + pmull2 v11.1q, v11.2d, v15.2d // .....*................................................. + aesr v6.16b, v21.16b // ......*................................................ + eor v9.16B, v4.16B, v9.16B // ......*................................................ + pmull2 v29.1q, v29.2d, v17.2d // .......*............................................... + eor v4.16B, v0.16B, v11.16B // .......*............................................... + pmull v0.1q, v1.1d, v17.1d // ........*.............................................. + add v11.4S, v31.4S, v30.4S // ........e.............................................. + eor v1.16B, v4.16B, v8.16B // .........*............................................. + aesr v6.16b, v22.16b // .........*............................................. + eor v8.16B, v9.16B, v2.16B // ..........*............................................ + pmull2 v9.1q, v5.2d, v14.2d // ..........*............................................ + aesr v6.16b, v23.16b // ...........*........................................... + mov d2, v3.d[1] // ...........*........................................... + eor v5.16B, v29.16B, v0.16B // ............*.......................................... + pmull v0.1q, v3.1d, v12.1d // ............*.......................................... + pmull2 v4.1q, v3.2d, v12.2d // .............*......................................... + eor v3.8B, v2.8B, v3.8B // .............*......................................... + aesr v6.16b, v24.16b // ..............*........................................ + add v29.4S, v11.4S, v30.4S // ..............e........................................ + pmull v3.1q, v3.1d, v14.1d // ...............*....................................... 
+ eor v2.16B, v1.16B, v4.16B // ...............*....................................... + eor v4.16B, v5.16B, v9.16B // ................*...................................... + aesr v6.16b, v25.16b // ................*...................................... + rev32 v9.16B, v29.16B // .................e..................................... + pmull v1.1q, v2.1d, v7.1d // .................*..................................... + add v29.4S, v29.4S, v30.4S // ..................e.................................... + aesr v6.16b, v26.16b // ..................*.................................... + eor v0.16B, v8.16B, v0.16B // ...................*................................... + aesr v9.16b, v18.16b // ...................e................................... + aese v6.16b, v27.16b // ....................*.................................. + rev32 v8.16B, v29.16B // ....................e.................................. + aesr v9.16b, v19.16b // .....................e................................. + eor v5.16B, v0.16B, v2.16B // .....................*................................. + ext v2.16B, v2.16B, v2.16B, #8 // ......................*................................ + aesr v8.16b, v18.16b // ......................e................................ + eor v3.16B, v4.16B, v3.16B // .......................*............................... + aesr v9.16b, v20.16b // .......................e............................... + eor v1.16B, v2.16B, v1.16B // ........................*.............................. + aesr v8.16b, v19.16b // ........................e.............................. + rev32 v2.16B, v11.16B // .........................e............................. + aesr v9.16b, v21.16b // .........................e............................. + eor v3.16B, v3.16B, v5.16B // ..........................*............................ + aesr v8.16b, v20.16b // ..........................e............................ 
+ eor v6.16B, v6.16B, v28.16B // ...........................*........................... + aesr v2.16b, v18.16b // ...........................e........................... + eor v1.16B, v3.16B, v1.16B // ............................*.......................... + aesr v9.16b, v22.16b // ............................e.......................... + aesr v8.16b, v21.16b // .............................e......................... + eor v10.16B, v6.16B, v10.16B // .............................*......................... + pmull v5.1q, v1.1d, v7.1d // ..............................*........................ + ext v4.16B, v1.16B, v1.16B, #8 // ..............................*........................ + str q10, [x2], #(4*16) // ...............................*....................... + aesr v8.16b, v22.16b // ...............................e....................... + eor v11.16B, v0.16B, v5.16B // ................................*...................... + aesr v2.16b, v19.16b // ................................e...................... + aesr v8.16b, v23.16b // .................................e..................... + ldr q5, [x0, #48] // .................................e..................... + eor v10.16B, v11.16B, v4.16B // ..................................*.................... + aesr v9.16b, v23.16b // ..................................e.................... + aesr v8.16b, v24.16b // ...................................e................... + ldr q4, [x0, #32] // ...................................e................... + aesr v2.16b, v20.16b // ....................................e.................. + ext v11.16B, v10.16B, v10.16B, #8 // ....................................*.................. + rev64 v3.16B, v5.16B // .....................................e................. + aesr v8.16b, v25.16b // .....................................e................. + rev32 v6.16B, v31.16B // ......................................e................ 
+ aesr v9.16b, v24.16b // ......................................e................ + aesr v8.16b, v26.16b // .......................................e............... + rev64 v1.16B, v4.16B // .......................................e............... + add v31.4S, v29.4S, v30.4S // ........................................e.............. + aesr v2.16b, v21.16b // ........................................e.............. + aese v8.16b, v27.16b // .........................................e............. + ldr q10, [x0], #(4*16) // .........................................e............. + aesr v2.16b, v22.16b // ..........................................e............ + ldr q0, [x0, #-48] // ..........................................e............ + aesr v9.16b, v25.16b // ...........................................e........... + eor v8.16B, v8.16B, v28.16B // ...........................................e........... + aesr v2.16b, v23.16b // ............................................e.......... + eor v8.16B, v8.16B, v5.16B // .............................................e......... + aesr v9.16b, v26.16b // .............................................e......... + aesr v2.16b, v24.16b // ..............................................e........ + rev64 v29.16B, v10.16B // ..............................................e........ + str q8, [x2, #48] // ...............................................e....... + aese v9.16b, v27.16b // ...............................................e....... + aesr v6.16b, v18.16b // ................................................e...... + ext v5.16B, v1.16B, v1.16B, #8 // ................................................e...... + aesr v2.16b, v25.16b // .................................................e..... + eor v9.16B, v9.16B, v28.16B // .................................................e..... + eor v8.16B, v29.16B, v11.16B // ..................................................e.... 
+ aesr v6.16b, v19.16b // ..................................................e.... + eor v9.16B, v9.16B, v4.16B // ...................................................e... + aesr v2.16b, v26.16b // ...................................................e... + pmull v4.1q, v8.1d, v16.1d // ....................................................e.. + ext v11.16B, v8.16B, v8.16B, #8 // ....................................................e.. + str q9, [x2, #32] // .....................................................e. + aese v2.16b, v27.16b // .....................................................e. + eor v5.16B, v5.16B, v1.16B // ......................................................e + aesr v6.16b, v20.16b // ......................................................e + + // -------------------------------- cycle (expected) ---------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------- + // rev32 v0.16b, v31.16b // ..............................e................'.................................... + // add v31.4s, v31.4s, v30.4s // e..............................................'.......~............................ + // rev32 v1.16b, v31.16b // .................e.............................'........................~........... + // add v31.4s, v31.4s, v30.4s // ......e........................................'.............~...................... + // rev32 v2.16b, v31.16b // .........e.....................................'................~................... + // add v31.4s, v31.4s, v30.4s // ..........e....................................'.................~.................. + // rev32 v3.16b, v31.16b // ............e..................................'...................~................ + // add v31.4s, v31.4s, v30.4s // ................................e..............'.................................... 
+ // aesr v0.16b, v18.16b // ........................................e......'.................................... + // aesr v0.16b, v19.16b // ..........................................e....'.................................... + // aesr v0.16b, v20.16b // ..............................................e'.................................... + // aesr v0.16b, v21.16b // ...............................................'.....*.............................. + // aesr v0.16b, v22.16b // .~.............................................'........*........................... + // aesr v0.16b, v23.16b // ...~...........................................'..........*......................... + // aesr v0.16b, v24.16b // ......~........................................'.............*...................... + // aesr v0.16b, v25.16b // ........~......................................'...............*.................... + // aesr v0.16b, v26.16b // ..........~....................................'.................*.................. + // aesr v1.16b, v18.16b // ...................e...........................'..........................~......... + // aesr v1.16b, v19.16b // ........................e......................'...............................~.... + // aesr v1.16b, v20.16b // ............................e..................'.................................... + // aesr v1.16b, v21.16b // ................................e..............'.................................... + // aesr v1.16b, v22.16b // ..................................e............'.................................... + // aesr v1.16b, v23.16b // ....................................e..........'.................................... + // aesr v1.16b, v24.16b // ......................................e........'.................................... + // aesr v1.16b, v25.16b // .........................................e.....'.................................... 
+ // aesr v1.16b, v26.16b // ...........................................e...'.................................... + // aesr v2.16b, v18.16b // ...........e...................................'..................~................. + // aesr v2.16b, v19.16b // .............e.................................'....................~............... + // aesr v2.16b, v20.16b // ...............e...............................'......................~............. + // aesr v2.16b, v21.16b // .................e.............................'........................~........... + // aesr v2.16b, v22.16b // ....................e..........................'...........................~........ + // aesr v2.16b, v23.16b // ..........................e....................'.................................~.. + // aesr v2.16b, v24.16b // ..............................e................'.................................... + // aesr v2.16b, v25.16b // ...................................e...........'.................................... + // aesr v2.16b, v26.16b // .....................................e.........'.................................... + // aesr v3.16b, v18.16b // ..............e................................'.....................~.............. + // aesr v3.16b, v19.16b // ................e..............................'.......................~............ + // aesr v3.16b, v20.16b // ..................e............................'.........................~.......... + // aesr v3.16b, v21.16b // .....................e.........................'............................~....... + // aesr v3.16b, v22.16b // .......................e.......................'..............................~..... + // aesr v3.16b, v23.16b // .........................e.....................'................................~... + // aesr v3.16b, v24.16b // ...........................e...................'..................................~. 
+ // aesr v3.16b, v25.16b // .............................e.................'.................................... + // aesr v3.16b, v26.16b // ...............................e...............'.................................... + // ldr q29, [x0], #(4*16) // .................................e.............'.................................... + // aese v0.16b, v27.16b // ............~..................................'...................*................ + // eor v0.16b, v0.16b, v28.16b // ...................~...........................'..........................*......... + // eor v0.16b, v0.16b, v29.16b // .....................~.........................'............................*....... + // str q0, [x2], #(4*16) // .......................~.......................'..............................*..... + // rev64 v29.16b, v29.16b // ......................................e........'.................................... + // eor v29.16b, v29.16b, v11.16b // ..........................................e....'.................................... + // pmull v8.1q, v29.1d, v16.1d // ............................................e..'.................................... + // pmull2 v9.1q, v29.2d, v16.2d // ...............................................'.*.................................. + // ext v11.16b, v29.16b, v29.16b, #8 // ............................................e..'.................................... + // eor v11.16b, v11.16b, v29.16b // ...............................................'*................................... + // pmull2 v10.1q, v11.2d, v17.2d // ...............................................'......*............................. + // ldr q29, [x0, #(-3*16)] // ..................................e............'.................................... + // aese v1.16b, v27.16b // .............................................e.'.................................... 
+ // eor v1.16b, v1.16b, v28.16b // ...............................................*.................................... + // eor v1.16b, v1.16b, v29.16b // ...............................................'.*.................................. + // str q1, [x2, #(-3*16)] // ...............................................'...*................................ + // rev64 v29.16b, v29.16b // ...............................................'*................................... + // pmull v11.1q, v29.1d, v15.1d // ...............................................'...*................................ + // eor v8.16b, v8.16b, v11.16b // ...............................................'.....*.............................. + // pmull2 v11.1q, v29.2d, v15.2d // ...............................................'....*............................... + // eor v9.16b, v9.16b, v11.16b // ...............................................'......*............................. + // mov d11, v29.d[1] // ...............................................'..*................................. + // eor v11.8b, v11.8b, v29.8b // ...............................................'....*............................... + // pmull v11.1q, v11.1d, v17.1d // ~..............................................'.......*............................ + // eor v10.16b, v10.16b, v11.16b // ....~..........................................'...........*........................ + // ldr q29, [x0, #(-2*16)] // ...........................e...................'..................................~. + // aese v2.16b, v27.16b // .......................................e.......'.................................... + // eor v2.16b, v2.16b, v28.16b // .........................................e.....'.................................... + // eor v2.16b, v2.16b, v29.16b // ...........................................e...'.................................... 
+ // str q2, [x2, #(-2*16)] // .............................................e.'.................................... + // rev64 v29.16b, v29.16b // ...............................e...............'.................................... + // pmull v11.1q, v29.1d, v13.1d // ...............................................*.................................... + // eor v8.16b, v8.16b, v11.16b // ..~............................................'.........*.......................... + // pmull2 v11.1q, v29.2d, v13.2d // ...............................................'..*................................. + // eor v9.16b, v9.16b, v11.16b // .~.............................................'........*........................... + // ext v11.16b, v29.16b, v29.16b, #8 // ........................................e......'.................................... + // eor v11.16b, v11.16b, v29.16b // ..............................................e'.................................... + // pmull2 v11.1q, v11.2d, v14.2d // ..~............................................'.........*.......................... + // eor v10.16b, v10.16b, v11.16b // ........~......................................'...............*.................... + // ldr q29, [x0, #(-1*16)] // .........................e.....................'................................~... + // aese v3.16b, v27.16b // .................................e.............'.................................... + // eor v3.16b, v3.16b, v28.16b // ...................................e...........'.................................... + // eor v3.16b, v3.16b, v29.16b // .....................................e.........'.................................... + // str q3, [x2, #(-1*16)] // .......................................e.......'.................................... + // rev64 v29.16b, v29.16b // .............................e.................'.................................... 
+ // pmull v11.1q, v29.1d, v12.1d // ....~..........................................'...........*........................ + // eor v8.16b, v8.16b, v11.16b // ...........~...................................'..................*................. + // pmull2 v11.1q, v29.2d, v12.2d // .....~.........................................'............*....................... + // eor v9.16b, v9.16b, v11.16b // .......~.......................................'..............*..................... + // mov d11, v29.d[1] // ...~...........................................'..........*......................... + // eor v11.8b, v11.8b, v29.8b // .....~.........................................'............*....................... + // pmull v11.1q, v11.1d, v14.1d // .......~.......................................'..............*..................... + // eor v10.16b, v10.16b, v11.16b // ...............~...............................'......................*............. + // eor v0.16b, v8.16b, v9.16b // .............~.................................'....................*............... + // pmull v1.1q, v9.1d, v7.1d // .........~.....................................'................*................... + // ext v9.16b, v9.16b, v9.16b, #8 // ..............~................................'.....................*.............. + // eor v10.16b, v10.16b, v0.16b // ..................~............................'.........................*.......... + // eor v1.16b, v9.16b, v1.16b // ................~..............................'.......................*............ + // eor v10.16b, v10.16b, v1.16b // ....................~..........................'...........................*........ + // pmull v9.1q, v10.1d, v7.1d // ......................~........................'.............................*...... + // eor v8.16b, v8.16b, v9.16b // ........................~......................'...............................*.... 
+ // ext v10.16b, v10.16b, v10.16b, #8 // ......................~........................'.............................*...... + // eor v11.16b, v8.16b, v10.16b // ..........................~....................'.................................*.. + // ext v11.16b, v11.16b, v11.16b, #8 // ............................~..................'...................................* + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_start_postamble:// end of loop kernel + // Instructions: 50 + // Expected cycles: 30 + // Expected IPC: 1.67 + // + // Cycle bound: 30.0 + // IPC bound: 1.67 + // + // Wall time: 706.38s + // User time: 706.38s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + eor v29.16B, v2.16B, v28.16B // *............................. + pmull2 v2.1q, v1.2d, v13.2d // *............................. + pmull v1.1q, v1.1d, v13.1d // .*............................ + rev64 v9.16B, v0.16B // .*............................ + aesr v6.16b, v21.16b // ..*........................... + eor v0.16B, v29.16B, v0.16B // ..*........................... + eor v11.16B, v11.16B, v8.16B // ...*.......................... + pmull v29.1q, v9.1d, v15.1d // ...*.......................... + str q0, [x2, #16] // ....*......................... + pmull2 v0.1q, v9.2d, v15.2d // ....*......................... + pmull2 v8.1q, v8.2d, v16.2d // .....*........................ + eor v29.16B, v4.16B, v29.16B // .....*........................ + pmull2 v4.1q, v11.2d, v17.2d // ......*....................... + mov d11, v9.d[1] // ......*....................... + eor v29.16B, v29.16B, v1.16B // .......*...................... + pmull v1.1q, v3.1d, v12.1d // .......*...................... + eor v11.8B, v11.8B, v9.8B // ........*..................... + eor v8.16B, v8.16B, v0.16B // ........*..................... + aesr v6.16b, v22.16b // .........*.................... + eor v9.16B, v29.16B, v1.16B // .........*.................... 
+ pmull v1.1q, v11.1d, v17.1d // ..........*................... + eor v29.16B, v8.16B, v2.16B // ..........*................... + pmull2 v0.1q, v3.2d, v12.2d // ...........*.................. + mov d2, v3.d[1] // ...........*.................. + pmull2 v11.1q, v5.2d, v14.2d // ............*................. + eor v8.16B, v4.16B, v1.16B // ............*................. + eor v29.16B, v29.16B, v0.16B // .............*................ + eor v2.8B, v2.8B, v3.8B // .............*................ + aesr v6.16b, v23.16b // ..............*............... + eor v8.16B, v8.16B, v11.16B // ..............*............... + eor v0.16B, v9.16B, v29.16B // ...............*.............. + pmull v2.1q, v2.1d, v14.1d // ...............*.............. + aesr v6.16b, v24.16b // ................*............. + ext v4.16B, v29.16B, v29.16B, #8 // ................*............. + eor v5.16B, v8.16B, v2.16B // .................*............ + pmull v3.1q, v29.1d, v7.1d // .................*............ + aesr v6.16b, v25.16b // ..................*........... + eor v5.16B, v5.16B, v0.16B // ...................*.......... + eor v8.16B, v4.16B, v3.16B // ...................*.......... + aesr v6.16b, v26.16b // ....................*......... + eor v8.16B, v5.16B, v8.16B // .....................*........ + aese v6.16b, v27.16b // ......................*....... + pmull v5.1q, v8.1d, v7.1d // .......................*...... + eor v29.16B, v6.16B, v28.16B // ........................*..... + eor v9.16B, v9.16B, v5.16B // .........................*.... + ext v8.16B, v8.16B, v8.16B, #8 // .........................*.... + eor v29.16B, v29.16B, v10.16B // ..........................*... + eor v10.16B, v9.16B, v8.16B // ...........................*.. + str q29, [x2], #(4*16) // ............................*. 
+ ext v11.16B, v10.16B, v10.16B, #8 // .............................* + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // eor V.16B, v2.16B, v28.16B // *.............................. + // pmull V.1q, v1.1d, v13.1d // .*............................. + // eor V.16B, v11.16B, v8.16B // ...*........................... + // rev64 V.16B, v0.16B // .*............................. + // eor V.16B, V.16B, v0.16B // ..*............................ + // pmull2 V.1q, v8.2d, v16.2d // .....*......................... + // pmull2 V.1q, v1.2d, v13.2d // *.............................. + // mov D, V.d[1] // ......*........................ + // str Q, [x2, #16] // ....*.......................... + // pmull V.1q, V.1d, v15.1d // ...*........................... + // eor V.8B, V.8B, V.8B // ........*...................... + // pmull2 V.1q, V.2d, v15.2d // ....*.......................... + // aesr v6.16b, v21.16b // ..*............................ + // eor V.16B, v4.16B, V.16B // .....*......................... + // pmull2 V.1q, V.2d, v17.2d // ......*........................ + // eor V.16B, V.16B, V.16B // ........*...................... + // pmull V.1q, V.1d, v17.1d // ..........*.................... + // eor V.16B, V.16B, V.16B // ..........*.................... + // aesr v6.16b, v22.16b // .........*..................... + // eor V.16B, V.16B, V.16B // .......*....................... + // pmull2 V.1q, v5.2d, v14.2d // ............*.................. + // aesr v6.16b, v23.16b // ..............*................ + // mov D, v3.d[1] // ...........*................... + // eor V.16B, V.16B, V.16B // ............*.................. + // pmull V.1q, v3.1d, v12.1d // .......*....................... + // pmull2 V.1q, v3.2d, v12.2d // ...........*................... + // eor V.8B, V.8B, v3.8B // .............*................. + // aesr v6.16b, v24.16b // ................*.............. + // pmull V.1q, V.1d, v14.1d // ...............*............... 
+ // eor V.16B, V.16B, V.16B // .............*................. + // eor V.16B, V.16B, V.16B // ..............*................ + // aesr v6.16b, v25.16b // ..................*............ + // pmull V.1q, V.1d, v7.1d // .................*............. + // aesr v6.16b, v26.16b // ....................*.......... + // eor V.16B, V.16B, V.16B // .........*..................... + // aese v6.16b, v27.16b // ......................*........ + // eor V.16B, V.16B, V.16B // ...............*............... + // ext V.16B, V.16B, V.16B, #8 // ................*.............. + // eor V.16B, V.16B, V.16B // .................*............. + // eor V.16B, V.16B, V.16B // ...................*........... + // eor V.16B, V.16B, V.16B // ...................*........... + // eor V.16B, v6.16B, v28.16B // ........................*...... + // eor V.16B, V.16B, V.16B // .....................*......... + // eor V.16B, V.16B, v10.16B // ..........................*.... + // pmull V.1q, V.1d, v7.1d // .......................*....... + // ext V.16B, V.16B, V.16B, #8 // .........................*..... + // str Q, [x2], #(4*16) // ............................*.. + // eor V.16B, V.16B, V.16B // .........................*..... + // eor V.16B, V.16B, V.16B // ...........................*... + // ext v11.16B, V.16B, V.16B, #8 // .............................*. 
+ + b Lloop_unrolled_start_end +Lloop_unrolled_start_end: +Lloop_unrolled_end: + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + next_ctr_init_aes_st aes_st0 + aesr_0_8 aes_st0, rk + + ldr plain_q, [input], #16 + aesr_final aes_st0, plain, res0 + str res0_q, [output], #16 + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https: // www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif \ No newline at end of file diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem2.S new file mode 100644 index 00000000000..620fab51c2b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem2.S @@ -0,0 +1,1517 @@ +// Copyright (c) 2022, ARM Inc. + +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. + +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+// Author: Hanno Becker
+
+// This file was derived from the assembly generated from aesv8-gcm-armv8.pl,
+// written by Fangming Fang for the OpenSSL project,
+// and derived from https://github.com/ARM-software/AArch64cryptolib, original
+// author Samuel Lee .
+
+// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing
+// the logic of the computation. It is meant as the input to manual audits /
+// formal verification, as well as automated micro-optimization such as done
+// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy).
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#include
+
+#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__)
+#if defined(__ELF__)
+#include
+#include
+.arch armv8-a+crypto
+.text
+.globl aes_gcm_dec_kernel_slothy_base_128
+.hidden aes_gcm_dec_kernel_slothy_base_128
+.type aes_gcm_dec_kernel_slothy_base_128,%function
+#elif defined(__APPLE__)
+#if defined(BORINGSSL_PREFIX)
+#include
+#endif
+#include
+.text
+.globl _aes_gcm_dec_kernel_slothy_base_128
+.private_extern _aes_gcm_dec_kernel_slothy_base_128
+#else
+#error Unknown configuration
+#endif
+
+#if __ARM_MAX_ARCH__ >= 8
+
+// Arguments (symbolic names for the incoming registers x0-x6)
+input .req x0                // base register for the data loads in the kernel
+len_bits .req x1             // length in bits; the preamble computes byte_len = len_bits >> 3
+output .req x2               // base register for the data stores in the kernel
+tag_ptr .req x3              // address that load_tag reads the 16-byte tag from
+ivec .req x4                 // address that load_iv reads the 16-byte IV/counter block from
+key .req x5                  // base address of the round keys (load_round_key reads key + 16*i)
+Htable .req x6               // base address of the precomputed H-table (load_h* macros)
+
+byte_len .req x15            // len_bits >> 3
+
+constant_temp .req x25
+
+count .req x1                // NOTE: same register as len_bits; written by prepare_loop_counts
+full_blocks .req x7          // byte_len >> 4
+remainder .req x9            // blocks left over for the one-block tail loop
+unroll .req x10              // holds the constant UNROLL as udiv/msub operand
+
+ctr_tmp .req x14             // 64-bit view of the scratch below
+ctr_tmp_w .req w14           // scratch for the byte-reversed counter in next_ctr_init_aes
+
+ivec_0_63 .req x11           // first doubleword loaded from [ivec]
+ivec_64_96 .req x12          // second doubleword loaded from [ivec]
+ivec_64_96_w .req w12        // low 32 bits of the above
+
+ctr .req w13                 // running 32-bit counter, byte-reversed relative to its in-memory form
+ctr_x .req x13               // 64-bit view of ctr (w13)
+
+aes_st .req v0
+aes_st_q .req q0
+aes_st_d .req d0
+
+res .req v0                  // NOTE: same register as aes_st
+res_q .req q0
+
+ghash_hi .req v9             // GHASH accumulator: high product
+ghash_lo .req v8             // GHASH accumulator: low product
+ghash_mid .req v10           // GHASH accumulator: middle product
+ghash_mid_d .req d10
+
+ghash_tmp .req v11           // NOTE: same register as tag (v11)
+ghash_tmp_d .req d11
+
+ghash_mod .req v7            // reduction constant, set up by prepare_ghash
+ghash_mod_d .req d7
+
+modulo_tmp0 .req v0          // NOTE: same register as aes_st / res
+modulo_tmp1 .req v1
+
+Ht1q .req q12
+Ht2q .req q13
+Ht12q .req q14
+
+Ht1 .req v12
+Ht2 .req v13
+Ht12 .req v14
+
+Ht3q .req Ht1q               // Ht3/Ht4/Ht34 reuse the Ht1/Ht2/Ht12 registers
+Ht4q .req Ht2q
+Ht34q .req Ht12q
+
+Ht3 .req Ht1
+Ht4 .req Ht2
+Ht34 .req Ht12
+
+Ht5q .req Ht1q               // Ht5/Ht6/Ht56 reuse the Ht1/Ht2/Ht12 registers
+Ht6q .req Ht2q
+Ht56q .req Ht12q
+
+Ht5 .req Ht1
+Ht6 .req Ht2
+Ht56 .req Ht12
+
+Ht7q .req Ht1q               // Ht7/Ht8/Ht78 reuse the Ht1/Ht2/Ht12 registers
+Ht8q .req Ht2q
+Ht78q .req Ht12q
+
+Ht7 .req Ht1
+Ht8 .req Ht2
+Ht78 .req Ht12
+
+rk0q .req q18                // round keys rk0..rk10 live in v18..v28
+rk1q .req q19
+rk2q .req q20
+rk3q .req q21
+rk4q .req q22
+rk5q .req q23
+rk6q .req q24
+rk7q .req q25
+rk8q .req q26
+rk9q .req q27
+rk10q .req q28
+
+rk0 .req v18
+rk1 .req v19
+rk2 .req v20
+rk3 .req v21
+rk4 .req v22
+rk5 .req v23
+rk6 .req v24
+rk7 .req v25
+rk8 .req v26
+rk9 .req v27
+rk10 .req v28
+
+plain .req v29
+plain_q .req q29
+
+tag .req v11                 // NOTE: same register as ghash_tmp (v11)
+tag_q .req q11
+
+#define UNROLL 4
+
+#define STACK_SIZE_GPRS (6*16)
+#define STACK_SIZE_VREGS (4*16)
+#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16)
+
+#define STACK_BASE_GPRS (0)
+#define STACK_BASE_VREGS (STACK_SIZE_GPRS)
+#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS)
+
+/********************************************************************/
+/* Generic preamble/postamble macros */
+/********************************************************************/
+
+.macro save_vregs            // Store d8-d15 into the vector save area at sp + STACK_BASE_VREGS
+    stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)]
+    stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)]
+    stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)]
+    stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)]
+.endm
+
+.macro restore_vregs         // Reload d8-d15 from the vector save area (inverse of save_vregs)
+    ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)]
+    ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)]
+    ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)]
+    ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)]
+.endm
+
+.macro save_gprs             // Store x19-x30 into the GPR save area at sp + STACK_BASE_GPRS
+    stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)]
+    stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)]
+    stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)]
+    stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)]
+    stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)]
+    stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)]
+.endm
+
+.macro restore_gprs          // Reload x19-x30 from the GPR save area (inverse of save_gprs)
+    ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)]
+    ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)]
+    ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)]
+    ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)]
+    ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)]
+    ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)]
+.endm
+
+// Derive the iteration counts of the unrolled loop and of the single-block tail loop from byte_len
+.macro prepare_loop_counts   // Reads byte_len; writes unroll, full_blocks, count, remainder
+    mov unroll, #UNROLL
+    // Number of AES blocks (16 bytes each)
+    lsr full_blocks, byte_len, #4
+    // Number of iterations of the unrolled loop: full_blocks / UNROLL
+    udiv count, full_blocks, unroll
+    // Number of iterations for the tail loop handling 1 block each: full_blocks - count * UNROLL
+    msub remainder, count, unroll, full_blocks
+.endm
+
+/********************************************************************/
+/* AES related macros */
+/********************************************************************/
+
+.macro load_iv               // Copy the 16 bytes at [ivec] into stack slots 0-3 and extract the 32-bit counter
+    ldp ivec_0_63, ivec_64_96, [ivec]
+    stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:writes=stack_0
+    stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:writes=stack_1
+    stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:writes=stack_2
+    stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:writes=stack_3
+
+    lsr ctr_x, ivec_64_96, #32                                   // upper 32 bits of the second doubleword
+    rev ctr, ctr                                                 // byte-reverse; next_ctr_init_aes reverses again before storing
+    orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bits (a 32-bit register write zero-extends)
+.endm
+
+.macro next_ctr_init_aes aes_st, loc // Patch the counter into bytes 12..15 of stack slot loc, load the slot into aes_st, bump ctr
+    rev ctr_tmp_w, ctr
+    str ctr_tmp_w, [sp, #(STACK_BASE_AES_ST + \loc*16 + 12)] // @slothy:writes=stack_\loc
+    ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc
+    add ctr, ctr, #1
+.endm
+
+// A single AES round: AESE followed by AESMC on the same register
+// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE
+.macro aesr data, key // @slothy:no-unfold
+    aese \data, \key
+    aesmc \data, \data
+.endm
+
+.macro aesr_0_8 data, key    // Nine aesr rounds using registers key0..key8 (key is a name prefix, e.g. rk)
+    aesr \data\().16b, \key\()0.16b
+    aesr \data\().16b, \key\()1.16b
+    aesr \data\().16b, \key\()2.16b
+    aesr \data\().16b, \key\()3.16b
+    aesr \data\().16b, \key\()4.16b
+    aesr \data\().16b, \key\()5.16b
+    aesr \data\().16b, \key\()6.16b
+    aesr \data\().16b, \key\()7.16b
+    aesr \data\().16b, \key\()8.16b
+.endm
+
+.macro aesr_9_10 data, key   // Two aesr rounds using key9 and key10
+    aesr \data\().16b, \key\()9.16b
+    aesr \data\().16b, \key\()10.16b
+.endm
+
+.macro aesr_11_12 data, key  // NOTE(review): the alias block only defines rk0..rk10; presumably unused in this AES-128 variant - confirm
+    aesr \data\().16b, \key\()11.16b
+    aesr \data\().16b, \key\()12.16b
+.endm
+
+// out = inA ^ inB ^ inC, as two EORs. Overwrites inA with inA ^ inB.
+.macro eor3 out, inA, inB, inC
+    eor \inA, \inA, \inB
+    eor \out, \inA, \inC
+.endm
+
+.macro aesr_final aes_st, plain, out // AESE with rk9, then out = aes_st ^ rk10 ^ plain; overwrites aes_st
+    aese \aes_st\().16b, rk9.16b
+    eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b
+.endm
+
+.macro aes_full_block aes_st, input, output, loc // Counter block from slot loc -> rounds rk0..rk8 -> aesr_final with input into output
+    next_ctr_init_aes \aes_st, \loc
+    aesr_0_8 \aes_st\(), rk
+    aesr_final \aes_st, \input, \output
+.endm
+
+.macro load_round_key i      // Load the 16 bytes at key + 16*i into round-key register number i
+    ldr rk\i\()q, [key, #((\i)*16)]
+.endm
+
+.macro load_round_keys       // Load rk0..rk10 (eleven 16-byte round keys)
+    load_round_key 0
+    load_round_key 1
+    load_round_key 2
+    load_round_key 3
+    load_round_key 4
+    load_round_key 5
+    load_round_key 6
+    load_round_key 7
+    load_round_key 8
+    load_round_key 9
+    load_round_key 10
+.endm
+
+/********************************************************************/
+/* Loading of H-table (precomputed H-powers for GHASH) */
+/********************************************************************/
+
+// The byte offsets below have to be kept synchronized with the H-table generation
+
+.macro load_h1 dst, dst_q    // 16 bytes at Htable + 0; the dst argument is not referenced
+    ldr \dst_q, [Htable]
+.endm
+
+.macro load_h2 dst, dst_q    // 16 bytes at Htable + 32; the dst argument is not referenced
+    ldr \dst_q, [Htable, #32]
+.endm
+
+.macro load_h3 dst, dst_q    // 16 bytes at Htable + 48; the dst argument is not referenced
+    ldr \dst_q, [Htable, #48]
+.endm
+
+.macro load_h4 dst, dst_q    // 16 bytes at Htable + 80; the dst argument is not referenced
+    ldr \dst_q, [Htable, #80]
+.endm
+
+.macro load_h5 dst, dst_q    // 16 bytes at Htable + 96; the dst argument is not referenced
+    ldr \dst_q, [Htable, #96]
+.endm
+
+.macro load_h6 dst, dst_q    // 16 bytes at Htable + 128; the dst argument is not referenced
+    ldr \dst_q, [Htable, #128]
+.endm
+
+.macro load_h7 dst, dst_q    // 16 bytes at Htable + 144; the dst argument is not referenced
+    ldr \dst_q, [Htable, #144]
+.endm
+
+.macro load_h8 dst, dst_q    // 16 bytes at Htable + 176; the dst argument is not referenced
+    ldr \dst_q, [Htable, #176]
+.endm
+
+.macro load_h12 dst, dst_q   // 16 bytes at Htable + 16; the dst argument is not referenced
+    ldr \dst_q, [Htable, #16]
+.endm
+
+.macro load_h34 dst, dst_q   // 16 bytes at Htable + 64; the dst argument is not referenced
+    ldr \dst_q, [Htable, #64]
+.endm
+
+.macro load_h56 dst, dst_q   // 16 bytes at Htable + 112; the dst argument is not referenced
+    ldr \dst_q, [Htable, #112]
+.endm
+
+.macro load_h78 dst, dst_q   // 16 bytes at Htable + 160; the dst argument is not referenced
+    ldr \dst_q, [Htable, #160]
+.endm
+
+.macro load_full_htable      // NOTE(review): Ht3*/Ht5* alias Ht1* (same for Ht4*/Ht6* and Ht34/Ht56), so later loads overwrite earlier ones; presumably unused here - confirm
+    load_h1 Ht1, Ht1q
+    load_h2 Ht2, Ht2q
+    load_h3 Ht3, Ht3q
+    load_h4 Ht4, Ht4q
+    load_h5 Ht5, Ht5q
+    load_h6 Ht6, Ht6q
+    load_h12 Ht12, Ht12q
+    load_h34 Ht34, Ht34q
+    load_h56 Ht56, Ht56q
+.endm
+
+.macro load_htable_12        // Load the entries at Htable offsets 0, 32 and 16 into q12, q13, q14
+    load_h1 Ht1, Ht1q
+    load_h2 Ht2, Ht2q
+    load_h12 Ht12, Ht12q
+.endm
+
+.macro load_htable_34        // Load the entries at Htable offsets 48, 80 and 64 into the same three registers
+    load_h3 Ht3, Ht3q
+    load_h4 Ht4, Ht4q
+    load_h34 Ht34, Ht34q
+.endm
+
+.macro load_htable_56        // Load the entries at Htable offsets 96, 128 and 112 into the same three registers
+    load_h5 Ht5, Ht5q
+    load_h6 Ht6, Ht6q
+    load_h56 Ht56, Ht56q
+.endm
+
+.macro load_htable_78        // Load the entries at Htable offsets 144, 176 and 160 into the same three registers
+    load_h7 Ht7, Ht7q
+    load_h8 Ht8, Ht8q
+    load_h78 Ht78, Ht78q
+.endm
+
+/********************************************************************/
+/* Macros for GHASH update */
+/********************************************************************/
+
+.macro ghash_init_0 input, Hk, Hk_mid, tag // Start accumulation: overwrite ghash_lo/hi/mid with the products of (rev64(input) ^ tag); modifies input
+    rev64 \input\().16b, \input\().16b
+    eor \input\().16b, \input\().16b, \tag\().16b
+    // Low product: low 64 bits of input times low 64 bits of Hk
+    pmull ghash_lo.1q, \input\().1d, \Hk\().1d
+    // High product: high 64 bits of input times high 64 bits of Hk
+    pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d
+    // Middle product: (input.hi ^ input.lo) times the low 64 bits of Hk_mid
+    mov ghash_tmp_d, \input\().d[1]                    // ghash_tmp is v11, the same register as the file-level tag alias
+    eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b
+    pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d
+.endm
+
+.macro ghash_init_1 input, Hk, Hk_mid, tag // As ghash_init_0, but the middle product uses the high 64 bits of Hk_mid
+    rev64 \input\().16b, \input\().16b
+    eor \input\().16b, \input\().16b, \tag\().16b
+    // Low product: low 64 bits of input times low 64 bits of Hk
+    pmull ghash_lo.1q, \input\().1d, \Hk\().1d
+    // High product: high 64 bits of input times high 64 bits of Hk
+    pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d
+    // Middle product: (input.hi ^ input.lo) times the high 64 bits of Hk_mid
+    ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 // swap the two 64-bit halves
+    eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b
+    pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d
+.endm
+
+.macro ghash_acc_0 input, Hk, Hk_mid // XOR the three products of rev64(input) into ghash_lo/hi/mid; middle uses low half of Hk_mid; modifies input
+    rev64 \input\().16b, \input\().16b
+    // Low product, accumulated into ghash_lo
+    pmull ghash_tmp.1q, \input\().1d, \Hk\().1d
+    eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b
+    // High product, accumulated into ghash_hi
+    pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d
+    eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b
+    // Middle product, accumulated into ghash_mid
+    mov ghash_tmp_d, \input\().d[1]
+    eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b
+    pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d
+    eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b
+.endm
+
+.macro ghash_acc_1 input, Hk, Hk_mid // As ghash_acc_0, but the middle product uses the high 64 bits of Hk_mid
+    rev64 \input\().16b, \input\().16b
+
+    // Low product, accumulated into ghash_lo
+    pmull ghash_tmp.1q, \input\().1d, \Hk\().1d
+    eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b
+    // High product, accumulated into ghash_hi
+    pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d
+    eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b
+    // Middle product, accumulated into ghash_mid
+    ext ghash_tmp.16b, \input\().16b, \input\().16b, #8
+    eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b
+    pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d
+    eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b
+.endm
+
+.macro ghash_finalize tag    // Fold ghash_lo/mid/hi with ghash_mod and write the result to tag; overwrites v0, v1 and the three accumulators
+    eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b
+    pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d
+    ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8
+    eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b
+    eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b
+    eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b
+    pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d
+    eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b
+    ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8
+    eor \tag\().16b, ghash_lo.16b, ghash_mid.16b
+    ext \tag\().16b, \tag\().16b, \tag\().16b, #8      // swap the two 64-bit halves of the result
+.endm
+
+.macro load_tag              // Load 16 bytes from [tag_ptr] into tag and byte-reverse each 64-bit half
+    ldr tag_q, [tag_ptr]
+    rev64 tag.16b, tag.16b
+.endm
+
+.macro prepare_ghash         // Set the low doubleword of ghash_mod to 0xc2 << 56
+    // Prepare constant for modular reduction
+    movi ghash_mod.8b, #0xc2
+    shl ghash_mod_d, ghash_mod_d, #56
+.endm
+ +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end + cmp count, #1 + b.eq Lloop_unrolled_start_iter_1 + // Instructions: 140 + // Expected cycles: 68 + // Expected IPC: 2.06 + // + // Cycle bound: 68.0 + // IPC bound: 2.06 + // + // Wall time: 17.42s + // User time: 17.42s + // + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + ldr q29, [x0, #32] // *................................................................... + ldr q6, [x0, #16] // *................................................................... + add w24, w13, #1 // *................................................................... + rev w25, w13 // *................................................................... + ldr q0, [x0, #48] // .*.................................................................. + str w25, [sp, #STACK_BASE_AES_ST + 12] // .*.................................................................. // @slothy:writes=stack_0 + add w28, w24, #1 // .*.................................................................. + ldr q8, [x6] // ..*................................................................. + ldr q2, [x6, #48] // ..*................................................................. 
+ rev w29, w28 // ..*................................................................. + add w19, w28, #1 // ..*................................................................. + ldr q5, [sp, #STACK_BASE_AES_ST] // ...*................................................................ // @slothy:reads=stack_0 + str w29, [sp, #STACK_BASE_AES_ST + 44] // ...*................................................................ // @slothy:writes=stack_2 + add w25, w19, #1 // ...*................................................................ + rev w12, w19 // ...*................................................................ + rev64 v10.16B, v6.16B // ....*............................................................... + ldr q3, [x6, #64] // ....*............................................................... + str w12, [sp, #STACK_BASE_AES_ST + 60] // ....*............................................................... // @slothy:writes=stack_3 + add w26, w25, #1 // ....*............................................................... + ldr q4, [sp, #STACK_BASE_AES_ST + 32] // .....*.............................................................. // @slothy:reads=stack_2 + rev64 v1.16B, v0.16B // .....*.............................................................. + rev w19, w26 // .....*.............................................................. + add w20, w26, #1 // .....*.............................................................. + pmull2 v14.1q, v10.2d, v2.2d // ......*............................................................. + mov d17, v10.d[1] // ......*............................................................. + rev w10, w20 // ......*............................................................. + aesr v5.16b, v18.16b // .......*............................................................ + ldr q15, [x0], #(4*16) // .......*............................................................ 
+ str w10, [sp, #STACK_BASE_AES_ST + 44] // .......*............................................................ // @slothy:writes=stack_2 + rev w21, w25 // .......*............................................................ + pmull2 v16.1q, v1.2d, v8.2d // ........*........................................................... + eor v13.8B, v17.8B, v10.8B // ........*........................................................... + str w21, [sp, #STACK_BASE_AES_ST + 12] // ........*........................................................... // @slothy:writes=stack_0 + add w14, w20, #1 // ........*........................................................... + aesr v5.16b, v19.16b // .........*.......................................................... + add w13, w14, #1 // .........*.......................................................... + pmull v31.1q, v13.1d, v3.1d // ..........*......................................................... + rev64 v13.16B, v29.16B // ..........*......................................................... + rev64 v9.16B, v15.16B // ...........*........................................................ + aesr v5.16b, v20.16b // ...........*........................................................ + aesr v4.16b, v18.16b // ............*....................................................... + ldr q12, [x6, #80] // ............*....................................................... + aesr v5.16b, v21.16b // .............*...................................................... + eor v17.16B, v9.16B, v11.16B // .............*...................................................... + aesr v4.16b, v19.16b // ..............*..................................................... + ldr q11, [x6, #32] // ..............*..................................................... + rev w30, w24 // ..............*..................................................... + aesr v5.16b, v22.16b // ...............*.................................................... 
+ ext v30.16B, v17.16B, v17.16B, #8 // ...............*.................................................... + aesr v4.16b, v20.16b // ................*................................................... + str w30, [sp, #STACK_BASE_AES_ST + 28] // ................*................................................... // @slothy:writes=stack_1 + pmull2 v9.1q, v17.2d, v12.2d // .................*.................................................. + eor v30.16B, v30.16B, v17.16B // .................*.................................................. + aesr v4.16b, v21.16b // ..................*................................................. + pmull2 v3.1q, v30.2d, v3.2d // ...................*................................................ + mov d30, v1.d[1] // ...................*................................................ + eor v9.16B, v9.16B, v14.16B // ....................*............................................... + aesr v4.16b, v22.16b // ....................*............................................... + pmull v17.1q, v17.1d, v12.1d // .....................*.............................................. + ldr q14, [sp, #STACK_BASE_AES_ST + 32] // .....................*.............................................. // @slothy:reads=stack_2 + eor v3.16B, v3.16B, v31.16B // ......................*............................................. + aesr v4.16b, v23.16b // ......................*............................................. + pmull2 v31.1q, v13.2d, v11.2d // .......................*............................................ + ldr q12, [x6, #16] // ........................*........................................... + aesr v4.16b, v24.16b // ........................*........................................... + eor v31.16B, v9.16B, v31.16B // .........................*.......................................... + aesr v5.16b, v23.16b // .........................*.......................................... 
+ eor v9.8B, v30.8B, v1.8B // ..........................*......................................... + aesr v4.16b, v25.16b // ..........................*......................................... + aesr v5.16b, v24.16b // ...........................*........................................ + eor v31.16B, v31.16B, v16.16B // ...........................*........................................ + aesr v4.16b, v26.16b // ............................*....................................... + ext v30.16B, v13.16B, v13.16B, #8 // ............................*....................................... + aesr v5.16b, v25.16b // .............................*...................................... + ext v16.16B, v31.16B, v31.16B, #8 // .............................*...................................... + eor v30.16B, v30.16B, v13.16B // ..............................*..................................... + aese v4.16b, v27.16b // ..............................*..................................... + aesr v5.16b, v26.16b // ...............................*.................................... + aesr v14.16b, v18.16b // ................................*................................... + pmull v2.1q, v10.1d, v2.1d // .................................*.................................. + aesr v14.16b, v19.16b // ..................................*................................. + ldr q10, [sp, #STACK_BASE_AES_ST + 16] // ..................................*................................. // @slothy:reads=stack_1 + pmull v11.1q, v13.1d, v11.1d // ...................................*................................ + eor v17.16B, v17.16B, v2.16B // ...................................*................................ + aesr v14.16b, v20.16b // ....................................*............................... + ldr q2, [sp, #STACK_BASE_AES_ST + 48] // ....................................*............................... 
// @slothy:reads=stack_3 + str w19, [sp, #STACK_BASE_AES_ST + 28] // ....................................*............................... // @slothy:writes=stack_1 + pmull v13.1q, v1.1d, v8.1d // .....................................*.............................. + eor v1.16B, v4.16B, v28.16B // .....................................*.............................. + eor v4.16B, v17.16B, v11.16B // ......................................*............................. + aesr v10.16b, v18.16b // ......................................*............................. + eor v17.16B, v1.16B, v29.16B // .......................................*............................ + aese v5.16b, v27.16b // .......................................*............................ + aesr v14.16b, v21.16b // ........................................*........................... + ldr q8, [x6, #32] // ........................................*........................... + eor v1.16B, v4.16B, v13.16B // .........................................*.......................... + aesr v10.16b, v19.16b // .........................................*.......................... + str q17, [x2, #32] // ..........................................*......................... + aesr v14.16b, v22.16b // ..........................................*......................... + ldr q17, [x0, #16] // ...........................................*........................ + aesr v10.16b, v20.16b // ...........................................*........................ + ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ............................................*....................... // @slothy:reads=stack_1 + aesr v2.16b, v18.16b // ............................................*....................... + aesr v10.16b, v21.16b // .............................................*...................... + pmull2 v13.1q, v30.2d, v12.2d // ..............................................*..................... 
+ eor v11.16B, v1.16B, v31.16B // ...............................................*.................... + aesr v10.16b, v22.16b // ...............................................*.................... + eor v29.16B, v3.16B, v13.16B // ................................................*................... + pmull v9.1q, v9.1d, v12.1d // ................................................*................... + pmull v30.1q, v31.1d, v7.1d // .................................................*.................. + aesr v2.16b, v19.16b // ..................................................*................. + eor v31.16B, v29.16B, v9.16B // ..................................................*................. + aesr v10.16b, v23.16b // ...................................................*................ + eor v16.16B, v16.16B, v30.16B // ...................................................*................ + aesr v2.16b, v20.16b // ....................................................*............... + eor v3.16B, v31.16B, v11.16B // ....................................................*............... + aesr v10.16b, v24.16b // .....................................................*.............. + eor v30.16B, v3.16B, v16.16B // ......................................................*............. + aesr v2.16b, v21.16b // ......................................................*............. + aesr v10.16b, v25.16b // .......................................................*............ + ext v16.16B, v30.16B, v30.16B, #8 // ........................................................*........... + aesr v2.16b, v22.16b // ........................................................*........... + aesr v10.16b, v26.16b // .........................................................*.......... + aesr v2.16b, v23.16b // ..........................................................*......... + aese v10.16b, v27.16b // ...........................................................*........ 
+ aesr v4.16b, v18.16b // ............................................................*....... + eor v3.16B, v10.16B, v28.16B // .............................................................*...... + aesr v2.16b, v24.16b // .............................................................*...... + aesr v4.16b, v19.16b // ..............................................................*..... + eor v5.16B, v5.16B, v28.16B // ..............................................................*..... + eor v11.16B, v3.16B, v6.16B // ...............................................................*.... + aesr v2.16b, v25.16b // ...............................................................*.... + aesr v4.16b, v20.16b // ................................................................*... + eor v5.16B, v5.16B, v15.16B // ................................................................*... + str q11, [x2, #16] // .................................................................*.. + aesr v2.16b, v26.16b // .................................................................*.. + str q5, [x2], #(4*16) // ..................................................................*. + pmull v11.1q, v30.1d, v7.1d // ..................................................................*. + aese v2.16b, v27.16b // ...................................................................* + rev64 v29.16B, v17.16B // ...................................................................* + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // rev w23, w13 // *................................................................... + // add w26, w13, #1 // *................................................................... + // rev w24, w26 // ..............*..................................................... + // add w21, w26, #1 // .*.................................................................. 
+ // add w14, w21, #1 // ..*................................................................. + // str w24, [sp, #STACK_BASE_AES_ST + 28] // ................*................................................... + // add w13, w14, #1 // ...*................................................................ + // rev w21, w21 // ..*................................................................. + // str w21, [sp, #STACK_BASE_AES_ST + 44] // ...*................................................................ + // str w23, [sp, #STACK_BASE_AES_ST + 12] // .*.................................................................. + // ldr q14, [sp, #STACK_BASE_AES_ST + 32] // .....*.............................................................. + // aesr v14.16b, v18.16b // ............*....................................................... + // aesr v14.16b, v19.16b // ..............*..................................................... + // aesr v14.16b, v20.16b // ................*................................................... + // ldr q17, [x0, #16] // *................................................................... + // aesr v14.16b, v21.16b // ..................*................................................. + // ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ..................................*................................. + // aesr v14.16b, v22.16b // ....................*............................................... + // aesr v4.16b, v18.16b // ......................................*............................. + // aesr v4.16b, v19.16b // .........................................*.......................... + // rev64 v29.16B, v17.16B // ....*............................................................... + // aesr v4.16b, v20.16b // ...........................................*........................ + // ldr q8, [x6, #32] // ..............*..................................................... 
+ // ldr q13, [sp, #STACK_BASE_AES_ST] // ...*................................................................ + // aesr v14.16b, v23.16b // ......................*............................................. + // rev w23, w13 // .......*............................................................ + // add w26, w13, #1 // ....*............................................................... + // aesr v4.16b, v21.16b // .............................................*...................... + // rev w24, w26 // .....*.............................................................. + // rev w19, w14 // ...*................................................................ + // add w21, w26, #1 // .....*.............................................................. + // ldr q6, [x6, #48] // ..*................................................................. + // aesr v14.16b, v24.16b // ........................*........................................... + // add w14, w21, #1 // ........*........................................................... + // str w24, [sp, #STACK_BASE_AES_ST + 28] // ....................................*............................... + // ldr q5, [x6, #64] // ....*............................................................... + // aesr v4.16b, v22.16b // ...............................................*.................... + // add w13, w14, #1 // .........*.......................................................... + // rev w21, w21 // ......*............................................................. + // ldr q9, [x6, #80] // ............*....................................................... + // aesr v13.16b, v18.16b // .......*............................................................ + // str w19, [sp, #STACK_BASE_AES_ST + 60] // ....*............................................................... + // aesr v4.16b, v23.16b // ...................................................*................ 
+ // mov d30, v29.d[1] // ......*............................................................. + // str w21, [sp, #STACK_BASE_AES_ST + 44] // .......*............................................................ + // str w23, [sp, #STACK_BASE_AES_ST + 12] // ........*........................................................... + // aesr v13.16b, v19.16b // .........*.......................................................... + // aesr v14.16b, v25.16b // ..........................*......................................... + // ldr q15, [x0, #32] // *................................................................... + // aesr v13.16b, v20.16b // ...........*........................................................ + // aesr v14.16b, v26.16b // ............................*....................................... + // eor v31.8B, v30.8B, v29.8B // ........*........................................................... + // aesr v13.16b, v21.16b // .............*...................................................... + // rev64 v30.16B, v15.16B // ..........*......................................................... + // aese v14.16b, v27.16b // ..............................*..................................... + // aesr v13.16b, v22.16b // ...............*.................................................... + // eor v2.16B, v14.16B, v28.16B // .....................................*.............................. + // pmull v12.1q, v29.1d, v6.1d // .................................*.................................. + // ldr q14, [sp, #STACK_BASE_AES_ST + 32] // .....................*.............................................. + // aesr v13.16b, v23.16b // .........................*.......................................... + // eor v15.16B, v2.16B, v15.16B // .......................................*............................ + // pmull2 v1.1q, v29.2d, v6.2d // ......*............................................................. 
+ // aesr v13.16b, v24.16b // ...........................*........................................ + // pmull v3.1q, v31.1d, v5.1d // ..........*......................................................... + // ldr q31, [x0], #(4*16) // .......*............................................................ + // aesr v13.16b, v25.16b // .............................*...................................... + // ldr q2, [sp, #STACK_BASE_AES_ST + 48] // ....................................*............................... + // aesr v4.16b, v24.16b // .....................................................*.............. + // ext v6.16B, v30.16B, v30.16B, #8 // ............................*....................................... + // aesr v13.16b, v26.16b // ...............................*.................................... + // aesr v4.16b, v25.16b // .......................................................*............ + // str q15, [x2, #32] // ..........................................*......................... + // eor v16.16B, v6.16B, v30.16B // ..............................*..................................... + // aese v13.16b, v27.16b // .......................................*............................ + // aesr v4.16b, v26.16b // .........................................................*.......... + // rev64 v15.16B, v31.16B // ...........*........................................................ + // eor v10.16B, v13.16B, v28.16B // ..............................................................*..... + // aesr v2.16b, v18.16b // ............................................*....................... + // aese v4.16b, v27.16b // ...........................................................*........ + // eor v29.16B, v15.16B, v11.16B // .............*...................................................... + // aesr v14.16b, v18.16b // ................................*................................... 
+ // eor v31.16B, v10.16B, v31.16B // ................................................................*... + // eor v6.16B, v4.16B, v28.16B // .............................................................*...... + // pmull2 v4.1q, v30.2d, v8.2d // .......................*............................................ + // aesr v14.16b, v19.16b // ..................................*................................. + // ext v0.16B, v29.16B, v29.16B, #8 // ...............*.................................................... + // eor v10.16B, v6.16B, v17.16B // ...............................................................*.... + // pmull v8.1q, v30.1d, v8.1d // ...................................*................................ + // pmull v17.1q, v29.1d, v9.1d // .....................*.............................................. + // str q31, [x2], #(4*16) // ..................................................................*. + // str q10, [x2, #-48] // .................................................................*.. + // ldr q10, [x6, #16] // ........................*........................................... + // aesr v2.16b, v19.16b // ..................................................*................. + // eor v11.16B, v17.16B, v12.16B // ...................................*................................ + // eor v17.16B, v0.16B, v29.16B // .................*.................................................. + // pmull2 v6.1q, v29.2d, v9.2d // .................*.................................................. + // aesr v2.16b, v20.16b // ....................................................*............... + // ldr q0, [x0, #-16] // .*.................................................................. + // pmull2 v15.1q, v17.2d, v5.2d // ...................*................................................ + // eor v1.16B, v6.16B, v1.16B // ....................*............................................... 
+ // aesr v14.16b, v20.16b // ....................................*............................... + // eor v6.16B, v11.16B, v8.16B // ......................................*............................. + // eor v29.16B, v15.16B, v3.16B // ......................*............................................. + // pmull2 v15.1q, v16.2d, v10.2d // ..............................................*..................... + // ldr q8, [x6] // ..*................................................................. + // aesr v2.16b, v21.16b // ......................................................*............. + // rev64 v31.16B, v0.16B // .....*.............................................................. + // eor v29.16B, v29.16B, v15.16B // ................................................*................... + // aesr v2.16b, v22.16b // ........................................................*........... + // ldr q17, [x0, #16] // ...........................................*........................ + // mov d16, v31.d[1] // ...................*................................................ + // aesr v14.16b, v21.16b // ........................................*........................... + // eor v1.16B, v1.16B, v4.16B // .........................*.......................................... + // pmull2 v11.1q, v31.2d, v8.2d // ........*........................................................... + // pmull v15.1q, v31.1d, v8.1d // .....................................*.............................. + // ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ............................................*....................... + // aesr v2.16b, v23.16b // ..........................................................*......... + // eor v8.8B, v16.8B, v31.8B // ..........................*......................................... + // aesr v14.16b, v22.16b // ..........................................*......................... 
+ // eor v16.16B, v1.16B, v11.16B // ...........................*........................................ + // pmull v8.1q, v8.1d, v10.1d // ................................................*................... + // eor v1.16B, v6.16B, v15.16B // .........................................*.......................... + // ext v15.16B, v16.16B, v16.16B, #8 // .............................*...................................... + // pmull v12.1q, v16.1d, v7.1d // .................................................*.................. + // aesr v2.16b, v24.16b // .............................................................*...... + // eor v31.16B, v1.16B, v16.16B // ...............................................*.................... + // aesr v4.16b, v18.16b // ............................................................*....... + // eor v13.16B, v29.16B, v8.16B // ..................................................*................. + // aesr v2.16b, v25.16b // ...............................................................*.... + // eor v16.16B, v15.16B, v12.16B // ...................................................*................ + // aesr v4.16b, v19.16b // ..............................................................*..... + // eor v9.16B, v13.16B, v31.16B // ....................................................*............... + // aesr v2.16b, v26.16b // .................................................................*.. + // rev64 v29.16B, v17.16B // ...................................................................* + // eor v13.16B, v9.16B, v16.16B // ......................................................*............. + // aesr v4.16b, v20.16b // ................................................................*... + // aese v2.16b, v27.16b // ...................................................................* + // ldr q8, [x6, #32] // ........................................*........................... 
+ // pmull v11.1q, v13.1d, v7.1d // ..................................................................*. + // ext v16.16B, v13.16B, v13.16B, #8 // ........................................................*........... + + sub count, count, #2 // NOTE(review): two iterations appear to be peeled off into the code around the loop (preamble / postamble) -- confirm +cbz count, Lloop_unrolled_start_postamble // count == 0: branch to the postamble label instead of falling into the loop +Lloop_unrolled_start: + // Instructions: 123 + // Expected cycles: 56 + // Expected IPC: 2.20 + // The statistics in this header are optimizer output (SLOTHY, per the @slothy annotations) for the loop body; they are not assembler directives. + // Cycle bound: 54.0 + // IPC bound: 2.28 + // Chart legend, inferred from the schedule below (confirm against SLOTHY docs): '*' = current iteration; 'e' = early, already working on the next iteration (counter bump, next AES state load); 'l' = late, finishing the previous iteration (final XOR and store to [x2, #-16]). + // Wall time: 3601.77s + // User time: 3601.77s + // + // ------------------ cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|----- + ldr q13, [sp, #STACK_BASE_AES_ST] // *....................................................... // @slothy:reads=stack_0 + aesr v14.16b, v23.16b // *....................................................... + rev w23, w13 // e....................................................... + add w26, w13, #1 // e....................................................... + aesr v4.16b, v21.16b // .*...................................................... + rev w24, w26 // .e...................................................... + rev w19, w14 // .*...................................................... + add w21, w26, #1 // .e...................................................... + ldr q6, [x6, #48] // ..*..................................................... + aesr v14.16b, v24.16b // ..*..................................................... + add w14, w21, #1 // ..e..................................................... + str w24, [sp, #STACK_BASE_AES_ST + 28] // ..e..................................................... // @slothy:writes=stack_1 + ldr q5, [x6, #64] // ...*.................................................... + aesr v4.16b, v22.16b // ...*.................................................... + add w13, w14, #1 // ...e.................................................... + rev w21, w21 // ...e.................................................... 
+ ldr q9, [x6, #80] // ....*................................................... + aesr v13.16b, v18.16b // ....*................................................... + str w19, [sp, #STACK_BASE_AES_ST + 60] // ....*................................................... // @slothy:writes=stack_3 + aesr v4.16b, v23.16b // .....*.................................................. + mov d30, v29.d[1] // .....*.................................................. + str w21, [sp, #STACK_BASE_AES_ST + 44] // .....e.................................................. // @slothy:writes=stack_2 + str w23, [sp, #STACK_BASE_AES_ST + 12] // .....e.................................................. // @slothy:writes=stack_0 + aesr v13.16b, v19.16b // ......*................................................. + eor v10.16B, v1.16B, v11.16B // ......l................................................. + aesr v14.16b, v25.16b // .......*................................................ + ldr q15, [x0, #32] // .......*................................................ + aesr v13.16b, v20.16b // ........*............................................... + eor v11.16B, v10.16B, v16.16B // ........l............................................... + aesr v14.16b, v26.16b // .........*.............................................. + eor v31.8B, v30.8B, v29.8B // .........*.............................................. + aesr v13.16b, v21.16b // ..........*............................................. + ext v11.16B, v11.16B, v11.16B, #8 // ..........l............................................. + rev64 v30.16B, v15.16B // ...........*............................................ + aese v14.16b, v27.16b // ...........*............................................ + aesr v13.16b, v22.16b // ............*........................................... + eor v10.16B, v2.16B, v28.16B // ............l........................................... 
+ eor v2.16B, v14.16B, v28.16B // .............*.......................................... + pmull v12.1q, v29.1d, v6.1d // .............*.......................................... + ldr q14, [sp, #STACK_BASE_AES_ST + 32] // ..............e......................................... // @slothy:reads=stack_2 + aesr v13.16b, v23.16b // ..............*......................................... + eor v15.16B, v2.16B, v15.16B // ...............*........................................ + pmull2 v1.1q, v29.2d, v6.2d // ...............*........................................ + eor v0.16B, v10.16B, v0.16B // ................l....................................... + aesr v13.16b, v24.16b // ................*....................................... + pmull v3.1q, v31.1d, v5.1d // .................*...................................... + ldr q31, [x0], #(4*16) // .................*...................................... + aesr v13.16b, v25.16b // ..................*..................................... + ldr q2, [sp, #STACK_BASE_AES_ST + 48] // ..................*..................................... // @slothy:reads=stack_3 + aesr v4.16b, v24.16b // ...................*.................................... + str q0, [x2, #-16] // ...................l.................................... + ext v6.16B, v30.16B, v30.16B, #8 // ....................*................................... + aesr v13.16b, v26.16b // ....................*................................... + aesr v4.16b, v25.16b // .....................*.................................. + str q15, [x2, #32] // .....................*.................................. + eor v16.16B, v6.16B, v30.16B // ......................*................................. + aese v13.16b, v27.16b // ......................*................................. + aesr v4.16b, v26.16b // .......................*................................ + rev64 v15.16B, v31.16B // .......................*................................ 
+ eor v10.16B, v13.16B, v28.16B // ........................*............................... + aesr v2.16b, v18.16b // ........................*............................... + aese v4.16b, v27.16b // .........................*.............................. + eor v29.16B, v15.16B, v11.16B // .........................*.............................. + aesr v14.16b, v18.16b // ..........................e............................. + eor v31.16B, v10.16B, v31.16B // ..........................*............................. + eor v6.16B, v4.16B, v28.16B // ...........................*............................ + pmull2 v4.1q, v30.2d, v8.2d // ...........................*............................ + aesr v14.16b, v19.16b // ............................e........................... + ext v0.16B, v29.16B, v29.16B, #8 // ............................*........................... + eor v10.16B, v6.16B, v17.16B // .............................*.......................... + pmull v8.1q, v30.1d, v8.1d // .............................*.......................... + pmull v17.1q, v29.1d, v9.1d // ..............................*......................... + str q31, [x2], #(4*16) // ..............................*......................... + str q10, [x2, #-48] // ...............................*........................ + ldr q10, [x6, #16] // ...............................*........................ + aesr v2.16b, v19.16b // ................................*....................... + eor v11.16B, v17.16B, v12.16B // ................................*....................... + eor v17.16B, v0.16B, v29.16B // .................................*...................... + pmull2 v6.1q, v29.2d, v9.2d // .................................*...................... + aesr v2.16b, v20.16b // ..................................*..................... + ldr q0, [x0, #-16] // ..................................*..................... 
+ pmull2 v15.1q, v17.2d, v5.2d // ...................................*.................... + eor v1.16B, v6.16B, v1.16B // ...................................*.................... + aesr v14.16b, v20.16b // ....................................e................... + eor v6.16B, v11.16B, v8.16B // ....................................*................... + eor v29.16B, v15.16B, v3.16B // .....................................*.................. + pmull2 v15.1q, v16.2d, v10.2d // .....................................*.................. + ldr q8, [x6] // ......................................*................. + aesr v2.16b, v21.16b // ......................................*................. + rev64 v31.16B, v0.16B // .......................................*................ + eor v29.16B, v29.16B, v15.16B // .......................................*................ + aesr v2.16b, v22.16b // ........................................*............... + ldr q17, [x0, #16] // ........................................e............... + mov d16, v31.d[1] // .........................................*.............. + aesr v14.16b, v21.16b // .........................................e.............. + eor v1.16B, v1.16B, v4.16B // ..........................................*............. + pmull2 v11.1q, v31.2d, v8.2d // ..........................................*............. + pmull v15.1q, v31.1d, v8.1d // ...........................................*............ + ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ...........................................e............ // @slothy:reads=stack_1 + aesr v2.16b, v23.16b // ............................................*........... + eor v8.8B, v16.8B, v31.8B // ............................................*........... + aesr v14.16b, v22.16b // .............................................e.......... + eor v16.16B, v1.16B, v11.16B // .............................................*.......... 
+ pmull v8.1q, v8.1d, v10.1d // ..............................................*......... + eor v1.16B, v6.16B, v15.16B // ..............................................*......... + ext v15.16B, v16.16B, v16.16B, #8 // ...............................................*........ + pmull v12.1q, v16.1d, v7.1d // ...............................................*........ + aesr v2.16b, v24.16b // ................................................*....... + eor v31.16B, v1.16B, v16.16B // ................................................*....... + aesr v4.16b, v18.16b // .................................................e...... + eor v13.16B, v29.16B, v8.16B // .................................................*...... + aesr v2.16b, v25.16b // ..................................................*..... + eor v16.16B, v15.16B, v12.16B // ..................................................*..... + aesr v4.16b, v19.16b // ...................................................e.... + eor v9.16B, v13.16B, v31.16B // ...................................................*.... + aesr v2.16b, v26.16b // ....................................................*... + rev64 v29.16B, v17.16B // ....................................................e... + eor v13.16B, v9.16B, v16.16B // .....................................................*.. + aesr v4.16b, v20.16b // .....................................................e.. + aese v2.16b, v27.16b // ......................................................*. + ldr q8, [x6, #32] // ......................................................e. 
+ pmull v11.1q, v13.1d, v7.1d // .......................................................* + ext v16.16B, v13.16B, v13.16B, #8 // .......................................................* + + // -------------------------------------------------------- cycle (expected) ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q29, [x0], #(4*16) // .................~......................................'................*......................................'................~.. + // rev w14, w13 // e.......................................................~.......................................................~................... + // str w14, [sp, #(STACK_BASE_AES_ST + 0*16 + 12)] // .....e..................................................'....~..................................................'....~.............. + // ldr q0, [sp, #(STACK_BASE_AES_ST + 0*16)] // ~.......................................................*.......................................................~................... + // add w13, w13, #1 // e.......................................................~.......................................................~................... + // aesr v0.16b, v18.16b // ....~...................................................'...*...................................................'...~............... + // aesr v0.16b, v19.16b // ......~.................................................'.....*.................................................'.....~............. + // aesr v0.16b, v20.16b // ........~...............................................'.......*...............................................'.......~........... + // aesr v0.16b, v21.16b // ..........~.............................................'.........*.............................................'.........~......... 
+ // aesr v0.16b, v22.16b // ............~...........................................'...........*...........................................'...........~....... + // aesr v0.16b, v23.16b // ..............~.........................................'.............*.........................................'.............~..... + // aesr v0.16b, v24.16b // ................~.......................................'...............*.......................................'...............~... + // aesr v0.16b, v25.16b // ..................~.....................................'.................*.....................................'.................~. + // aesr v0.16b, v26.16b // ....................~...................................'...................*...................................'................... + // aese v0.16b, v27.16b // ......................~.................................'.....................*.................................'................... + // eor v0.16b, v0.16b, v28.16b // ........................~...............................'.......................*...............................'................... + // eor v0.16b, v0.16b, v29.16b // ..........................~.............................'.........................*.............................'................... + // str q0, [x2], #(4*16) // ..............................~.........................'.............................*.........................'................... + // ldr q12, [x6, #48] // ..~.....................................................'.*.....................................................'.~................. + // ldr q13, [x6, #80] // ....~...................................................'...*...................................................'...~............... + // ldr q14, [x6, #64] // ...~....................................................'..*....................................................'..~................ 
+ // rev64 v29.16b, v29.16b // .......................~................................'......................*................................'................... + // eor v29.16b, v29.16b, v11.16b // .........................~..............................'........................*..............................'................... + // pmull v8.1q, v29.1d, v13.1d // ..............................~.........................'.............................*.........................'................... + // pmull2 v9.1q, v29.2d, v13.2d // .................................~......................'................................*......................'................... + // ext v11.16b, v29.16b, v29.16b, #8 // ............................~...........................'...........................*...........................'................... + // eor v11.16b, v11.16b, v29.16b // .................................~......................'................................*......................'................... + // pmull2 v10.1q, v11.2d, v14.2d // ...................................~....................'..................................*....................'................... + // ldr q29, [x0, #(-3*16)] // ........................................e...............'.......................................~...............'................... + // rev w14, w13 // .e......................................................'~......................................................'~.................. + // str w14, [sp, #(STACK_BASE_AES_ST + 1*16 + 12)] // ..e.....................................................'.~.....................................................'.~................. + // ldr q0, [sp, #(STACK_BASE_AES_ST + 1*16)] // ...........................................e............'..........................................~............'................... 
+ // add w13, w13, #1 // .e......................................................'~......................................................'~.................. + // aesr v0.16b, v18.16b // .................................................e......'................................................~......'................... + // aesr v0.16b, v19.16b // ...................................................e....'..................................................~....'................... + // aesr v0.16b, v20.16b // .....................................................e..'....................................................~..'................... + // aesr v0.16b, v21.16b // .~......................................................'*......................................................'~.................. + // aesr v0.16b, v22.16b // ...~....................................................'..*....................................................'..~................ + // aesr v0.16b, v23.16b // .....~..................................................'....*..................................................'....~.............. + // aesr v0.16b, v24.16b // ...................~....................................'..................*....................................'................... + // aesr v0.16b, v25.16b // .....................~..................................'....................*..................................'................... + // aesr v0.16b, v26.16b // .......................~................................'......................*................................'................... + // aese v0.16b, v27.16b // .........................~..............................'........................*..............................'................... + // eor v0.16b, v0.16b, v28.16b // ...........................~............................'..........................*............................'................... 
+ // eor v0.16b, v0.16b, v29.16b // .............................~..........................'............................*..........................'................... + // str q0, [x2, #(-3*16)] // ...............................~........................'..............................*........................'................... + // rev64 v29.16b, v29.16b // ....................................................e...'...................................................~...'................... + // pmull v11.1q, v29.1d, v12.1d // .............~..........................................'............*..........................................'............~...... + // eor v8.16b, v8.16b, v11.16b // ................................~.......................'...............................*.......................'................... + // pmull2 v11.1q, v29.2d, v12.2d // ...............~........................................'..............*........................................'..............~.... + // eor v9.16b, v9.16b, v11.16b // ...................................~....................'..................................*....................'................... + // mov d11, v29.d[1] // .....~..................................................'....*..................................................'....~.............. + // eor v11.8b, v11.8b, v29.8b // .........~..............................................'........*..............................................'........~.......... + // pmull v11.1q, v11.1d, v14.1d // .................~......................................'................*......................................'................~.. + // eor v10.16b, v10.16b, v11.16b // .....................................~..................'....................................*..................'................... 
+ // ldr q29, [x0, #(-2*16)] // .......~................................................'......*................................................'......~............ + // rev w14, w13 // ...e....................................................'..~....................................................'..~................ + // str w14, [sp, #(STACK_BASE_AES_ST + 2*16 + 12)] // .....e..................................................'....~..................................................'....~.............. + // ldr q0, [sp, #(STACK_BASE_AES_ST + 2*16)] // ..............e.........................................'.............~.........................................'.............~..... + // add w13, w13, #1 // ..e.....................................................'.~.....................................................'.~................. + // aesr v0.16b, v18.16b // ..........................e.............................'.........................~.............................'................... + // aesr v0.16b, v19.16b // ............................e...........................'...........................~...........................'................... + // aesr v0.16b, v20.16b // ....................................e...................'...................................~...................'................... + // aesr v0.16b, v21.16b // .........................................e..............'........................................~..............'................... + // aesr v0.16b, v22.16b // .............................................e..........'............................................~..........'................... + // aesr v0.16b, v23.16b // ~.......................................................*.......................................................~................... + // aesr v0.16b, v24.16b // ..~.....................................................'.*.....................................................'.~................. 
+ // aesr v0.16b, v25.16b // .......~................................................'......*................................................'......~............ + // aesr v0.16b, v26.16b // .........~..............................................'........*..............................................'........~.......... + // aese v0.16b, v27.16b // ...........~............................................'..........*............................................'..........~........ + // eor v0.16b, v0.16b, v28.16b // .............~..........................................'............*..........................................'............~...... + // eor v0.16b, v0.16b, v29.16b // ...............~........................................'..............*........................................'..............~.... + // str q0, [x2, #(-2*16)] // .....................~..................................'....................*..................................'................... + // ldr q12, [x6] // ......................................~.................'.....................................*.................'................... + // ldr q13, [x6, #32] // ......................................................e.'.....................................................~.'................... + // ldr q14, [x6, #16] // ...............................~........................'..............................*........................'................... + // rev64 v29.16b, v29.16b // ...........~............................................'..........*............................................'..........~........ + // pmull v11.1q, v29.1d, v13.1d // .............................~..........................'............................*..........................'................... + // eor v8.16b, v8.16b, v11.16b // ....................................~...................'...................................*...................'................... 
+ // pmull2 v11.1q, v29.2d, v13.2d // ...........................~............................'..........................*............................'................... + // eor v9.16b, v9.16b, v11.16b // ..........................................~.............'.........................................*.............'................... + // ext v11.16b, v29.16b, v29.16b, #8 // ....................~...................................'...................*...................................'................... + // eor v11.16b, v11.16b, v29.16b // ......................~.................................'.....................*.................................'................... + // pmull2 v11.1q, v11.2d, v14.2d // .....................................~..................'....................................*..................'................... + // eor v10.16b, v10.16b, v11.16b // .......................................~................'......................................*................'................... + // ldr q29, [x0, #(-1*16)] // ..................................~.....................'.................................*.....................'................... + // rev w14, w13 // .~......................................................'*......................................................'~.................. + // str w14, [sp, #(STACK_BASE_AES_ST + 3*16 + 12)] // ....~...................................................'...*...................................................'...~............... + // ldr q0, [sp, #(STACK_BASE_AES_ST + 3*16)] // ..................~.....................................'.................*.....................................'.................~. + // add w13, w13, #1 // ...e....................................................'..~....................................................'..~................ 
+ // aesr v0.16b, v18.16b // ........................~...............................'.......................*...............................'................... + // aesr v0.16b, v19.16b // ................................~.......................'...............................*.......................'................... + // aesr v0.16b, v20.16b // ..................................~.....................'.................................*.....................'................... + // aesr v0.16b, v21.16b // ......................................~.................'.....................................*.................'................... + // aesr v0.16b, v22.16b // ........................................~...............'.......................................*...............'................... + // aesr v0.16b, v23.16b // ............................................~...........'...........................................*...........'................... + // aesr v0.16b, v24.16b // ................................................~.......'...............................................*.......'................... + // aesr v0.16b, v25.16b // ..................................................~.....'.................................................*.....'................... + // aesr v0.16b, v26.16b // ....................................................~...'...................................................*...'................... + // aese v0.16b, v27.16b // ......................................................~.'.....................................................*.'................... + // eor v0.16b, v0.16b, v28.16b // ............~...........................................'...........~...........................................'...........l....... + // eor v0.16b, v0.16b, v29.16b // ................~.......................................'...............~.......................................'...............l... 
+ // str q0, [x2, #(-1*16)] // ...................~....................................'..................~....................................'..................l + // rev64 v29.16b, v29.16b // .......................................~................'......................................*................'................... + // pmull v11.1q, v29.1d, v12.1d // ...........................................~............'..........................................*............'................... + // eor v8.16b, v8.16b, v11.16b // ..............................................~.........'.............................................*.........'................... + // pmull2 v11.1q, v29.2d, v12.2d // ..........................................~.............'.........................................*.............'................... + // eor v9.16b, v9.16b, v11.16b // .............................................~..........'............................................*..........'................... + // mov d11, v29.d[1] // .........................................~..............'........................................*..............'................... + // eor v11.8b, v11.8b, v29.8b // ............................................~...........'...........................................*...........'................... + // pmull v11.1q, v11.1d, v14.1d // ..............................................~.........'.............................................*.........'................... + // eor v10.16b, v10.16b, v11.16b // .................................................~......'................................................*......'................... + // eor v0.16b, v8.16b, v9.16b // ................................................~.......'...............................................*.......'................... 
+ // pmull v1.1q, v9.1d, v7.1d // ...............................................~........'..............................................*........'................... + // ext v9.16b, v9.16b, v9.16b, #8 // ...............................................~........'..............................................*........'................... + // eor v10.16b, v10.16b, v0.16b // ...................................................~....'..................................................*....'................... + // eor v1.16b, v9.16b, v1.16b // ..................................................~.....'.................................................*.....'................... + // eor v10.16b, v10.16b, v1.16b // .....................................................~..'....................................................*..'................... + // pmull v9.1q, v10.1d, v7.1d // .......................................................~'......................................................*'................... + // eor v8.16b, v8.16b, v9.16b // ......~.................................................'.....~.................................................'.....l............. + // ext v10.16b, v10.16b, v10.16b, #8 // .......................................................~'......................................................*'................... + // eor v11.16b, v8.16b, v10.16b // ........~...............................................'.......~...............................................'.......l........... + // ext v11.16b, v11.16b, v11.16b, #8 // ..........~.............................................'.........~.............................................'.........l......... 
+ + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_start_postamble:// end of loop kernel + // Instructions: 106 + // Expected cycles: 52 + // Expected IPC: 2.04 + // + // Cycle bound: 52.0 + // IPC bound: 2.04 + // + // Wall time: 19.98s + // User time: 19.98s + // + // ---------------- cycle (expected) -----------------> + // 0 25 50 + // |------------------------|------------------------|- + ldr q3, [x6, #80] // *................................................... + aesr v4.16b, v21.16b // *................................................... + rev w7, w14 // *................................................... + eor v12.16B, v1.16B, v11.16B // .*.................................................. + aesr v14.16b, v23.16b // .*.................................................. + str w7, [sp, #STACK_BASE_AES_ST + 60] // .*.................................................. // @slothy:writes=stack_3 + mov d15, v29.d[1] // ..*................................................. + aesr v4.16b, v22.16b // ..*................................................. + eor v16.16B, v12.16B, v16.16B // ...*................................................ + aesr v14.16b, v24.16b // ...*................................................ + aesr v4.16b, v23.16b // ....*............................................... + ldr q12, [x0], #(4*16) // ....*............................................... + aesr v14.16b, v25.16b // .....*.............................................. + ext v13.16B, v16.16B, v16.16B, #8 // .....*.............................................. + ldr q30, [x6, #64] // ......*............................................. + aesr v4.16b, v24.16b // ......*............................................. + eor v16.8B, v15.8B, v29.8B // .......*............................................ + aesr v14.16b, v26.16b // .......*............................................ + aesr v4.16b, v25.16b // ........*........................................... 
+ rev64 v11.16B, v12.16B // ........*........................................... + ldr q5, [x6, #16] // .........*.......................................... + aese v14.16b, v27.16b // .........*.......................................... + aesr v4.16b, v26.16b // ..........*......................................... + eor v1.16B, v11.16B, v13.16B // ..........*......................................... + pmull v15.1q, v16.1d, v30.1d // ...........*........................................ + ldr q31, [x6, #48] // ...........*........................................ + pmull v9.1q, v1.1d, v3.1d // ............*....................................... + ext v13.16B, v1.16B, v1.16B, #8 // ............*....................................... + pmull2 v6.1q, v1.2d, v3.2d // .............*...................................... + ldr q3, [sp, #STACK_BASE_AES_ST] // .............*...................................... // @slothy:reads=stack_0 + aese v4.16b, v27.16b // ..............*..................................... + eor v1.16B, v13.16B, v1.16B // ..............*..................................... + pmull2 v11.1q, v29.2d, v31.2d // ...............*.................................... + ldr q16, [sp, #STACK_BASE_AES_ST + 48] // ...............*.................................... // @slothy:reads=stack_3 + pmull2 v13.1q, v1.2d, v30.2d // ................*................................... + ldr q1, [x6] // ................*................................... + eor v6.16B, v6.16B, v11.16B // .................*.................................. + aesr v3.16b, v18.16b // .................*.................................. + ldr q11, [x0, #-32] // ..................*................................. + pmull v29.1q, v29.1d, v31.1d // ..................*................................. + ldr q10, [x0, #-16] // ...................*................................ + aesr v16.16b, v18.16b // ...................*................................ 
+ eor v29.16B, v9.16B, v29.16B // ....................*............................... + aesr v3.16b, v19.16b // ....................*............................... + eor v2.16B, v2.16B, v28.16B // .....................*.............................. + aesr v16.16b, v19.16b // .....................*.............................. + rev64 v30.16B, v11.16B // ......................*............................. + aesr v3.16b, v20.16b // ......................*............................. + aesr v16.16b, v20.16b // .......................*............................ + eor v2.16B, v2.16B, v0.16B // .......................*............................ + pmull2 v31.1q, v30.2d, v8.2d // ........................*........................... + rev64 v9.16B, v10.16B // ........................*........................... + aesr v16.16b, v21.16b // .........................*.......................... + ext v0.16B, v30.16B, v30.16B, #8 // .........................*.......................... + eor v31.16B, v6.16B, v31.16B // ..........................*......................... + pmull2 v6.1q, v9.2d, v1.2d // ..........................*......................... + aesr v16.16b, v22.16b // ...........................*........................ + eor v0.16B, v0.16B, v30.16B // ...........................*........................ + eor v6.16B, v31.16B, v6.16B // ............................*....................... + pmull v8.1q, v30.1d, v8.1d // ............................*....................... + aesr v16.16b, v23.16b // .............................*...................... + mov d30, v9.d[1] // .............................*...................... + ext v31.16B, v6.16B, v6.16B, #8 // ..............................*..................... + pmull v1.1q, v9.1d, v1.1d // ..............................*..................... + eor v13.16B, v13.16B, v15.16B // ...............................*.................... + aesr v3.16b, v21.16b // ...............................*.................... 
+ eor v15.8B, v30.8B, v9.8B // ................................*................... + pmull2 v30.1q, v0.2d, v5.2d // ................................*................... + eor v9.16B, v29.16B, v8.16B // .................................*.................. + aesr v3.16b, v22.16b // .................................*.................. + eor v29.16B, v13.16B, v30.16B // ..................................*................. + pmull v30.1q, v15.1d, v5.1d // ..................................*................. + pmull v5.1q, v6.1d, v7.1d // ...................................*................ + eor v13.16B, v9.16B, v1.16B // ...................................*................ + eor v30.16B, v29.16B, v30.16B // ....................................*............... + aesr v3.16b, v23.16b // ....................................*............... + aesr v16.16b, v24.16b // .....................................*.............. + eor v1.16B, v13.16B, v6.16B // .....................................*.............. + eor v29.16B, v31.16B, v5.16B // ......................................*............. + aesr v3.16b, v24.16b // ......................................*............. + eor v0.16B, v30.16B, v1.16B // .......................................*............ + aesr v16.16b, v25.16b // .......................................*............ + eor v15.16B, v4.16B, v28.16B // ........................................*........... + aesr v3.16b, v25.16b // ........................................*........... + eor v1.16B, v0.16B, v29.16B // .........................................*.......... + aesr v16.16b, v26.16b // .........................................*.......... + eor v9.16B, v15.16B, v17.16B // ..........................................*......... + aesr v3.16b, v26.16b // ..........................................*......... + pmull v17.1q, v1.1d, v7.1d // ...........................................*........ 
+ eor v14.16B, v14.16B, v28.16B // ...........................................*........ + str q9, [x2, #16] // ............................................*....... + aese v3.16b, v27.16b // ............................................*....... + ext v5.16B, v1.16B, v1.16B, #8 // .............................................*...... + aese v16.16b, v27.16b // .............................................*...... + eor v1.16B, v13.16B, v17.16B // ..............................................*..... + eor v8.16B, v3.16B, v28.16B // ..............................................*..... + eor v16.16B, v16.16B, v28.16B // ...............................................*.... + eor v15.16B, v14.16B, v11.16B // ...............................................*.... + eor v3.16B, v1.16B, v5.16B // ................................................*... + eor v17.16B, v8.16B, v12.16B // ................................................*... + eor v0.16B, v16.16B, v10.16B // .................................................*.. + str q15, [x2, #32] // .................................................*.. + ext v11.16B, v3.16B, v3.16B, #8 // ..................................................*. + str q17, [x2], #(4*16) // ..................................................*. + str q2, [x2, #-80] // ...................................................* + str q0, [x2, #-16] // ...................................................* + + // ---------------- cycle (expected) -----------------> + // 0 25 50 + // |------------------------|------------------------|- + // ldr q13, [sp, #STACK_BASE_AES_ST] // .............*...................................... + // aesr v14.16b, v23.16b // .*.................................................. + // aesr v4.16b, v21.16b // *................................................... + // rev w19, w14 // *................................................... + // ldr q6, [x6, #48] // ...........*........................................ 
+ // aesr v14.16b, v24.16b // ...*................................................ + // ldr q5, [x6, #64] // ......*............................................. + // aesr v4.16b, v22.16b // ..*................................................. + // ldr q9, [x6, #80] // *................................................... + // aesr v13.16b, v18.16b // .................*.................................. + // str w19, [sp, #STACK_BASE_AES_ST + 60] // .*.................................................. + // aesr v4.16b, v23.16b // ....*............................................... + // mov d30, v29.d[1] // ..*................................................. + // aesr v13.16b, v19.16b // ....................*............................... + // eor v10.16B, v1.16B, v11.16B // .*.................................................. + // aesr v14.16b, v25.16b // .....*.............................................. + // ldr q15, [x0, #32] // ..................*................................. + // aesr v13.16b, v20.16b // ......................*............................. + // eor v11.16B, v10.16B, v16.16B // ...*................................................ + // aesr v14.16b, v26.16b // .......*............................................ + // eor v31.8B, v30.8B, v29.8B // .......*............................................ + // aesr v13.16b, v21.16b // ...............................*.................... + // ext v11.16B, v11.16B, v11.16B, #8 // .....*.............................................. + // rev64 v30.16B, v15.16B // ......................*............................. + // aese v14.16b, v27.16b // .........*.......................................... + // aesr v13.16b, v22.16b // .................................*.................. + // eor v10.16B, v2.16B, v28.16B // .....................*.............................. + // eor v2.16B, v14.16B, v28.16B // ...........................................*........ 
+ // pmull v12.1q, v29.1d, v6.1d // ..................*................................. + // aesr v13.16b, v23.16b // ....................................*............... + // eor v15.16B, v2.16B, v15.16B // ...............................................*.... + // pmull2 v1.1q, v29.2d, v6.2d // ...............*.................................... + // eor v0.16B, v10.16B, v0.16B // .......................*............................ + // aesr v13.16b, v24.16b // ......................................*............. + // pmull v3.1q, v31.1d, v5.1d // ...........*........................................ + // ldr q31, [x0], #(4*16) // ....*............................................... + // aesr v13.16b, v25.16b // ........................................*........... + // ldr q2, [sp, #STACK_BASE_AES_ST + 48] // ...............*.................................... + // aesr v4.16b, v24.16b // ......*............................................. + // str q0, [x2, #-16] // ...................................................* + // ext v6.16B, v30.16B, v30.16B, #8 // .........................*.......................... + // aesr v13.16b, v26.16b // ..........................................*......... + // aesr v4.16b, v25.16b // ........*........................................... + // str q15, [x2, #32] // .................................................*.. + // eor v16.16B, v6.16B, v30.16B // ...........................*........................ + // aese v13.16b, v27.16b // ............................................*....... + // aesr v4.16b, v26.16b // ..........*......................................... + // rev64 v15.16B, v31.16B // ........*........................................... + // eor v10.16B, v13.16B, v28.16B // ..............................................*..... + // aesr v2.16b, v18.16b // ...................*................................ + // aese v4.16b, v27.16b // ..............*..................................... 
+ // eor v29.16B, v15.16B, v11.16B // ..........*......................................... + // eor v31.16B, v10.16B, v31.16B // ................................................*... + // eor v6.16B, v4.16B, v28.16B // ........................................*........... + // pmull2 v4.1q, v30.2d, v8.2d // ........................*........................... + // ext v0.16B, v29.16B, v29.16B, #8 // ............*....................................... + // eor v10.16B, v6.16B, v17.16B // ..........................................*......... + // pmull v8.1q, v30.1d, v8.1d // ............................*....................... + // pmull v17.1q, v29.1d, v9.1d // ............*....................................... + // str q31, [x2], #(4*16) // ..................................................*. + // str q10, [x2, #-48] // ............................................*....... + // ldr q10, [x6, #16] // .........*.......................................... + // aesr v2.16b, v19.16b // .....................*.............................. + // eor v11.16B, v17.16B, v12.16B // ....................*............................... + // eor v17.16B, v0.16B, v29.16B // ..............*..................................... + // pmull2 v6.1q, v29.2d, v9.2d // .............*...................................... + // aesr v2.16b, v20.16b // .......................*............................ + // ldr q0, [x0, #-16] // ...................*................................ + // pmull2 v15.1q, v17.2d, v5.2d // ................*................................... + // eor v1.16B, v6.16B, v1.16B // .................*.................................. + // eor v6.16B, v11.16B, v8.16B // .................................*.................. + // eor v29.16B, v15.16B, v3.16B // ...............................*.................... + // pmull2 v15.1q, v16.2d, v10.2d // ................................*................... 
+ // ldr q8, [x6] // ................*................................... + // aesr v2.16b, v21.16b // .........................*.......................... + // rev64 v31.16B, v0.16B // ........................*........................... + // eor v29.16B, v29.16B, v15.16B // ..................................*................. + // aesr v2.16b, v22.16b // ...........................*........................ + // mov d16, v31.d[1] // .............................*...................... + // eor v1.16B, v1.16B, v4.16B // ..........................*......................... + // pmull2 v11.1q, v31.2d, v8.2d // ..........................*......................... + // pmull v15.1q, v31.1d, v8.1d // ..............................*..................... + // aesr v2.16b, v23.16b // .............................*...................... + // eor v8.8B, v16.8B, v31.8B // ................................*................... + // eor v16.16B, v1.16B, v11.16B // ............................*....................... + // pmull v8.1q, v8.1d, v10.1d // ..................................*................. + // eor v1.16B, v6.16B, v15.16B // ...................................*................ + // ext v15.16B, v16.16B, v16.16B, #8 // ..............................*..................... + // pmull v12.1q, v16.1d, v7.1d // ...................................*................ + // aesr v2.16b, v24.16b // .....................................*.............. + // eor v31.16B, v1.16B, v16.16B // .....................................*.............. + // eor v13.16B, v29.16B, v8.16B // ....................................*............... + // aesr v2.16b, v25.16b // .......................................*............ + // eor v16.16B, v15.16B, v12.16B // ......................................*............. + // eor v9.16B, v13.16B, v31.16B // .......................................*............ + // aesr v2.16b, v26.16b // .........................................*.......... 
+ // eor v13.16B, v9.16B, v16.16B // .........................................*.......... + // aese v2.16b, v27.16b // .............................................*...... + // pmull v11.1q, v13.1d, v7.1d // ...........................................*........ + // ext v16.16B, v13.16B, v13.16B, #8 // .............................................*...... + // eor v10.16B, v1.16B, v11.16B // ..............................................*..... + // eor v11.16B, v10.16B, v16.16B // ................................................*... + // ext v11.16B, v11.16B, v11.16B, #8 // ..................................................*. + // eor v10.16B, v2.16B, v28.16B // ...............................................*.... + // eor v0.16B, v10.16B, v0.16B // .................................................*.. + // str q0, [x2, #-16] // ...................................................* + + b Lloop_unrolled_start_end +Lloop_unrolled_start_iter_1: + + + ldr q29, [x0], #(4*16) + rev w14, w13 + str w14, [sp, #(STACK_BASE_AES_ST + 0*16 + 12)] // @slothy:writes=stack_0 + ldr q0, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:reads=stack_0 + add w13, w13, #1 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2], #(4*16) + + ldr q12, [x6, #48] + ldr q13, [x6, #80] + ldr q14, [x6, #64] + rev64 v29.16b, v29.16b + eor v29.16b, v29.16b, v11.16b + // Low product + pmull v8.1q, v29.1d, v13.1d + // High product + pmull2 v9.1q, v29.2d, v13.2d + // Middle product + ext v11.16b, v29.16b, v29.16b, #8 + eor v11.16b, v11.16b, v29.16b + pmull2 v10.1q, v11.2d, v14.2d + + ldr q29, [x0, #(-3*16)] + rev w14, w13 + str w14, [sp, #(STACK_BASE_AES_ST + 1*16 + 12)] // @slothy:writes=stack_1 + ldr q0, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:reads=stack_1 
+ add w13, w13, #1 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2, #(-3*16)] + + rev64 v29.16b, v29.16b + // Low product + pmull v11.1q, v29.1d, v12.1d + eor v8.16b, v8.16b, v11.16b + // High product + pmull2 v11.1q, v29.2d, v12.2d + eor v9.16b, v9.16b, v11.16b + // Middle product + mov d11, v29.d[1] + eor v11.8b, v11.8b, v29.8b + pmull v11.1q, v11.1d, v14.1d + eor v10.16b, v10.16b, v11.16b + + ldr q29, [x0, #(-2*16)] + rev w14, w13 + str w14, [sp, #(STACK_BASE_AES_ST + 2*16 + 12)] // @slothy:writes=stack_2 + ldr q0, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:reads=stack_2 + add w13, w13, #1 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2, #(-2*16)] + + ldr q12, [x6] + ldr q13, [x6, #32] + ldr q14, [x6, #16] + rev64 v29.16b, v29.16b + + // Low product + pmull v11.1q, v29.1d, v13.1d + eor v8.16b, v8.16b, v11.16b + // High product + pmull2 v11.1q, v29.2d, v13.2d + eor v9.16b, v9.16b, v11.16b + // Middle product + ext v11.16b, v29.16b, v29.16b, #8 + eor v11.16b, v11.16b, v29.16b + pmull2 v11.1q, v11.2d, v14.2d + eor v10.16b, v10.16b, v11.16b + + ldr q29, [x0, #(-1*16)] + rev w14, w13 + str w14, [sp, #(STACK_BASE_AES_ST + 3*16 + 12)] // @slothy:writes=stack_3 + ldr q0, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:reads=stack_3 + add w13, w13, #1 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, 
v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2, #(-1*16)] + + rev64 v29.16b, v29.16b + // Low product + pmull v11.1q, v29.1d, v12.1d + eor v8.16b, v8.16b, v11.16b + // High product + pmull2 v11.1q, v29.2d, v12.2d + eor v9.16b, v9.16b, v11.16b + // Middle product + mov d11, v29.d[1] + eor v11.8b, v11.8b, v29.8b + pmull v11.1q, v11.1d, v14.1d + eor v10.16b, v10.16b, v11.16b + + eor v0.16b, v8.16b, v9.16b + pmull v1.1q, v9.1d, v7.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v0.16b + eor v1.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v1.16b + pmull v9.1q, v10.1d, v7.1d + eor v8.16b, v8.16b, v9.16b + ext v10.16b, v10.16b, v10.16b, #8 + eor v11.16b, v8.16b, v10.16b + ext v11.16b, v11.16b, v11.16b, #8 + +Lloop_unrolled_start_iter_1_end: +Lloop_unrolled_start_end: +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518.
+.section .note.GNU-stack,"",%progbits +#endif \ No newline at end of file diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag.S new file mode 100644 index 00000000000..3d90f589c26 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag.S @@ -0,0 +1,1602 @@ +// Copyright (c) 2022, ARM Inc. + +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. + +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// Author: Hanno Becker + +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . + +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy).
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +/* NOTE(review): every #include in this prologue has lost its operand; presumably these were the <openssl/...> headers that supply AARCH64_VALID_CALL_TARGET and __ARM_MAX_ARCH__ -- TODO restore from the upstream file */ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +/* Loop bookkeeping. count shares x1 with len_bits, so len_bits is dead once count has been written. */ +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +/* v0 is shared by aes_st, res and modulo_tmp0 (all declared below). */ +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +/* Ht3..Ht8 and Ht34/Ht56/Ht78 are aliases of Ht1/Ht2/Ht12 (v12-v14): only one triple of H-table entries can be held at a time. */ +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +/* Round keys rk0..rk10 are kept in v18..v28 (q and v views). */ +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21
+rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 4 + +/* Stack frame, as byte offsets from sp after the kernel's initial sub: GPR save area (6*16 bytes) at STACK_BASE_GPRS, d8-d15 save area (4*16 bytes) at STACK_BASE_VREGS, then UNROLL 16-byte slots at STACK_BASE_AES_ST in which counter blocks are staged before being loaded into vector registers. */ +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +/* Store d8-d15 (the callee-saved low halves of v8-v15) to the vector save area. */ +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Reload d8-d15 from the vector save area. */ +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Store x19-x30 to the GPR save area. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/* Reload x19-x30 from the GPR save area. */ +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +// In: byte_len. Out: full_blocks = byte_len / 16, count = full_blocks / UNROLL, remainder = full_blocks - count * UNROLL. Clobbers unroll. +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, 
unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +/* Read the 16 bytes at [ivec]: the first 8 bytes go to ivec_0_63, the low 32 bits of the second 8 bytes stay in ivec_64_96, and the upper 32 bits are byte-reversed into ctr. */ +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bits (a W-register write zeroes bits 63:32) +.endm + +/* Build the counter block for (ctr + loc) in stack slot loc and load it into <aes_st>_q. ctr itself is not modified; clobbers ctr_tmp. */ +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESE+AESMC +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + +/* Nine aesr rounds on \data using registers <key>0 .. <key>8. */ +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +/* NOTE(review): aesr_9_10 and aesr_11_12 are not referenced by the macros in this section, and no rk11/rk12 aliases are declared above -- presumably left over from the longer-key variants; confirm. */ +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +// out = inA ^ inB ^ inC, computed as two EORs +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +/* Last round: AESE with rk9 (no AESMC), then \out = state ^ rk10 ^ \plain. \aes_st is overwritten. */ +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +/* \output = \input ^ AES(counter block for ctr + \loc), using round keys rk0..rk10. */ +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +/* Load round key \i from [key + 16*\i] into rk<i>q. */ +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] 
+.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation +// Byte offsets used below: h1 @0, h12 @16, h2 @32, h3 @48, h34 @64, h4 @80, h5 @96, h56 @112, h6 @128, h7 @144, h78 @160, h8 @176. +// The dst argument of the load_h* macros is unused; only dst_q is referenced. + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +/* NOTE(review): with the register aliases declared in this file (Ht3/Ht5 = Ht1, Ht4/Ht6 = Ht2, Ht34/Ht56 = Ht12) each later load overwrites the earlier one, so after this macro only h5, h6 and h56 remain loaded; it also omits h7/h8/h78. Presumably unused here -- confirm before relying on it. */ +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +/* Each load_htable_XY loads the two table entries X and Y and their shared entry XY into the three Ht registers. */ +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH update */
+/********************************************************************/ + +/* Naming of the macros below: ghash_init_* start a fresh (ghash_lo, ghash_hi, ghash_mid) accumulator, ghash_acc_* XOR a further product into it. Suffix _0 multiplies the middle term by the low 64 bits of Hk_mid (pmull), suffix _1 by its high 64 bits (pmull2). The *_with_tag_* forms first XOR \tag into the byte-reversed input. Every variant byte-reverses \input in place (rev64) and clobbers ghash_tmp. */ +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, 
\tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +/* Fold the (ghash_lo, ghash_hi, ghash_mid) accumulator into \tag using two multiplications by ghash_mod. Overwrites the three accumulator registers and modulo_tmp0/modulo_tmp1 (v0/v1; v0 is also aes_st/res). */ +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull 
modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +/* Load the 16-byte tag from [tag_ptr] and byte-reverse each 64-bit half. */ +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +/* movi sets every byte of the low 64 bits of ghash_mod to 0xc2 (upper 64 bits zero); the shift leaves 0xc200000000000000 in ghash_mod_d. */ +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +/* NOTE(review): the BORINGSSL_DISPATCH_TEST stub below uses the leading-underscore symbol and @PAGE/@PAGEOFF operators, which are Mach-O conventions; the __ELF__ configuration would presumably need the plain symbol with :lo12: -- confirm. */ +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end + cmp count, #1 + b.eq Lloop_unrolled_start_iter_1 + // Instructions: 143 + // Expected cycles: 68 + // Expected IPC: 2.10 + // + // Cycle bound: 68.0 + // IPC bound: 2.10 + // + // Wall time: 61.39s + // User time: 61.39s + // + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + ldr q6, [x0, #32] // *................................................................... + add w30, w13, #2 // *...................................................................
+ add w22, w13, #0 // *................................................................... + add w29, w13, #3 // *................................................................... + ldr q0, [x0, #48] // .*.................................................................. + add w8, w13, #UNROLL // .*.................................................................. + add w27, w13, #1 // .*.................................................................. + rev w26, w29 // .*.................................................................. + ldr q16, [x6, #32] // ..*................................................................. + orr x21, x12, x26, lsl #32 // ..*................................................................. + add w20, w8, #1 // ..*................................................................. + rev w26, w27 // ..*................................................................. + ldr q29, [x6] // ...*................................................................ + orr x27, x12, x26, lsl #32 // ...*................................................................ + add w25, w8, #3 // ...*................................................................ + stp x11, x21, [sp, #STACK_BASE_AES_ST + 48] // ...*................................................................ // @slothy:writes=stack_3 + rev64 v12.16B, v6.16B // ....*............................................................... + ldr q10, [x6, #16] // ....*............................................................... + rev w7, w25 // ....*............................................................... + add w26, w8, #0 // ....*............................................................... + ldr q2, [sp, #STACK_BASE_AES_ST + 48] // .....*.............................................................. // @slothy:reads=stack_3 + rev64 v14.16B, v0.16B // .....*.............................................................. 
+ rev w24, w26 // .....*.............................................................. + orr x25, x12, x7, lsl #32 // .....*.............................................................. + pmull v3.1q, v12.1d, v16.1d // ......*............................................................. + ext v9.16B, v12.16B, v12.16B, #8 // ......*............................................................. + orr x28, x12, x24, lsl #32 // ......*............................................................. + rev w21, w30 // ......*............................................................. + mov d5, v14.d[1] // .......*............................................................ + pmull v8.1q, v14.1d, v29.1d // .......*............................................................ + stp x11, x28, [sp, #STACK_BASE_AES_ST] // .......*............................................................ // @slothy:writes=stack_0 + orr x23, x12, x21, lsl #32 // .......*............................................................ + pmull2 v29.1q, v14.2d, v29.2d // ........*........................................................... + eor v31.16B, v9.16B, v12.16B // ........*........................................................... + stp x11, x23, [sp, #STACK_BASE_AES_ST + 32] // ........*........................................................... // @slothy:writes=stack_2 + rev w23, w22 // ........*........................................................... + eor v9.16B, v8.16B, v3.16B // .........*.......................................................... + aesr v2.16b, v18.16b // .........*.......................................................... + add w13, w8, #UNROLL // .........*.......................................................... + stp x11, x27, [sp, #STACK_BASE_AES_ST + 16] // .........*.......................................................... 
// @slothy:writes=stack_1 + ldr q8, [sp, #STACK_BASE_AES_ST + 32] // ..........*......................................................... // @slothy:reads=stack_2 + pmull2 v17.1q, v31.2d, v10.2d // ..........*......................................................... + add w21, w8, #2 // ..........*......................................................... + ldr q13, [x0], #(4*16) // ...........*........................................................ + aesr v2.16b, v19.16b // ...........*........................................................ + orr x8, x12, x23, lsl #32 // ...........*........................................................ + ldr q4, [sp, #STACK_BASE_AES_ST] // ............*....................................................... // @slothy:reads=stack_0 + pmull2 v31.1q, v12.2d, v16.2d // ............*....................................................... + aesr v2.16b, v20.16b // .............*...................................................... + eor v14.8B, v5.8B, v14.8B // .............*...................................................... + stp x11, x8, [sp, #STACK_BASE_AES_ST] // .............*...................................................... // @slothy:writes=stack_0 + eor v16.16B, v29.16B, v31.16B // ..............*..................................................... + aesr v8.16b, v18.16b // ..............*..................................................... + ldr q5, [sp, #STACK_BASE_AES_ST] // ...............*.................................................... // @slothy:reads=stack_0 + pmull v3.1q, v14.1d, v10.1d // ...............*.................................................... + aesr v8.16b, v19.16b // ................*................................................... + rev64 v12.16B, v13.16B // ................*................................................... + ldr q11, [x0, #-48] // .................*.................................................. 
+ aesr v4.16b, v18.16b // .................*.................................................. + ldr q31, [x0, #32] // ..................*................................................. + aesr v8.16b, v20.16b // ..................*................................................. + eor v10.16B, v3.16B, v17.16B // ...................*................................................ + aesr v5.16b, v18.16b // ...................*................................................ + aesr v8.16b, v21.16b // ....................*............................................... + eor v3.16B, v12.16B, v30.16B // ....................*............................................... + aesr v5.16b, v19.16b // .....................*.............................................. + rev64 v15.16B, v11.16B // .....................*.............................................. + aesr v8.16b, v22.16b // ......................*............................................. + stp x11, x25, [sp, #STACK_BASE_AES_ST + 48] // ......................*............................................. // @slothy:writes=stack_3 + rev w22, w21 // ......................*............................................. + aesr v5.16b, v20.16b // .......................*............................................ + mov d12, v15.d[1] // .......................*............................................ + orr x24, x12, x22, lsl #32 // .......................*............................................ + aesr v8.16b, v23.16b // ........................*........................................... + stp x11, x24, [sp, #STACK_BASE_AES_ST + 32] // ........................*........................................... // @slothy:writes=stack_2 + eor v29.8B, v12.8B, v15.8B // .........................*.......................................... + aesr v5.16b, v21.16b // .........................*.......................................... 
+ aesr v8.16b, v24.16b // ..........................*......................................... + ldr q12, [sp, #STACK_BASE_AES_ST + 32] // ..........................*......................................... // @slothy:reads=stack_2 + aesr v5.16b, v22.16b // ...........................*........................................ + aesr v8.16b, v25.16b // ............................*....................................... + aesr v5.16b, v23.16b // .............................*...................................... + aesr v8.16b, v26.16b // ..............................*..................................... + aesr v5.16b, v24.16b // ...............................*.................................... + aese v8.16b, v27.16b // ................................*................................... + aesr v5.16b, v25.16b // .................................*.................................. + aesr v2.16b, v21.16b // ..................................*................................. + eor v8.16B, v8.16B, v28.16B // ..................................*................................. + aesr v12.16b, v18.16b // ...................................*................................ + aesr v2.16b, v22.16b // ....................................*............................... + eor v17.16B, v8.16B, v6.16B // ....................................*............................... + aesr v12.16b, v19.16b // .....................................*.............................. + ldr q6, [x6, #48] // .....................................*.............................. + str q17, [x2, #32] // ......................................*............................. + aesr v2.16b, v23.16b // ......................................*............................. + aesr v12.16b, v20.16b // .......................................*............................ + rev64 v8.16B, v31.16B // ........................................*........................... 
+ aesr v2.16b, v24.16b // ........................................*........................... + aesr v12.16b, v21.16b // .........................................*.......................... + aesr v5.16b, v26.16b // ..........................................*......................... + aesr v12.16b, v22.16b // ...........................................*........................ + aese v5.16b, v27.16b // ............................................*....................... + aesr v12.16b, v23.16b // .............................................*...................... + aesr v4.16b, v19.16b // ..............................................*..................... + ext v1.16B, v3.16B, v3.16B, #8 // ..............................................*..................... + eor v14.16B, v5.16B, v28.16B // ...............................................*.................... + aesr v12.16b, v24.16b // ...............................................*.................... + eor v5.16B, v1.16B, v3.16B // ................................................*................... + aesr v2.16b, v25.16b // ................................................*................... + eor v14.16B, v14.16B, v13.16B // .................................................*.................. + aesr v12.16b, v25.16b // .................................................*.................. + aesr v4.16b, v20.16b // ..................................................*................. + aesr v12.16b, v26.16b // ...................................................*................ + ldr q13, [x6, #80] // ...................................................*................ + aesr v4.16b, v21.16b // ....................................................*............... + str q14, [x2], #(4*16) // ....................................................*............... + aese v12.16b, v27.16b // .....................................................*.............. 
+ aesr v4.16b, v22.16b // ......................................................*............. + pmull2 v1.1q, v3.2d, v13.2d // .......................................................*............ + aesr v4.16b, v23.16b // ........................................................*........... + eor v14.16B, v12.16B, v28.16B // ........................................................*........... + ldr q17, [x6, #32] // .........................................................*.......... + aesr v2.16b, v26.16b // .........................................................*.......... + eor v30.16B, v14.16B, v31.16B // ..........................................................*......... + aesr v4.16b, v24.16b // ..........................................................*......... + ldr q14, [x6, #64] // ...........................................................*........ + aese v2.16b, v27.16b // ...........................................................*........ + str q30, [x2, #32] // ............................................................*....... + aesr v4.16b, v25.16b // ............................................................*....... + eor v2.16B, v2.16B, v28.16B // .............................................................*...... + pmull v31.1q, v15.1d, v6.1d // .............................................................*...... + aesr v4.16b, v26.16b // ..............................................................*..... + ldr q12, [x0], #(4*16) // ..............................................................*..... + pmull v13.1q, v3.1d, v13.1d // ...............................................................*.... + eor v9.16B, v9.16B, v31.16B // ...............................................................*.... + aese v4.16b, v27.16b // ................................................................*... + eor v2.16B, v2.16B, v0.16B // ................................................................*... 
+ pmull2 v0.1q, v5.2d, v14.2d // .................................................................*.. + eor v9.16B, v9.16B, v13.16B // .................................................................*.. + pmull2 v30.1q, v15.2d, v6.2d // ..................................................................*. + eor v31.16B, v4.16B, v28.16B // ..................................................................*. + pmull v29.1q, v29.1d, v14.1d // ...................................................................* + ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ...................................................................* // @slothy:reads=stack_1 + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // add W, w13, #2 // *................................................................... + // add W, w13, #0 // *................................................................... + // rev W, W // ......*............................................................. + // rev W, W // ........*........................................................... + // orr X, x12, X, lsl #32 // .......*............................................................ + // stp x11, X, [sp, #STACK_BASE_AES_ST + 32] // ........*........................................................... + // orr X, x12, X, lsl #32 // ...........*........................................................ + // ldr Q, [sp, #STACK_BASE_AES_ST + 32] // ..........*......................................................... + // add W, w13, #3 // *................................................................... + // add W, w13, #1 // .*.................................................................. + // stp x11, X, [sp, #STACK_BASE_AES_ST] // .............*...................................................... + // add W, w13, #UNROLL // .*.................................................................. 
+ // aesr V.16b, v18.16b // ..............*..................................................... + // rev W, W // .*.................................................................. + // orr X, x12, X, lsl #32 // ..*................................................................. + // aesr V.16b, v19.16b // ................*................................................... + // ldr Q, [sp, #STACK_BASE_AES_ST] // ...............*.................................................... + // aesr V.16b, v20.16b // ..................*................................................. + // aesr V.16b, v21.16b // ....................*............................................... + // ldr Q, [x0, #32] // *................................................................... + // aesr V.16b, v22.16b // ......................*............................................. + // aesr V.16b, v23.16b // ........................*........................................... + // aesr V.16b, v24.16b // ..........................*......................................... + // aesr V.16b, v25.16b // ............................*....................................... + // aesr V.16b, v26.16b // ..............................*..................................... + // aesr V.16b, v18.16b // ...................*................................................ + // aese V.16b, v27.16b // ................................*................................... + // stp x11, X, [sp, #STACK_BASE_AES_ST + 48] // ...*................................................................ + // aesr V.16b, v19.16b // .....................*.............................................. + // eor V.16B, V.16B, v28.16B // ..................................*................................. + // aesr V.16b, v20.16b // .......................*............................................ + // aesr V.16b, v21.16b // .........................*.......................................... 
+ // eor V.16B, V.16B, V.16B // ....................................*............................... + // aesr V.16b, v22.16b // ...........................*........................................ + // aesr V.16b, v23.16b // .............................*...................................... + // rev64 V.16B, V.16B // ....*............................................................... + // ldr Q, [x6, #32] // ..*................................................................. + // str Q, [x2, #32] // ......................................*............................. + // aesr V.16b, v24.16b // ...............................*.................................... + // aesr V.16b, v25.16b // .................................*.................................. + // aesr V.16b, v26.16b // ..........................................*......................... + // aese V.16b, v27.16b // ............................................*....................... + // eor V.16B, V.16B, v28.16B // ...............................................*.................... + // ldr Q, [x0], #(4*16) // ...........*........................................................ + // add W, W, #2 // ..........*......................................................... + // pmull2 V.1q, V.2d, V.2d // ............*....................................................... + // ldr Q, [x0, #-16] // .*.................................................................. + // add W, W, #0 // ....*............................................................... + // rev W, W // ......................*............................................. + // rev W, W // .....*.............................................................. + // orr X, x12, X, lsl #32 // .......................*............................................ + // ldr Q, [x6] // ...*................................................................ 
+ // pmull V.1q, V.1d, V.1d // ......*............................................................. + // rev W, W // ..*................................................................. + // stp x11, X, [sp, #STACK_BASE_AES_ST + 32] // ........................*........................................... + // orr X, x12, X, lsl #32 // ...*................................................................ + // orr X, x12, X, lsl #32 // ......*............................................................. + // rev64 V.16B, V.16B // .....*.............................................................. + // ldr Q, [sp, #STACK_BASE_AES_ST + 32] // ..........................*......................................... + // add W, W, #3 // ...*................................................................ + // add w20, W, #1 // ..*................................................................. + // stp x11, X, [sp, #STACK_BASE_AES_ST + 16] // .........*.......................................................... + // eor V.16B, V.16B, V.16B // .................................................*.................. + // pmull2 V.1q, V.2d, V.2d // ........*........................................................... + // stp x11, X, [sp, #STACK_BASE_AES_ST] // .......*............................................................ + // add w13, W, #UNROLL // .........*.......................................................... + // eor v16.16B, V.16B, V.16B // ..............*..................................................... + // aesr V.16b, v18.16b // ...................................*................................ + // rev W, W // ....*............................................................... + // orr X, x12, X, lsl #32 // .....*.............................................................. + // str Q, [x2], #(4*16) // ....................................................*............... 
+ // aesr V.16b, v19.16b // .....................................*.............................. + // ldr Q, [sp, #STACK_BASE_AES_ST] // ............*....................................................... + // aesr V.16b, v20.16b // .......................................*............................ + // rev64 V.16B, V.16B // ................*................................................... + // aesr V.16b, v21.16b // .........................................*.......................... + // ldr Q, [x0, #32] // ..................*................................................. + // aesr V.16b, v22.16b // ...........................................*........................ + // ldr Q, [x6, #16] // ....*............................................................... + // aesr V.16b, v23.16b // .............................................*...................... + // ldr Q, [x6, #80] // ...................................................*................ + // aesr V.16b, v24.16b // ...............................................*.................... + // pmull V.1q, V.1d, V.1d // .......*............................................................ + // ext V.16B, V.16B, V.16B, #8 // ......*............................................................. + // aesr V.16b, v25.16b // .................................................*.................. + // aesr V.16b, v26.16b // ...................................................*................ + // ldr Q, [sp, #STACK_BASE_AES_ST + 48] // .....*.............................................................. + // aesr V.16b, v18.16b // .................*.................................................. + // eor V.16B, V.16B, V.16B // .........*.......................................................... + // aese V.16b, v27.16b // .....................................................*.............. + // stp x11, X, [sp, #STACK_BASE_AES_ST + 48] // ......................*............................................. 
+ // aesr V.16b, v19.16b // ..............................................*..................... + // aesr V.16b, v18.16b // .........*.......................................................... + // eor V.16B, V.16B, v28.16B // ........................................................*........... + // aesr V.16b, v20.16b // ..................................................*................. + // eor V.16B, V.16B, V.16B // ........*........................................................... + // aesr V.16b, v19.16b // ...........*........................................................ + // ldr q11, [x0, #-48] // .................*.................................................. + // aesr V.16b, v21.16b // ....................................................*............... + // pmull2 V.1q, V.2d, V.2d // ..........*......................................................... + // eor V.16B, V.16B, V.16B // ..........................................................*......... + // aesr V.16b, v22.16b // ......................................................*............. + // aesr V.16b, v20.16b // .............*...................................................... + // ldr Q, [x6, #48] // .....................................*.............................. + // eor V.16B, V.16B, v30.16B // ....................*............................................... + // aesr V.16b, v23.16b // ........................................................*........... + // mov D, V.d[1] // .......*............................................................ + // aesr V.16b, v21.16b // ..................................*................................. + // pmull V.1q, V.1d, V.1d // ...............................................................*.... + // rev64 V.16B, v11.16B // .....................*.............................................. + // rev64 v8.16B, V.16B // ........................................*........................... 
+ // aesr V.16b, v22.16b // ....................................*............................... + // ldr q17, [x6, #32] // .........................................................*.......... + // pmull2 v1.1q, V.2d, V.2d // .......................................................*............ + // str Q, [x2, #32] // ............................................................*....... + // aesr V.16b, v23.16b // ......................................*............................. + // pmull V.1q, V.1d, V.1d // .............................................................*...... + // ldr Q, [x6, #64] // ...........................................................*........ + // eor V.8B, V.8B, V.8B // .............*...................................................... + // aesr V.16b, v24.16b // ........................................*........................... + // aesr V.16b, v24.16b // ..........................................................*......... + // aesr V.16b, v25.16b // ................................................*................... + // eor V.16B, V.16B, V.16B // ...............................................................*.... + // ext V.16B, V.16B, V.16B, #8 // ..............................................*..................... + // aesr V.16b, v25.16b // ............................................................*....... + // eor v9.16B, V.16B, V.16B // .................................................................*.. + // aesr V.16b, v26.16b // .........................................................*.......... + // pmull2 v30.1q, V.2d, V.2d // ..................................................................*. + // mov D, V.d[1] // .......................*............................................ + // eor V.16B, V.16B, V.16B // ................................................*................... + // aese V.16b, v27.16b // ...........................................................*........ 
+ // aesr V.16b, v26.16b // ..............................................................*..... + // eor V.8B, V.8B, V.8B // .........................*.......................................... + // pmull2 v0.1q, V.2d, V.2d // .................................................................*.. + // eor V.16B, V.16B, v28.16B // .............................................................*...... + // ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ...................................................................* + // aese V.16b, v27.16b // ................................................................*... + // eor v2.16B, V.16B, V.16B // ................................................................*... + // pmull V.1q, V.1d, V.1d // ...............*.................................................... + // eor v31.16B, V.16B, v28.16B // ..................................................................*. + // pmull v29.1q, V.1d, V.1d // ...................................................................* + // eor v10.16B, V.16B, V.16B // ...................*................................................ + // ldr q12, [x0], #(4*16) // ..............................................................*..... + + sub count, count, #2 +cbz count, Lloop_unrolled_start_postamble +Lloop_unrolled_start: + // Instructions: 128 + // Expected cycles: 56 + // Expected IPC: 2.29 + // + // Cycle bound: 54.0 + // IPC bound: 2.37 + // + // Wall time: 3601.96s + // User time: 3601.96s + // + // ------------------ cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|----- + aesr v4.16b, v18.16b // l....................................................... + eor v16.16B, v16.16B, v30.16B // l....................................................... + add w26, w13, #2 // e....................................................... + pmull2 v14.1q, v8.2d, v17.2d // .*...................................................... 
+ ldr q6, [x0, #-16] // .*...................................................... + add w25, w13, #0 // .e...................................................... + rev w22, w26 // .e...................................................... + eor v30.16B, v16.16B, v1.16B // ..l..................................................... + aesr v4.16b, v19.16b // ..l..................................................... + rev w24, w25 // ..e..................................................... + orr x28, x12, x22, lsl #32 // ..e..................................................... + ldr q3, [x6] // ...*.................................................... + pmull v13.1q, v8.1d, v17.1d // ...*.................................................... + rev w22, w20 // ...*.................................................... + stp x11, x28, [sp, #STACK_BASE_AES_ST + 32] // ...e.................................................... // @slothy:writes=stack_2 + aesr v4.16b, v20.16b // ....l................................................... + eor v1.16B, v10.16B, v29.16B // ....l................................................... + orr x22, x12, x22, lsl #32 // ....*................................................... + orr x25, x12, x24, lsl #32 // ....e................................................... + rev64 v15.16B, v6.16B // .....*.................................................. + ldr q10, [sp, #STACK_BASE_AES_ST + 32] // .....e.................................................. // @slothy:reads=stack_2 + add w24, w13, #3 // .....e.................................................. + eor v1.16B, v1.16B, v0.16B // ......l................................................. + aesr v4.16b, v21.16b // ......l................................................. + add w20, w13, #1 // ......e................................................. + stp x11, x22, [sp, #STACK_BASE_AES_ST + 16] // ......*................................................. 
// @slothy:writes=stack_1 + eor v29.16B, v31.16B, v12.16B // .......*................................................ + pmull2 v31.1q, v15.2d, v3.2d // .......*................................................ + stp x11, x25, [sp, #STACK_BASE_AES_ST] // .......e................................................ // @slothy:writes=stack_0 + add w13, w13, #UNROLL // .......e................................................ + eor v0.16B, v9.16B, v30.16B // ........l............................................... + aesr v4.16b, v22.16b // ........l............................................... + eor v16.16B, v31.16B, v14.16B // .........*.............................................. + aesr v10.16b, v18.16b // .........e.............................................. + rev w22, w24 // .........e.............................................. + eor v14.16B, v1.16B, v0.16B // ..........l............................................. + aesr v4.16b, v23.16b // ..........l............................................. + orr x29, x12, x22, lsl #32 // ..........e............................................. + str q29, [x2], #(4*16) // ...........*............................................ + aesr v10.16b, v19.16b // ...........e............................................ + ext v1.16B, v30.16B, v30.16B, #8 // ............l........................................... + aesr v4.16b, v24.16b // ............l........................................... + ldr q29, [sp, #STACK_BASE_AES_ST] // .............e.......................................... // @slothy:reads=stack_0 + aesr v10.16b, v20.16b // .............e.......................................... + rev64 v17.16B, v12.16B // ..............*......................................... + pmull v31.1q, v30.1d, v7.1d // ..............l......................................... + str q2, [x2, #-80] // ...............l........................................ + aesr v10.16b, v21.16b // ...............e........................................ 
+ ldr q5, [x0, #32] // ................e....................................... + aesr v4.16b, v25.16b // ................l....................................... + eor v2.16B, v1.16B, v31.16B // .................l...................................... + aesr v10.16b, v22.16b // .................e...................................... + aesr v4.16b, v26.16b // ..................l..................................... + ldr q12, [x6, #16] // ..................*..................................... + eor v2.16B, v14.16B, v2.16B // ...................l.................................... + aesr v10.16b, v23.16b // ...................e.................................... + ldr q1, [x6, #80] // ....................*................................... + aese v4.16b, v27.16b // ....................l................................... + aesr v10.16b, v24.16b // .....................e.................................. + ext v0.16B, v2.16B, v2.16B, #8 // .....................l.................................. + pmull v30.1q, v15.1d, v3.1d // ......................*................................. + eor v14.16B, v4.16B, v28.16B // ......................l................................. + ext v4.16B, v8.16B, v8.16B, #8 // .......................*................................ + aesr v10.16b, v25.16b // .......................e................................ + eor v14.16B, v14.16B, v11.16B // ........................l............................... + pmull v31.1q, v2.1d, v7.1d // ........................l............................... + aesr v10.16b, v26.16b // .........................e.............................. + ldr q2, [sp, #STACK_BASE_AES_ST + 48] // .........................*.............................. // @slothy:reads=stack_3 + aesr v29.16b, v18.16b // ..........................e............................. + eor v3.16B, v9.16B, v31.16B // ..........................l............................. 
+ eor v9.16B, v30.16B, v13.16B // ...........................*............................ + aese v10.16b, v27.16b // ...........................e............................ + stp x11, x29, [sp, #STACK_BASE_AES_ST + 48] // ...........................e............................ // @slothy:writes=stack_3 + aesr v29.16b, v19.16b // ............................e........................... + eor v30.16B, v3.16B, v0.16B // ............................l........................... + aesr v2.16b, v18.16b // .............................*.......................... + eor v31.16B, v10.16B, v28.16B // .............................e.......................... + aesr v29.16b, v20.16b // ..............................e......................... + eor v10.16B, v4.16B, v8.16B // ..............................*......................... + str q14, [x2, #-112] // ...............................l........................ + aesr v2.16b, v19.16b // ...............................*........................ + ldr q11, [x0, #-48] // ................................*....................... + aesr v29.16b, v21.16b // ................................e....................... + pmull2 v3.1q, v10.2d, v12.2d // .................................*...................... + eor v13.16B, v31.16B, v5.16B // .................................e...................... + aesr v29.16b, v22.16b // ..................................e..................... + ext v30.16B, v30.16B, v30.16B, #8 // ..................................l..................... + aesr v2.16b, v20.16b // ...................................*.................... + ldr q31, [x6, #48] // ...................................*.................... + eor v10.16B, v17.16B, v30.16B // ....................................*................... + aesr v29.16b, v23.16b // ....................................e................... + mov d4, v15.d[1] // .....................................*.................. 
+ aesr v2.16b, v21.16b // .....................................*.................. + pmull v30.1q, v10.1d, v1.1d // ......................................*................. + rev64 v0.16B, v11.16B // ......................................*................. + rev64 v8.16B, v5.16B // .......................................e................ + aesr v2.16b, v22.16b // .......................................*................ + ldr q17, [x6, #32] // ........................................e............... + pmull2 v1.1q, v10.2d, v1.2d // ........................................*............... + str q13, [x2, #32] // .........................................e.............. + aesr v2.16b, v23.16b // .........................................*.............. + pmull v14.1q, v0.1d, v31.1d // ..........................................*............. + ldr q13, [x6, #64] // ..........................................*............. + eor v15.8B, v4.8B, v15.8B // ...........................................*............ + aesr v2.16b, v24.16b // ...........................................*............ + aesr v29.16b, v24.16b // ............................................e........... + aesr v2.16b, v25.16b // .............................................*.......... + eor v9.16B, v9.16B, v14.16B // .............................................*.......... + ext v4.16B, v10.16B, v10.16B, #8 // ..............................................*......... + aesr v29.16b, v25.16b // ..............................................e......... + eor v9.16B, v9.16B, v30.16B // ...............................................*........ + aesr v2.16b, v26.16b // ...............................................*........ + pmull2 v30.1q, v0.2d, v31.2d // ................................................*....... + mov d31, v0.d[1] // ................................................*....... + eor v4.16B, v4.16B, v10.16B // .................................................*...... 
+ aese v2.16b, v27.16b // .................................................*...... + aesr v29.16b, v26.16b // ..................................................e..... + eor v14.8B, v31.8B, v0.8B // ..................................................*..... + pmull2 v0.1q, v4.2d, v13.2d // ...................................................*.... + eor v31.16B, v2.16B, v28.16B // ...................................................*.... + ldr q4, [sp, #STACK_BASE_AES_ST + 16] // ....................................................*... // @slothy:reads=stack_1 + aese v29.16b, v27.16b // ....................................................e... + eor v2.16B, v31.16B, v6.16B // .....................................................*.. + pmull v15.1q, v15.1d, v12.1d // .....................................................*.. + eor v31.16B, v29.16B, v28.16B // ......................................................e. + pmull v29.1q, v14.1d, v13.1d // ......................................................*. + eor v10.16B, v15.16B, v3.16B // .......................................................* + ldr q12, [x0], #(4*16) // .......................................................e + + // ---------------------------------------------------------------- cycle (expected) ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------------- + // ldr q12, [x6] // ...~....................................................'..*....................................................'..~............................... + // ldr q13, [x6, #32] // ........................................e...............'.......................................~...............'.................................. 
+ // ldr q14, [x6, #16] // ..................~.....................................'.................*.....................................'.................~................ + // ldr q29, [x0, #(3*16)] // .~......................................................'*......................................................'~................................. + // add w14, w13, #3 // .....e..................................................'....~..................................................'....~............................. + // rev w14, w14 // .........e..............................................'........~..............................................'........~......................... + // orr x14, x12, x14, lsl #32 // ..........e.............................................'.........~.............................................'.........~........................ + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 3*16)] // ...........................e............................'..........................~............................'..........................~....... + // ldr q0, [sp, #(STACK_BASE_AES_ST + 3*16)] // .........................~..............................'........................*..............................'........................~......... + // aesr v0.16b, v18.16b // .............................~..........................'............................*..........................'............................~..... + // aesr v0.16b, v19.16b // ...............................~........................'..............................*........................'..............................~... + // aesr v0.16b, v20.16b // ...................................~....................'..................................*....................'.................................. + // aesr v0.16b, v21.16b // .....................................~..................'....................................*..................'.................................. 
+ // aesr v0.16b, v22.16b // .......................................~................'......................................*................'.................................. + // aesr v0.16b, v23.16b // .........................................~..............'........................................*..............'.................................. + // aesr v0.16b, v24.16b // ...........................................~............'..........................................*............'.................................. + // aesr v0.16b, v25.16b // .............................................~..........'............................................*..........'.................................. + // aesr v0.16b, v26.16b // ...............................................~........'..............................................*........'.................................. + // aese v0.16b, v27.16b // .................................................~......'................................................*......'.................................. + // eor v0.16b, v0.16b, v28.16b // ...................................................~....'..................................................*....'.................................. + // eor v0.16b, v0.16b, v29.16b // .....................................................~..'....................................................*..'.................................. + // str q0, [x2, #(3*16)] // ...............~........................................'..............~........................................'..............l................... + // rev64 v29.16b, v29.16b // .....~..................................................'....*..................................................'....~............................. + // pmull v8.1q, v29.1d, v12.1d // ......................~.................................'.....................*.................................'.....................~............ 
+ // pmull2 v9.1q, v29.2d, v12.2d // .......~................................................'......*................................................'......~........................... + // mov d11, v29.d[1] // .....................................~..................'....................................*..................'.................................. + // eor v11.8b, v11.8b, v29.8b // ...........................................~............'..........................................*............'.................................. + // pmull v10.1q, v11.1d, v14.1d // .....................................................~..'....................................................*..'.................................. + // ldr q29, [x0, #(2*16)] // ................e.......................................'...............~.......................................'...............~.................. + // add w14, w13, #2 // e.......................................................~.......................................................~.................................. + // rev w14, w14 // .e......................................................'~......................................................'~................................. + // orr x14, x12, x14, lsl #32 // ..e.....................................................'.~.....................................................'.~................................ + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 2*16)] // ...e....................................................'..~....................................................'..~............................... + // ldr q0, [sp, #(STACK_BASE_AES_ST + 2*16)] // .....e..................................................'....~..................................................'....~............................. 
+ // aesr v0.16b, v18.16b // .........e..............................................'........~..............................................'........~......................... + // aesr v0.16b, v19.16b // ...........e............................................'..........~............................................'..........~....................... + // aesr v0.16b, v20.16b // .............e..........................................'............~..........................................'............~..................... + // aesr v0.16b, v21.16b // ...............e........................................'..............~........................................'..............~................... + // aesr v0.16b, v22.16b // .................e......................................'................~......................................'................~................. + // aesr v0.16b, v23.16b // ...................e....................................'..................~....................................'..................~............... + // aesr v0.16b, v24.16b // .....................e..................................'....................~..................................'....................~............. + // aesr v0.16b, v25.16b // .......................e................................'......................~................................'......................~........... + // aesr v0.16b, v26.16b // .........................e..............................'........................~..............................'........................~......... + // aese v0.16b, v27.16b // ...........................e............................'..........................~............................'..........................~....... + // eor v0.16b, v0.16b, v28.16b // .............................e..........................'............................~..........................'............................~..... 
+ // eor v0.16b, v0.16b, v29.16b // .................................e......................'................................~......................'................................~. + // str q0, [x2, #(2*16)] // .........................................e..............'........................................~..............'.................................. + // rev64 v29.16b, v29.16b // .......................................e................'......................................~................'.................................. + // pmull v11.1q, v29.1d, v13.1d // ...~....................................................'..*....................................................'..~............................... + // eor v8.16b, v8.16b, v11.16b // ...........................~............................'..........................*............................'..........................~....... + // pmull2 v11.1q, v29.2d, v13.2d // .~......................................................'*......................................................'~................................. + // eor v9.16b, v9.16b, v11.16b // .........~..............................................'........*..............................................'........~......................... + // ext v11.16b, v29.16b, v29.16b, #8 // .......................~................................'......................*................................'......................~........... + // eor v11.16b, v11.16b, v29.16b // ..............................~.........................'.............................*.........................'.............................~.... + // pmull2 v11.1q, v11.2d, v14.2d // .................................~......................'................................*......................'................................~. 
+ // eor v10.16b, v10.16b, v11.16b // .......................................................~'......................................................*'.................................. + // ldr q12, [x6, #48] // ...................................~....................'..................................*....................'.................................. + // ldr q13, [x6, #80] // ....................~...................................'...................*...................................'...................~.............. + // ldr q14, [x6, #64] // ..........................................~.............'.........................................*.............'.................................. + // ldr q29, [x0, #(1*16)] // ................................~.......................'...............................*.......................'...............................~.. + // add w14, w13, #1 // ......e.................................................'.....~.................................................'.....~............................ + // rev w14, w14 // ...~....................................................'..*....................................................'..~............................... + // orr x14, x12, x14, lsl #32 // ....~...................................................'...*...................................................'...~.............................. + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 1*16)] // ......~.................................................'.....*.................................................'.....~............................ + // ldr q0, [sp, #(STACK_BASE_AES_ST + 1*16)] // ....................................................~...'...................................................*...'.................................. + // aesr v0.16b, v18.16b // ~.......................................................~.......................................................l.................................. 
+ // aesr v0.16b, v19.16b // ..~.....................................................'.~.....................................................'.l................................ + // aesr v0.16b, v20.16b // ....~...................................................'...~...................................................'...l.............................. + // aesr v0.16b, v21.16b // ......~.................................................'.....~.................................................'.....l............................ + // aesr v0.16b, v22.16b // ........~...............................................'.......~...............................................'.......l.......................... + // aesr v0.16b, v23.16b // ..........~.............................................'.........~.............................................'.........l........................ + // aesr v0.16b, v24.16b // ............~...........................................'...........~...........................................'...........l...................... + // aesr v0.16b, v25.16b // ................~.......................................'...............~.......................................'...............l.................. + // aesr v0.16b, v26.16b // ..................~.....................................'.................~.....................................'.................l................ + // aese v0.16b, v27.16b // ....................~...................................'...................~...................................'...................l.............. + // eor v0.16b, v0.16b, v28.16b // ......................~.................................'.....................~.................................'.....................l............ + // eor v0.16b, v0.16b, v29.16b // ........................~...............................'.......................~...............................'.......................l.......... 
+ // str q0, [x2, #(1*16)] // ...............................~........................'..............................~........................'..............................l... + // rev64 v29.16b, v29.16b // ......................................~.................'.....................................*.................'.................................. + // pmull v11.1q, v29.1d, v12.1d // ..........................................~.............'.........................................*.............'.................................. + // eor v8.16b, v8.16b, v11.16b // .............................................~..........'............................................*..........'.................................. + // pmull2 v11.1q, v29.2d, v12.2d // ................................................~.......'...............................................*.......'.................................. + // eor v9.16b, v9.16b, v11.16b // ~.......................................................~.......................................................l.................................. + // mov d11, v29.d[1] // ................................................~.......'...............................................*.......'.................................. + // eor v11.8b, v11.8b, v29.8b // ..................................................~.....'.................................................*.....'.................................. + // pmull v11.1q, v11.1d, v14.1d // ......................................................~.'.....................................................*.'.................................. + // eor v10.16b, v10.16b, v11.16b // ....~...................................................'...~...................................................'...l.............................. 
+ // ldr q29, [x0], #(4*16) // .......................................................e'......................................................~'.................................. + // add w14, w13, #0 // .e......................................................'~......................................................'~................................. + // rev w14, w14 // ..e.....................................................'.~.....................................................'.~................................ + // orr x14, x12, x14, lsl #32 // ....e...................................................'...~...................................................'...~.............................. + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 0*16)] // .......e................................................'......~................................................'......~........................... + // ldr q0, [sp, #(STACK_BASE_AES_ST + 0*16)] // .............e..........................................'............~..........................................'............~..................... + // aesr v0.16b, v18.16b // ..........................e.............................'.........................~.............................'.........................~........ + // aesr v0.16b, v19.16b // ............................e...........................'...........................~...........................'...........................~...... + // aesr v0.16b, v20.16b // ..............................e.........................'.............................~.........................'.............................~.... + // aesr v0.16b, v21.16b // ................................e.......................'...............................~.......................'...............................~.. + // aesr v0.16b, v22.16b // ..................................e.....................'.................................~.....................'.................................. 
+ // aesr v0.16b, v23.16b // ....................................e...................'...................................~...................'.................................. + // aesr v0.16b, v24.16b // ............................................e...........'...........................................~...........'.................................. + // aesr v0.16b, v25.16b // ..............................................e.........'.............................................~.........'.................................. + // aesr v0.16b, v26.16b // ..................................................e.....'.................................................~.....'.................................. + // aese v0.16b, v27.16b // ....................................................e...'...................................................~...'.................................. + // eor v0.16b, v0.16b, v28.16b // ......................................................e.'.....................................................~.'.................................. + // eor v0.16b, v0.16b, v29.16b // .......~................................................'......*................................................'......~........................... + // str q0, [x2], #(4*16) // ...........~............................................'..........*............................................'..........~....................... + // rev64 v29.16b, v29.16b // ..............~.........................................'.............*.........................................'.............~.................... + // eor v29.16b, v29.16b, v30.16b // ....................................~...................'...................................*...................'.................................. + // pmull v11.1q, v29.1d, v13.1d // ......................................~.................'.....................................*.................'.................................. 
+ // eor v8.16b, v8.16b, v11.16b // ...............................................~........'..............................................*........'.................................. + // pmull2 v11.1q, v29.2d, v13.2d // ........................................~...............'.......................................*...............'.................................. + // eor v9.16b, v9.16b, v11.16b // ..~.....................................................'.~.....................................................'.l................................ + // ext v11.16b, v29.16b, v29.16b, #8 // ..............................................~.........'.............................................*.........'.................................. + // eor v11.16b, v11.16b, v29.16b // .................................................~......'................................................*......'.................................. + // pmull2 v11.1q, v11.2d, v14.2d // ...................................................~....'..................................................*....'.................................. + // eor v10.16b, v10.16b, v11.16b // ......~.................................................'.....~.................................................'.....l............................ + // eor v0.16b, v8.16b, v9.16b // ........~...............................................'.......~...............................................'.......l.......................... + // pmull v1.1q, v9.1d, v7.1d // ..............~.........................................'.............~.........................................'.............l.................... + // ext v9.16b, v9.16b, v9.16b, #8 // ............~...........................................'...........~...........................................'...........l...................... 
+ // eor v10.16b, v10.16b, v0.16b // ..........~.............................................'.........~.............................................'.........l........................ + // eor v1.16b, v9.16b, v1.16b // .................~......................................'................~......................................'................l................. + // eor v10.16b, v10.16b, v1.16b // ...................~....................................'..................~....................................'..................l............... + // pmull v9.1q, v10.1d, v7.1d // ........................~...............................'.......................~...............................'.......................l.......... + // eor v8.16b, v8.16b, v9.16b // ..........................~.............................'.........................~.............................'.........................l........ + // ext v10.16b, v10.16b, v10.16b, #8 // .....................~..................................'....................~..................................'....................l............. + // eor v30.16b, v8.16b, v10.16b // ............................~...........................'...........................~...........................'...........................l...... + // ext v30.16b, v30.16b, v30.16b, #8 // ..................................~.....................'.................................~.....................'.................................l + // add w13, w13, #UNROLL // .......e................................................'......~................................................'......~........................... 
+ + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_start_postamble:// end of loop kernel + // Instructions: 113 + // Expected cycles: 55 + // Expected IPC: 2.05 + // + // Cycle bound: 55.0 + // IPC bound: 2.05 + // + // Wall time: 261.71s + // User time: 261.71s + // + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + eor v14.16B, v10.16B, v29.16B // *...................................................... + ldr q3, [x0, #-16] // *...................................................... + rev w28, w20 // *...................................................... + aesr v4.16b, v18.16b // .*..................................................... + eor v29.16B, v16.16B, v30.16B // .*..................................................... + orr x10, x12, x28, lsl #32 // .*..................................................... + ldr q30, [x6] // ..*.................................................... + pmull v6.1q, v8.1d, v17.1d // ..*.................................................... + stp x11, x10, [sp, #STACK_BASE_AES_ST + 16] // ..*.................................................... // @slothy:writes=stack_1 + aesr v4.16b, v19.16b // ...*................................................... + eor v29.16B, v29.16B, v1.16B // ...*................................................... + rev64 v15.16B, v3.16B // ....*.................................................. + ldr q1, [sp, #STACK_BASE_AES_ST + 16] // ....*.................................................. // @slothy:reads=stack_1 + aesr v4.16b, v20.16b // .....*................................................. + eor v10.16B, v9.16B, v29.16B // .....*................................................. + pmull v5.1q, v15.1d, v30.1d // ......*................................................ + eor v16.16B, v14.16B, v0.16B // ......*................................................ 
+ ext v14.16B, v29.16B, v29.16B, #8 // .......*............................................... + pmull v13.1q, v29.1d, v7.1d // .......*............................................... + ldr q0, [x6, #80] // ........*.............................................. + aesr v1.16b, v18.16b // ........*.............................................. + eor v29.16B, v14.16B, v13.16B // .........*............................................. + eor v13.16B, v16.16B, v10.16B // .........*............................................. + ext v10.16B, v8.16B, v8.16B, #8 // ..........*............................................ + aesr v1.16b, v19.16b // ..........*............................................ + pmull2 v16.1q, v8.2d, v17.2d // ...........*........................................... + eor v14.16B, v13.16B, v29.16B // ...........*........................................... + rev64 v13.16B, v12.16B // ............*.......................................... + aesr v1.16b, v20.16b // ............*.......................................... + ldr q17, [x6, #16] // .............*......................................... + pmull v29.1q, v14.1d, v7.1d // .............*......................................... + aesr v1.16b, v21.16b // ..............*........................................ + ext v14.16B, v14.16B, v14.16B, #8 // ..............*........................................ + pmull2 v30.1q, v15.2d, v30.2d // ...............*....................................... + eor v29.16B, v9.16B, v29.16B // ...............*....................................... + mov d9, v15.d[1] // ................*...................................... + aesr v1.16b, v22.16b // ................*...................................... + eor v29.16B, v29.16B, v14.16B // .................*..................................... + aesr v4.16b, v21.16b // .................*..................................... + aesr v1.16b, v23.16b // ..................*.................................... 
+ eor v9.8B, v9.8B, v15.8B // ..................*.................................... + aesr v4.16b, v22.16b // ...................*................................... + ext v14.16B, v29.16B, v29.16B, #8 // ...................*................................... + eor v10.16B, v10.16B, v8.16B // ....................*.................................. + aesr v1.16b, v24.16b // ....................*.................................. + pmull v29.1q, v9.1d, v17.1d // .....................*................................. + eor v13.16B, v13.16B, v14.16B // .....................*................................. + str q2, [x2, #-16] // ......................*................................ + aesr v1.16b, v25.16b // ......................*................................ + aesr v4.16b, v23.16b // .......................*............................... + ldr q14, [sp, #STACK_BASE_AES_ST + 48] // .......................*............................... // @slothy:reads=stack_3 + ldr q15, [x0, #-48] // ........................*.............................. + aesr v1.16b, v26.16b // ........................*.............................. + pmull v9.1q, v13.1d, v0.1d // .........................*............................. + eor v2.16B, v30.16B, v16.16B // .........................*............................. + pmull2 v17.1q, v10.2d, v17.2d // ..........................*............................ + ldr q10, [x6, #48] // ..........................*............................ + aesr v14.16b, v18.16b // ...........................*........................... + eor v6.16B, v5.16B, v6.16B // ...........................*........................... + aesr v4.16b, v24.16b // ............................*.......................... + rev64 v30.16B, v15.16B // ............................*.......................... + eor v29.16B, v29.16B, v17.16B // .............................*......................... 
+ aesr v14.16b, v19.16b // .............................*......................... + pmull2 v17.1q, v30.2d, v10.2d // ..............................*........................ + mov d16, v30.d[1] // ..............................*........................ + aesr v14.16b, v20.16b // ...............................*....................... + ldr q8, [x6, #64] // ...............................*....................... + pmull v10.1q, v30.1d, v10.1d // ................................*...................... + eor v16.8B, v16.8B, v30.8B // ................................*...................... + aesr v14.16b, v21.16b // .................................*..................... + ext v5.16B, v13.16B, v13.16B, #8 // .................................*..................... + eor v10.16B, v6.16B, v10.16B // ..................................*.................... + pmull2 v30.1q, v13.2d, v0.2d // ..................................*.................... + eor v2.16B, v2.16B, v17.16B // ...................................*................... + aesr v14.16b, v22.16b // ...................................*................... + eor v10.16B, v10.16B, v9.16B // ....................................*.................. + pmull v9.1q, v16.1d, v8.1d // ....................................*.................. + aese v1.16b, v27.16b // .....................................*................. + eor v17.16B, v5.16B, v13.16B // .....................................*................. + eor v13.16B, v2.16B, v30.16B // ......................................*................ + aesr v14.16b, v23.16b // ......................................*................ + pmull2 v2.1q, v17.2d, v8.2d // .......................................*............... + eor v29.16B, v29.16B, v9.16B // .......................................*............... + eor v9.16B, v10.16B, v13.16B // ........................................*.............. 
+ pmull v5.1q, v13.1d, v7.1d // ........................................*.............. + aesr v14.16b, v24.16b // .........................................*............. + eor v29.16B, v29.16B, v2.16B // .........................................*............. + aesr v4.16b, v25.16b // ..........................................*............ + ext v17.16B, v13.16B, v13.16B, #8 // ..........................................*............ + aesr v14.16b, v25.16b // ...........................................*........... + eor v16.16B, v29.16B, v9.16B // ...........................................*........... + eor v30.16B, v17.16B, v5.16B // ............................................*.......... + aesr v4.16b, v26.16b // ............................................*.......... + aesr v14.16b, v26.16b // .............................................*......... + eor v9.16B, v1.16B, v28.16B // .............................................*......... + aese v4.16b, v27.16b // ..............................................*........ + eor v6.16B, v16.16B, v30.16B // ..............................................*........ + aese v14.16b, v27.16b // ...............................................*....... + eor v29.16B, v9.16B, v15.16B // ...............................................*....... + ext v9.16B, v6.16B, v6.16B, #8 // ................................................*...... + pmull v16.1q, v6.1d, v7.1d // ................................................*...... + eor v14.16B, v14.16B, v28.16B // .................................................*..... + eor v17.16B, v31.16B, v12.16B // .................................................*..... + eor v4.16B, v4.16B, v28.16B // ..................................................*.... + eor v10.16B, v10.16B, v16.16B // ..................................................*.... + eor v30.16B, v14.16B, v3.16B // ...................................................*... 
+ str q17, [x2], #(4*16) // ...................................................*... + eor v31.16B, v10.16B, v9.16B // ....................................................*.. + eor v1.16B, v4.16B, v11.16B // ....................................................*.. + str q30, [x2, #-16] // .....................................................*. + str q29, [x2, #-48] // .....................................................*. + ext v30.16B, v31.16B, v31.16B, #8 // ......................................................* + str q1, [x2, #-112] // ......................................................* + + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // aesr v4.16b, v18.16b // .*..................................................... + // eor V.16B, v16.16B, v30.16B // .*..................................................... + // pmull2 V.1q, v8.2d, v17.2d // ...........*........................................... + // ldr Q, [x0, #-16] // *...................................................... + // eor V.16B, V.16B, v1.16B // ...*................................................... + // aesr v4.16b, v19.16b // ...*................................................... + // ldr Q, [x6] // ..*.................................................... + // pmull V.1q, v8.1d, v17.1d // ..*.................................................... + // rev W, w20 // *...................................................... + // aesr v4.16b, v20.16b // .....*................................................. + // eor V.16B, v10.16B, v29.16B // *...................................................... + // orr X, x12, X, lsl #32 // .*..................................................... + // rev64 V.16B, V.16B // ....*.................................................. + // eor V.16B, V.16B, v0.16B // ......*................................................ 
+ // aesr v4.16b, v21.16b // .................*..................................... + // stp x11, X, [sp, #STACK_BASE_AES_ST + 16] // ..*.................................................... + // eor V.16B, v31.16B, v12.16B // .................................................*..... + // pmull2 V.1q, V.2d, V.2d // ...............*....................................... + // eor V.16B, v9.16B, V.16B // .....*................................................. + // aesr v4.16b, v22.16b // ...................*................................... + // eor V.16B, V.16B, V.16B // .........................*............................. + // eor V.16B, V.16B, V.16B // .........*............................................. + // aesr v4.16b, v23.16b // .......................*............................... + // str Q, [x2], #(4*16) // ...................................................*... + // ext V.16B, V.16B, V.16B, #8 // .......*............................................... + // aesr v4.16b, v24.16b // ............................*.......................... + // rev64 V.16B, v12.16B // ............*.......................................... + // pmull V.1q, V.1d, v7.1d // .......*............................................... + // str q2, [x2, #-80] // ......................*................................ + // aesr v4.16b, v25.16b // ..........................................*............ + // eor V.16B, V.16B, V.16B // .........*............................................. + // aesr v4.16b, v26.16b // ............................................*.......... + // ldr Q, [x6, #16] // .............*......................................... + // eor V.16B, V.16B, V.16B // ...........*........................................... + // ldr Q, [x6, #80] // ........*.............................................. + // aese v4.16b, v27.16b // ..............................................*........ 
+ // ext V.16B, V.16B, V.16B, #8 // ..............*........................................ + // pmull V.1q, V.1d, V.1d // ......*................................................ + // eor V.16B, v4.16B, v28.16B // ..................................................*.... + // ext V.16B, v8.16B, v8.16B, #8 // ..........*............................................ + // eor V.16B, V.16B, v11.16B // ....................................................*.. + // pmull V.1q, V.1d, v7.1d // .............*......................................... + // ldr Q, [sp, #STACK_BASE_AES_ST + 48] // .......................*............................... + // eor V.16B, v9.16B, V.16B // ...............*....................................... + // eor V.16B, V.16B, V.16B // ...........................*........................... + // eor V.16B, V.16B, V.16B // .................*..................................... + // aesr V.16b, v18.16b // ...........................*........................... + // eor V.16B, V.16B, v8.16B // ....................*.................................. + // str Q, [x2, #-112] // ......................................................* + // aesr V.16b, v19.16b // .............................*......................... + // ldr Q, [x0, #-48] // ........................*.............................. + // pmull2 V.1q, V.2d, V.2d // ..........................*............................ + // ext V.16B, V.16B, V.16B, #8 // ...................*................................... + // aesr V.16b, v20.16b // ...............................*....................... + // ldr Q, [x6, #48] // ..........................*............................ + // eor V.16B, V.16B, V.16B // .....................*................................. + // mov D, V.d[1] // ................*...................................... + // aesr V.16b, v21.16b // .................................*..................... 
+ // pmull V.1q, V.1d, V.1d // .........................*............................. + // rev64 V.16B, V.16B // ............................*.......................... + // aesr V.16b, v22.16b // ...................................*................... + // pmull2 V.1q, V.2d, V.2d // ..................................*.................... + // aesr V.16b, v23.16b // ......................................*................ + // pmull V.1q, V.1d, V.1d // ................................*...................... + // ldr Q, [x6, #64] // ...............................*....................... + // eor V.8B, V.8B, V.8B // ..................*.................................... + // aesr V.16b, v24.16b // .........................................*............. + // aesr V.16b, v25.16b // ...........................................*........... + // eor V.16B, V.16B, V.16B // ..................................*.................... + // ext V.16B, V.16B, V.16B, #8 // .................................*..................... + // eor V.16B, V.16B, V.16B // ....................................*.................. + // aesr V.16b, v26.16b // .............................................*......... + // pmull2 V.1q, V.2d, V.2d // ..............................*........................ + // mov D, V.d[1] // ..............................*........................ + // eor V.16B, V.16B, V.16B // .....................................*................. + // aese V.16b, v27.16b // ...............................................*....... + // eor V.8B, V.8B, V.8B // ................................*...................... + // pmull2 V.1q, V.2d, V.2d // .......................................*............... + // eor V.16B, V.16B, v28.16B // .................................................*..... + // ldr Q, [sp, #STACK_BASE_AES_ST + 16] // ....*.................................................. + // eor V.16B, V.16B, V.16B // ...................................................*... 
+ // pmull V.1q, V.1d, V.1d // .....................*................................. + // pmull V.1q, V.1d, V.1d // ....................................*.................. + // eor V.16B, V.16B, V.16B // .............................*......................... + // aesr V.16b, v18.16b // ........*.............................................. + // eor V.16B, V.16B, V.16B // ...................................*................... + // eor V.16B, V.16B, V.16B // ......................................*................ + // aesr V.16b, v19.16b // ..........*............................................ + // aesr V.16b, v20.16b // ............*.......................................... + // eor V.16B, V.16B, V.16B // .......................................*............... + // eor V.16B, V.16B, V.16B // .........................................*............. + // aesr V.16b, v21.16b // ..............*........................................ + // eor V.16B, V.16B, V.16B // ........................................*.............. + // aesr V.16b, v22.16b // ................*...................................... + // eor V.16B, V.16B, V.16B // ...........................................*........... + // aesr V.16b, v23.16b // ..................*.................................... + // ext V.16B, V.16B, V.16B, #8 // ..........................................*............ + // aesr V.16b, v24.16b // ....................*.................................. + // pmull V.1q, V.1d, v7.1d // ........................................*.............. + // str Q, [x2, #-16] // .....................................................*. + // aesr V.16b, v25.16b // ......................*................................ + // eor V.16B, V.16B, V.16B // ............................................*.......... + // aesr V.16b, v26.16b // ........................*.............................. + // eor V.16B, V.16B, V.16B // ..............................................*........ 
+ // aese V.16b, v27.16b // .....................................*................. + // ext V.16B, V.16B, V.16B, #8 // ................................................*...... + // eor V.16B, V.16B, v28.16B // .............................................*......... + // eor V.16B, V.16B, V.16B // ...............................................*....... + // pmull V.1q, V.1d, v7.1d // ................................................*...... + // eor V.16B, V.16B, V.16B // ..................................................*.... + // eor V.16B, V.16B, V.16B // ....................................................*.. + // str Q, [x2, #-48] // .....................................................*. + // ext v30.16B, V.16B, V.16B, #8 // ......................................................* + + b Lloop_unrolled_start_end +Lloop_unrolled_start_iter_1: + + + ldr q12, [x6] + ldr q13, [x6, #32] + ldr q14, [x6, #16] + + ldr q29, [x0, #(3*16)] + add w14, w13, #3 + rev w14, w14 + orr x14, x12, x14, lsl #32 + stp x11, x14, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:writes=stack_3 + ldr q0, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:reads=stack_3 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2, #(3*16)] + + rev64 v29.16b, v29.16b + // Low product + pmull v8.1q, v29.1d, v12.1d + // High product + pmull2 v9.1q, v29.2d, v12.2d + // Middle product + mov d11, v29.d[1] + eor v11.8b, v11.8b, v29.8b + pmull v10.1q, v11.1d, v14.1d + + ldr q29, [x0, #(2*16)] + add w14, w13, #2 + rev w14, w14 + orr x14, x12, x14, lsl #32 + stp x11, x14, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:writes=stack_2 + ldr q0, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:reads=stack_2 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr 
v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2, #(2*16)] + + rev64 v29.16b, v29.16b + // Low product + pmull v11.1q, v29.1d, v13.1d + eor v8.16b, v8.16b, v11.16b + // High product + pmull2 v11.1q, v29.2d, v13.2d + eor v9.16b, v9.16b, v11.16b + // Middle product + ext v11.16b, v29.16b, v29.16b, #8 + eor v11.16b, v11.16b, v29.16b + pmull2 v11.1q, v11.2d, v14.2d + eor v10.16b, v10.16b, v11.16b + + ldr q12, [x6, #48] + ldr q13, [x6, #80] + ldr q14, [x6, #64] + + ldr q29, [x0, #(1*16)] + add w14, w13, #1 + rev w14, w14 + orr x14, x12, x14, lsl #32 + stp x11, x14, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:writes=stack_1 + ldr q0, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:reads=stack_1 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2, #(1*16)] + + rev64 v29.16b, v29.16b + // Low product + pmull v11.1q, v29.1d, v12.1d + eor v8.16b, v8.16b, v11.16b + // High product + pmull2 v11.1q, v29.2d, v12.2d + eor v9.16b, v9.16b, v11.16b + // Middle product + mov d11, v29.d[1] + eor v11.8b, v11.8b, v29.8b + pmull v11.1q, v11.1d, v14.1d + eor v10.16b, v10.16b, v11.16b + + ldr q29, [x0], #(4*16) + add w14, w13, #0 + rev w14, w14 + orr x14, x12, x14, lsl #32 + stp x11, x14, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:writes=stack_0 + ldr q0, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:reads=stack_0 + aesr v0.16b, v18.16b + aesr v0.16b, v19.16b + aesr v0.16b, v20.16b + aesr v0.16b, v21.16b + aesr v0.16b, v22.16b + aesr v0.16b, v23.16b + aesr v0.16b, v24.16b + aesr v0.16b, v25.16b + aesr v0.16b, v26.16b + aese v0.16b, v27.16b + eor v0.16b, v0.16b, 
v28.16b + eor v0.16b, v0.16b, v29.16b + str q0, [x2], #(4*16) + + rev64 v29.16b, v29.16b + eor v29.16b, v29.16b, v30.16b + // Low product + pmull v11.1q, v29.1d, v13.1d + eor v8.16b, v8.16b, v11.16b + // High product + pmull2 v11.1q, v29.2d, v13.2d + eor v9.16b, v9.16b, v11.16b + // Middle product + ext v11.16b, v29.16b, v29.16b, #8 + eor v11.16b, v11.16b, v29.16b + pmull2 v11.1q, v11.2d, v14.2d + eor v10.16b, v10.16b, v11.16b + eor v0.16b, v8.16b, v9.16b + pmull v1.1q, v9.1d, v7.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v0.16b + eor v1.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v1.16b + pmull v9.1q, v10.1d, v7.1d + eor v8.16b, v8.16b, v9.16b + ext v10.16b, v10.16b, v10.16b, #8 + eor v30.16b, v8.16b, v10.16b + ext v30.16b, v30.16b, v30.16b, #8 + + add w13, w13, #UNROLL +Lloop_unrolled_start_iter_1_end: +Lloop_unrolled_start_end: +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_with_tag_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + add ctr, ctr, #1 + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https: // www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif \ No newline at end of file diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S new file mode 100644 index 00000000000..7063bb56847 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/opt/dec/aesv8-gcm-armv8-dec-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S @@ -0,0 +1,1545 @@ +// Copyright (c) 2022, ARM Inc. + +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. + +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker + +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https: // github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . + +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https: // github.com/slothy-optimizer/slothy). 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments (register aliases for x0..x6) +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 /* same register as len_bits */ +full_blocks .req x7 +remainder .req x16 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 /* 32-bit view of ctr_tmp */ + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 /* 32-bit view of ivec_64_96 */ + +ctr .req w13 +ctr_x .req x13 /* 64-bit view of ctr */ + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 /* same register as aes_st */ +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 /* same register as aes_st and res */ +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 4 /* divisor used by prepare_loop_counts to split blocks into unrolled iterations and a tail */ + +#define
STACK_SIZE_GPRS (6*16) /* six 16-byte slots, one per register pair x19..x30 */ +#define STACK_SIZE_VREGS (4*16) /* four 16-byte slots, one per register pair d8..d15 */ +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) /* both save areas plus UNROLL 16-byte AES counter-block slots */ + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) /* base offset of the AES counter-block slots */ + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs /* store d8..d15 at sp + STACK_BASE_VREGS */ + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs /* reload d8..d15 from the slots written by save_vregs */ + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs /* store x19..x30 at sp + STACK_BASE_GPRS */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs /* reload x19..x30 from the slots written by save_gprs */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop. Reads byte_len; writes unroll, full_blocks, count and remainder (count is the same register as len_bits, so len_bits is overwritten). +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of full AES blocks (16 bytes each); the shift discards any trailing partial block + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each: full_blocks - count*unroll + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/*
AES related macros */ +/********************************************************************/ + +.macro load_iv /* split the 16 bytes at [ivec]: ivec_0_63 = first doubleword, ivec_64_96 = low 32 bits of the second doubleword, ctr = its high 32 bits byte-reversed */ + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit: a W-register write zeroes bits 63:32 of the X register +.endm + +.macro next_ctr_init_aes aes_st, loc /* build the counter block for ctr + loc in stack slot loc and load it into the q view of aes_st; clobbers ctr_tmp; ctr itself is not modified */ + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key /* nine aesr rounds using the registers named key0 .. key8 (key is a name prefix, e.g. rk) */ + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key /* two more aesr rounds with key9 and key10; aes_full_block does not expand this, it uses aesr_final instead */ + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key /* NOTE(review): no rk11/rk12 alias is visible in this file - confirm before expanding with key=rk */ + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// out = inA ^ inB ^ inC. Destructs inA (it is left holding inA ^ inB) +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out /* last round: AESE with rk9 (no AESMC), then out = state ^ rk10 ^ plain */ + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc /* counter block for slot loc, aesr rounds rk0..rk8, then aesr_final XORs the keystream with input into output */ + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i /* load the q register aliased rk<i>q from key + 16*i */ + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys /* load the eleven round keys rk0..rk10 */ + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8
+ load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// The byte offsets below have to be kept in sync with the H-table generation. As used here the table is read in 16-byte steps in the order h1, h12, h2, h3, h34, h4, h5, h56, h6, h7, h78, h8 (offsets 0 to 176). Every loader goes through dst_q only; the dst argument is accepted but never referenced. + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] // NOTE(review): presumably the combined constant passed as Hk_mid for h1 and h2 - confirm against the table generator +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q // NOTE(review): despite the name this macro loads only h1..h6 and h12, h34, h56; h7, h8 and h78 are left to load_htable_78 + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH update */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + // Low product: lower 64-bit halves of input and Hk, written to ghash_lo (starts a new accumulation) + pmull ghash_lo.1q, \input\().1d,
\Hk\().1d + // High product: upper 64-bit halves of input and Hk (PMULL2), written to ghash_hi + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product: (upper half ^ lower half) of input times the lower half of Hk_mid, written to ghash_mid + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + // Low product: lower 64-bit halves, written to ghash_lo + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product: upper 64-bit halves, written to ghash_hi + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product: same as ghash_init_0 but multiplies by the upper half of Hk_mid (EXT swaps the halves, PMULL2 reads the upper lanes) + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + // Low product, XORed into the running ghash_lo (ghash_tmp is scratch throughout) + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product, XORed into the running ghash_hi + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product using the lower half of Hk_mid, XORed into the running ghash_mid + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + // Low product, XORed into the running ghash_lo (ghash_tmp is scratch throughout) + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product, XORed into the running ghash_hi + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product using the upper half of Hk_mid, XORed into the running ghash_mid + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + eor \input\().16b, \input\().16b, \tag\().16b // XOR the tag into the block before any product is formed + // Low product: lower 64-bit halves, written to ghash_lo + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product: upper 64-bit halves, written to ghash_hi + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product: (upper half ^ lower half) of input times the lower half of Hk_mid, written to ghash_mid + mov ghash_tmp_d, \input\().d[1] + eor
ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + eor \input\().16b, \input\().16b, \tag\().16b // XOR the tag into the block before any product is formed + // Low product: lower 64-bit halves, written to ghash_lo + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product: upper 64-bit halves, written to ghash_hi + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product: (upper half ^ lower half) of input times the upper half of Hk_mid, written to ghash_mid + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + eor \input\().16b, \input\().16b, \tag\().16b // XOR the tag into the block before any product is formed + // Low product, XORed into the running ghash_lo (ghash_tmp is scratch throughout) + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product, XORed into the running ghash_hi + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product using the lower half of Hk_mid, XORed into the running ghash_mid + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b // byte-reverse each 64-bit half; input is modified in place + eor \input\().16b, \input\().16b, \tag\().16b // XOR the tag into the block before any product is formed + // Low product, XORed into the running ghash_lo (ghash_tmp is scratch throughout) + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product, XORed into the running ghash_hi + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product using the upper half of Hk_mid, XORed into the running ghash_mid + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b // lo ^ hi, folded into ghash_mid three instructions below + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d // lower half of ghash_hi times the constant held in ghash_mod + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 // swap the two 64-bit halves of ghash_hi + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b // mid ^= lo ^ hi + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor
ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + + load_htable_12 + load_htable_34 + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end + cmp count, #1 + b.eq Lloop_unrolled_start_iter_1 + // Instructions: 141 + // Expected cycles: 66 + // Expected IPC: 2.14 + // + // Cycle bound: 66.0 + // IPC bound: 2.14 + // + // Wall time: 110.52s + // User time: 110.52s + // + // ----------------------- cycle (expected) ------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + ldr q11, [x0, #16] // *................................................................. + ldr q10, [x0, #32] // *................................................................. + add w24, w13, #3 // *................................................................. 
+ add w29, w13, #2 // *................................................................. + ldr q5, [x0, #48] // .*................................................................ + rev w10, w29 // .*................................................................ + add w8, w13, #1 // .*................................................................ + add w28, w13, #0 // .*................................................................ + rev w30, w28 // ..*............................................................... + orr x29, x12, x10, lsl #32 // ..*............................................................... + rev w19, w8 // ..*............................................................... + orr x19, x12, x19, lsl #32 // ...*.............................................................. + stp x11, x29, [sp, #STACK_BASE_AES_ST + 32] // ...*.............................................................. // @slothy:writes=stack_2 + add w25, w13, #UNROLL // ...*.............................................................. + rev64 v31.16B, v10.16B // ....*............................................................. + rev64 v4.16B, v11.16B // ....*............................................................. + add w22, w25, #0 // ....*............................................................. + add w29, w25, #2 // ....*............................................................. + ldr q1, [sp, #STACK_BASE_AES_ST + 32] // .....*............................................................ // @slothy:reads=stack_2 + rev64 v6.16B, v5.16B // .....*............................................................ + stp x11, x19, [sp, #STACK_BASE_AES_ST + 16] // .....*............................................................ // @slothy:writes=stack_1 + rev w28, w29 // .....*............................................................ + pmull2 v3.1q, v4.2d, v15.2d // ......*........................................................... 
+ ext v29.16B, v31.16B, v31.16B, #8 // ......*........................................................... + orr x19, x12, x28, lsl #32 // ......*........................................................... + add w17, w25, #1 // ......*........................................................... + ldr q0, [sp, #STACK_BASE_AES_ST + 16] // .......*.......................................................... // @slothy:reads=stack_1 + pmull2 v2.1q, v31.2d, v13.2d // .......*.......................................................... + stp x11, x19, [sp, #STACK_BASE_AES_ST + 32] // .......*.......................................................... // @slothy:writes=stack_2 + rev w17, w17 // .......*.......................................................... + eor v29.16B, v29.16B, v31.16B // ........*......................................................... + pmull v31.1q, v31.1d, v13.1d // ........*......................................................... + rev w26, w24 // ........*......................................................... + orr x14, x12, x17, lsl #32 // ........*......................................................... + aesr v1.16b, v18.16b // .........*........................................................ + mov d8, v6.d[1] // .........*........................................................ + stp x11, x14, [sp, #STACK_BASE_AES_ST + 16] // .........*........................................................ // @slothy:writes=stack_1 + orr x17, x12, x26, lsl #32 // .........*........................................................ + pmull2 v9.1q, v29.2d, v14.2d // ..........*....................................................... + mov d29, v4.d[1] // ..........*....................................................... + stp x11, x17, [sp, #STACK_BASE_AES_ST + 48] // ..........*....................................................... // @slothy:writes=stack_3 + rev w27, w22 // ..........*....................................................... 
+ aesr v0.16b, v18.16b // ...........*...................................................... + eor v8.8B, v8.8B, v6.8B // ...........*...................................................... + orr x27, x12, x27, lsl #32 // ...........*...................................................... + orr x10, x12, x30, lsl #32 // ...........*...................................................... + eor v29.8B, v29.8B, v4.8B // ............*..................................................... + aesr v1.16b, v19.16b // ............*..................................................... + stp x11, x10, [sp, #STACK_BASE_AES_ST] // ............*..................................................... // @slothy:writes=stack_0 + aesr v0.16b, v19.16b // .............*.................................................... + pmull v8.1q, v8.1d, v14.1d // ..............*................................................... + aesr v0.16b, v20.16b // ...............*.................................................. + eor v8.16B, v8.16B, v9.16B // ................*................................................. + pmull v9.1q, v4.1d, v15.1d // ................*................................................. + aesr v0.16b, v21.16b // .................*................................................ + aesr v1.16b, v20.16b // ..................*............................................... + aesr v0.16b, v22.16b // ...................*.............................................. + aesr v1.16b, v21.16b // ....................*............................................. + aesr v0.16b, v23.16b // .....................*............................................ + pmull v29.1q, v29.1d, v17.1d // ......................*........................................... + aesr v0.16b, v24.16b // .......................*.......................................... + eor v4.16B, v8.16B, v29.16B // ........................*......................................... 
+ pmull2 v29.1q, v6.2d, v12.2d // ........................*......................................... + ldr q8, [sp, #STACK_BASE_AES_ST + 48] // .........................*........................................ // @slothy:reads=stack_3 + aesr v0.16b, v25.16b // .........................*........................................ + eor v29.16B, v29.16B, v2.16B // ..........................*....................................... + aesr v1.16b, v22.16b // ..........................*....................................... + ldr q2, [sp, #STACK_BASE_AES_ST + 32] // ...........................*...................................... // @slothy:reads=stack_2 + aesr v0.16b, v26.16b // ...........................*...................................... + eor v3.16B, v29.16B, v3.16B // ............................*..................................... + aesr v1.16b, v23.16b // ............................*..................................... + aese v0.16b, v27.16b // .............................*.................................... + aesr v1.16b, v24.16b // ..............................*................................... + aesr v2.16b, v18.16b // ...............................*.................................. + eor v0.16B, v0.16B, v28.16B // ...............................*.................................. + aesr v1.16b, v25.16b // ................................*................................. + aesr v2.16b, v19.16b // .................................*................................ + eor v0.16B, v0.16B, v11.16B // .................................*................................ + pmull v29.1q, v6.1d, v12.1d // ..................................*............................... + ldr q11, [sp, #STACK_BASE_AES_ST] // ..................................*............................... // @slothy:reads=stack_0 + stp x11, x27, [sp, #STACK_BASE_AES_ST] // ..................................*............................... 
// @slothy:writes=stack_0 + aesr v2.16b, v20.16b // ...................................*.............................. + str q0, [x2, #16] // ...................................*.............................. + ldr q0, [x0], #(4*16) // ....................................*............................. + aesr v1.16b, v26.16b // ....................................*............................. + ldr q6, [sp, #STACK_BASE_AES_ST] // .....................................*............................ // @slothy:reads=stack_0 + aesr v2.16b, v21.16b // .....................................*............................ + eor v29.16B, v29.16B, v31.16B // ......................................*........................... + aese v1.16b, v27.16b // ......................................*........................... + aesr v11.16b, v18.16b // .......................................*.......................... + eor v1.16B, v1.16B, v28.16B // ........................................*......................... + aesr v2.16b, v22.16b // ........................................*......................... + aesr v11.16b, v19.16b // .........................................*........................ + rev64 v31.16B, v0.16B // .........................................*........................ + eor v1.16B, v1.16B, v10.16B // ..........................................*....................... + aesr v2.16b, v23.16b // ..........................................*....................... + aesr v11.16b, v20.16b // ...........................................*...................... + eor v10.16B, v31.16B, v30.16B // ...........................................*...................... + eor v30.16B, v29.16B, v9.16B // ............................................*..................... + aesr v2.16b, v24.16b // ............................................*..................... + str q1, [x2, #32] // .............................................*.................... 
+ pmull2 v1.1q, v10.2d, v16.2d // .............................................*.................... + ldr q29, [sp, #STACK_BASE_AES_ST + 16] // ..............................................*................... // @slothy:reads=stack_1 + aesr v2.16b, v25.16b // ..............................................*................... + eor v1.16B, v3.16B, v1.16B // ...............................................*.................. + aesr v11.16b, v21.16b // ...............................................*.................. + ext v3.16B, v10.16B, v10.16B, #8 // ................................................*................. + aesr v2.16b, v26.16b // ................................................*................. + aesr v11.16b, v22.16b // .................................................*................ + eor v31.16B, v3.16B, v10.16B // ..................................................*............... + aese v2.16b, v27.16b // ..................................................*............... + aesr v11.16b, v23.16b // ...................................................*.............. + eor v9.16B, v2.16B, v28.16B // ....................................................*............. + pmull2 v2.1q, v31.2d, v17.2d // ....................................................*............. + aesr v11.16b, v24.16b // .....................................................*............ + eor v4.16B, v4.16B, v2.16B // ......................................................*........... + pmull v10.1q, v10.1d, v16.1d // ......................................................*........... + ext v2.16B, v1.16B, v1.16B, #8 // .......................................................*.......... + aesr v11.16b, v25.16b // .......................................................*.......... + pmull v31.1q, v1.1d, v7.1d // ........................................................*......... + eor v3.16B, v30.16B, v10.16B // ........................................................*......... 
+ ldr q10, [x0], #(4*16) // .........................................................*........ + aesr v11.16b, v26.16b // .........................................................*........ + aesr v8.16b, v18.16b // ..........................................................*....... + eor v1.16B, v3.16B, v1.16B // ..........................................................*....... + eor v30.16B, v2.16B, v31.16B // ...........................................................*...... + aese v11.16b, v27.16b // ...........................................................*...... + eor v1.16B, v4.16B, v1.16B // ............................................................*..... + aesr v29.16b, v18.16b // ............................................................*..... + eor v31.16B, v11.16B, v28.16B // .............................................................*.... + aesr v8.16b, v19.16b // .............................................................*.... + aesr v6.16b, v18.16b // ..............................................................*... + eor v2.16B, v1.16B, v30.16B // ..............................................................*... + eor v31.16B, v31.16B, v0.16B // ...............................................................*.. + aesr v8.16b, v20.16b // ...............................................................*.. + ldr q0, [x0, #-48] // ................................................................*. + pmull v4.1q, v2.1d, v7.1d // ................................................................*. 
+ str q31, [x2], #(4*16) // .................................................................* + aesr v8.16b, v21.16b // .................................................................* + add w7, w25, #3 // .................................................................* + add w13, w25, #UNROLL // .................................................................* + + // ----------------------- cycle (expected) ------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // add w25, w13, #1 // .*................................................................ + // add w23, w13, #0 // .*................................................................ + // rev w20, w25 // ..*............................................................... + // orr x10, x12, x20, lsl #32 // ...*.............................................................. + // rev w29, w23 // ..*............................................................... + // orr x20, x12, x29, lsl #32 // ...........*...................................................... + // add w7, w13, #3 // *................................................................. + // stp x11, x20, [sp, #STACK_BASE_AES_ST] // ............*..................................................... + // add w8, w13, #2 // *................................................................. + // add w13, w13, #UNROLL // ...*.............................................................. + // stp x11, x10, [sp, #STACK_BASE_AES_ST + 16] // .....*............................................................ + // rev w21, w8 // .*................................................................ + // orr x29, x12, x21, lsl #32 // ..*............................................................... + // stp x11, x29, [sp, #STACK_BASE_AES_ST + 32] // ...*.............................................................. 
+ // ldr q31, [sp, #STACK_BASE_AES_ST + 32] // .....*............................................................ + // aesr v31.16b, v18.16b // .........*........................................................ + // aesr v31.16b, v19.16b // ............*..................................................... + // aesr v31.16b, v20.16b // ..................*............................................... + // aesr v31.16b, v21.16b // ....................*............................................. + // aesr v31.16b, v22.16b // ..........................*....................................... + // aesr v31.16b, v23.16b // ............................*..................................... + // aesr v31.16b, v24.16b // ..............................*................................... + // aesr v31.16b, v25.16b // ................................*................................. + // aesr v31.16b, v26.16b // ....................................*............................. + // aese v31.16b, v27.16b // ......................................*........................... + // eor v9.16B, v31.16B, v28.16B // ........................................*......................... + // ldr q29, [sp, #STACK_BASE_AES_ST + 16] // .......*.......................................................... + // ldr q6, [sp, #STACK_BASE_AES_ST] // ..................................*............................... + // aesr v29.16b, v18.16b // ...........*...................................................... + // ldr q10, [x0], #(4*16) // ....................................*............................. + // aesr v6.16b, v18.16b // .......................................*.......................... + // ldr q0, [x0, #-48] // *................................................................. + // aesr v6.16b, v19.16b // .........................................*........................ + // add w25, w13, #1 // ......*........................................................... 
+ // rev w27, w7 // ........*......................................................... + // aesr v29.16b, v19.16b // .............*.................................................... + // add w23, w13, #0 // ....*............................................................. + // rev w20, w25 // .......*.......................................................... + // orr x17, x12, x27, lsl #32 // .........*........................................................ + // rev64 v2.16B, v0.16B // ....*............................................................. + // orr x10, x12, x20, lsl #32 // ........*......................................................... + // rev w29, w23 // ..........*....................................................... + // aesr v6.16b, v20.16b // ...........................................*...................... + // orr x20, x12, x29, lsl #32 // ...........*...................................................... + // stp x11, x17, [sp, #STACK_BASE_AES_ST + 48] // ..........*....................................................... + // add w7, w13, #3 // .................................................................* + // stp x11, x20, [sp, #STACK_BASE_AES_ST] // ..................................*............................... + // add w8, w13, #2 // ....*............................................................. + // add w13, w13, #UNROLL // .................................................................* + // aesr v29.16b, v20.16b // ...............*.................................................. + // stp x11, x10, [sp, #STACK_BASE_AES_ST + 16] // .........*........................................................ + // rev w21, w8 // .....*............................................................ + // orr x29, x12, x21, lsl #32 // ......*........................................................... + // aesr v29.16b, v21.16b // .................*................................................ 
+ // stp x11, x29, [sp, #STACK_BASE_AES_ST + 32] // .......*.......................................................... + // ldr q1, [x0, #-32] // *................................................................. + // aesr v29.16b, v22.16b // ...................*.............................................. + // ldr q31, [sp, #STACK_BASE_AES_ST + 32] // ...........................*...................................... + // aesr v29.16b, v23.16b // .....................*............................................ + // mov d11, v2.d[1] // ..........*....................................................... + // aesr v29.16b, v24.16b // .......................*.......................................... + // aesr v31.16b, v18.16b // ...............................*.................................. + // aesr v29.16b, v25.16b // .........................*........................................ + // aesr v31.16b, v19.16b // .................................*................................ + // aesr v29.16b, v26.16b // ...........................*...................................... + // eor v4.8B, v11.8B, v2.8B // ............*..................................................... + // aesr v31.16b, v20.16b // ...................................*.............................. + // eor v5.16B, v9.16B, v1.16B // ..........................................*....................... + // aesr v6.16b, v21.16b // ...............................................*.................. + // str q5, [x2, #32] // .............................................*.................... + // aesr v31.16b, v21.16b // .....................................*............................ + // aese v29.16b, v27.16b // .............................*.................................... + // ldr q5, [x0, #-16] // .*................................................................ + // aesr v31.16b, v22.16b // ........................................*......................... 
+ // rev64 v11.16B, v10.16B // .........................................*........................ + // eor v29.16B, v29.16B, v28.16B // ...............................*.................................. + // aesr v6.16b, v22.16b // .................................................*................ + // eor v30.16B, v11.16B, v30.16B // ...........................................*...................... + // aesr v31.16b, v23.16b // ..........................................*....................... + // eor v29.16B, v29.16B, v0.16B // .................................*................................ + // aesr v6.16b, v23.16b // ...................................................*.............. + // rev64 v8.16B, v5.16B // .....*............................................................ + // aesr v31.16b, v24.16b // ............................................*..................... + // aesr v6.16b, v24.16b // .....................................................*............ + // str q29, [x2, #16] // ...................................*.............................. + // mov d9, v8.d[1] // .........*........................................................ + // aesr v31.16b, v25.16b // ..............................................*................... + // aesr v6.16b, v25.16b // .......................................................*.......... + // eor v3.8B, v9.8B, v8.8B // ...........*...................................................... + // aesr v31.16b, v26.16b // ................................................*................. + // aesr v6.16b, v26.16b // .........................................................*........ + // aese v31.16b, v27.16b // ..................................................*............... + // rev64 v29.16B, v1.16B // ....*............................................................. + // aese v6.16b, v27.16b // ...........................................................*...... 
+ // pmull v1.1q, v2.1d, v15.1d // ................*................................................. + // ext v0.16B, v29.16B, v29.16B, #8 // ......*........................................................... + // eor v11.16B, v6.16B, v28.16B // .............................................................*.... + // pmull v6.1q, v8.1d, v12.1d // ..................................*............................... + // eor v9.16B, v31.16B, v28.16B // ....................................................*............. + // pmull2 v2.1q, v2.2d, v15.2d // ......*........................................................... + // eor v31.16B, v11.16B, v10.16B // ...............................................................*.. + // pmull2 v11.1q, v8.2d, v12.2d // ........................*......................................... + // eor v10.16B, v0.16B, v29.16B // ........*......................................................... + // pmull v0.1q, v29.1d, v13.1d // ........*......................................................... + // str q31, [x2], #(4*16) // .................................................................* + // pmull2 v8.1q, v29.2d, v13.2d // .......*.......................................................... + // pmull v31.1q, v4.1d, v17.1d // ......................*........................................... + // eor v29.16B, v6.16B, v0.16B // ......................................*........................... + // pmull v0.1q, v3.1d, v14.1d // ..............*................................................... + // eor v4.16B, v11.16B, v8.16B // ..........................*....................................... + // eor v8.16B, v29.16B, v1.16B // ............................................*..................... + // pmull2 v3.1q, v30.2d, v16.2d // .............................................*.................... + // eor v2.16B, v4.16B, v2.16B // ............................*..................................... 
+ // pmull2 v6.1q, v10.2d, v14.2d // ..........*....................................................... + // pmull v29.1q, v30.1d, v16.1d // ......................................................*........... + // ext v11.16B, v30.16B, v30.16B, #8 // ................................................*................. + // eor v10.16B, v2.16B, v3.16B // ...............................................*.................. + // eor v1.16B, v0.16B, v6.16B // ................*................................................. + // eor v3.16B, v8.16B, v29.16B // ........................................................*......... + // ldr q8, [sp, #STACK_BASE_AES_ST + 48] // .........................*........................................ + // ldr q29, [sp, #STACK_BASE_AES_ST + 16] // ..............................................*................... + // eor v11.16B, v11.16B, v30.16B // ..................................................*............... + // pmull v2.1q, v10.1d, v7.1d // ........................................................*......... + // eor v6.16B, v1.16B, v31.16B // ........................*......................................... + // ext v0.16B, v10.16B, v10.16B, #8 // .......................................................*.......... + // pmull2 v30.1q, v11.2d, v17.2d // ....................................................*............. + // eor v31.16B, v3.16B, v10.16B // ..........................................................*....... + // aesr v8.16b, v18.16b // ..........................................................*....... + // eor v1.16B, v6.16B, v30.16B // ......................................................*........... + // ldr q6, [sp, #STACK_BASE_AES_ST] // .....................................*............................ + // eor v11.16B, v0.16B, v2.16B // ...........................................................*...... + // aesr v8.16b, v19.16b // .............................................................*.... 
+ // aesr v29.16b, v18.16b // ............................................................*..... + // eor v30.16B, v1.16B, v31.16B // ............................................................*..... + // ldr q10, [x0], #(4*16) // .........................................................*........ + // aesr v8.16b, v20.16b // ...............................................................*.. + // aesr v6.16b, v18.16b // ..............................................................*... + // eor v2.16B, v30.16B, v11.16B // ..............................................................*... + // aesr v8.16b, v21.16b // .................................................................* + // ldr q0, [x0, #-48] // ................................................................*. + // pmull v4.1q, v2.1d, v7.1d // ................................................................*. + + sub count, count, #2 +cbz count, Lloop_unrolled_start_postamble +Lloop_unrolled_start: + // Instructions: 122 + // Expected cycles: 58 + // Expected IPC: 2.10 + // + // Cycle bound: 54.0 + // IPC bound: 2.26 + // + // Wall time: 3601.90s + // User time: 3601.90s + // + // ------------------- cycle (expected) --------------------> + // 0 25 50 + // |------------------------|------------------------|------- + ext v11.16B, v2.16B, v2.16B, #8 // l......................................................... + aesr v6.16b, v19.16b // *......................................................... + add w25, w13, #1 // e......................................................... + rev w27, w7 // *......................................................... + aesr v29.16b, v19.16b // .*........................................................ + add w23, w13, #0 // .e........................................................ + rev w20, w25 // .e........................................................ + orr x17, x12, x27, lsl #32 // .*........................................................ 
+ aesr v8.16b, v22.16b // ..l....................................................... + rev64 v2.16B, v0.16B // ..*....................................................... + orr x10, x12, x20, lsl #32 // ..e....................................................... + rev w29, w23 // ..e....................................................... + aesr v6.16b, v20.16b // ...*...................................................... + orr x20, x12, x29, lsl #32 // ...e...................................................... + stp x11, x17, [sp, #STACK_BASE_AES_ST + 48] // ...*...................................................... // @slothy:writes=stack_3 + add w7, w13, #3 // ...e...................................................... + aesr v8.16b, v23.16b // ....l..................................................... + stp x11, x20, [sp, #STACK_BASE_AES_ST] // ....e..................................................... // @slothy:writes=stack_0 + add w8, w13, #2 // ....e..................................................... + add w13, w13, #UNROLL // ....e..................................................... + aesr v29.16b, v20.16b // .....*.................................................... + eor v1.16B, v3.16B, v4.16B // .....l.................................................... + stp x11, x10, [sp, #STACK_BASE_AES_ST + 16] // .....e.................................................... // @slothy:writes=stack_1 + rev w21, w8 // .....e.................................................... + aesr v8.16b, v24.16b // ......l................................................... + orr x29, x12, x21, lsl #32 // ......e................................................... + eor v11.16B, v1.16B, v11.16B // .......l.................................................. + aesr v29.16b, v21.16b // .......*.................................................. + stp x11, x29, [sp, #STACK_BASE_AES_ST + 32] // .......e.................................................. 
// @slothy:writes=stack_2 + ldr q1, [x0, #-32] // ........*................................................. + aesr v8.16b, v25.16b // ........l................................................. + aesr v29.16b, v22.16b // .........*................................................ + ldr q31, [sp, #STACK_BASE_AES_ST + 32] // .........e................................................ // @slothy:reads=stack_2 + aesr v8.16b, v26.16b // ..........l............................................... + ext v30.16B, v11.16B, v11.16B, #8 // ..........l............................................... + aesr v29.16b, v23.16b // ...........*.............................................. + mov d11, v2.d[1] // ...........*.............................................. + aese v8.16b, v27.16b // ............l............................................. + aesr v29.16b, v24.16b // .............*............................................ + aesr v31.16b, v18.16b // ..............e........................................... + eor v4.16B, v8.16B, v28.16B // ..............l........................................... + aesr v29.16b, v25.16b // ...............*.......................................... + eor v8.16B, v4.16B, v5.16B // ................l......................................... + aesr v31.16b, v19.16b // ................e......................................... + aesr v29.16b, v26.16b // .................*........................................ + eor v4.8B, v11.8B, v2.8B // .................*........................................ + aesr v31.16b, v20.16b // ..................e....................................... + eor v5.16B, v9.16B, v1.16B // ..................*....................................... + aesr v6.16b, v21.16b // ...................*...................................... + str q8, [x2, #-16] // ...................l...................................... + str q5, [x2, #32] // ....................*..................................... 
+ aesr v31.16b, v21.16b // ....................e..................................... + aese v29.16b, v27.16b // .....................*.................................... + ldr q5, [x0, #-16] // .....................*.................................... + aesr v31.16b, v22.16b // ......................e................................... + rev64 v11.16B, v10.16B // ......................*................................... + eor v29.16B, v29.16B, v28.16B // .......................*.................................. + aesr v6.16b, v22.16b // .......................*.................................. + eor v30.16B, v11.16B, v30.16B // ........................*................................. + aesr v31.16b, v23.16b // ........................e................................. + eor v29.16B, v29.16B, v0.16B // .........................*................................ + aesr v6.16b, v23.16b // .........................*................................ + rev64 v8.16B, v5.16B // ..........................*............................... + aesr v31.16b, v24.16b // ..........................e............................... + aesr v6.16b, v24.16b // ...........................*.............................. + str q29, [x2, #16] // ...........................*.............................. + mov d9, v8.d[1] // ............................*............................. + aesr v31.16b, v25.16b // ............................e............................. + aesr v6.16b, v25.16b // .............................*............................ + eor v3.8B, v9.8B, v8.8B // ..............................*........................... + aesr v31.16b, v26.16b // ..............................e........................... + aesr v6.16b, v26.16b // ...............................*.......................... + aese v31.16b, v27.16b // ................................e......................... + rev64 v29.16B, v1.16B // ................................*......................... 
+ aese v6.16b, v27.16b // .................................*........................ + pmull v1.1q, v2.1d, v15.1d // ..................................*....................... + ext v0.16B, v29.16B, v29.16B, #8 // ..................................*....................... + eor v11.16B, v6.16B, v28.16B // ...................................*...................... + pmull v6.1q, v8.1d, v12.1d // ...................................*...................... + eor v9.16B, v31.16B, v28.16B // ....................................e..................... + pmull2 v2.1q, v2.2d, v15.2d // ....................................*..................... + eor v31.16B, v11.16B, v10.16B // .....................................*.................... + pmull2 v11.1q, v8.2d, v12.2d // .....................................*.................... + eor v10.16B, v0.16B, v29.16B // ......................................*................... + pmull v0.1q, v29.1d, v13.1d // ......................................*................... + str q31, [x2], #(4*16) // .......................................*.................. + pmull2 v8.1q, v29.2d, v13.2d // .......................................*.................. + pmull v31.1q, v4.1d, v17.1d // ........................................*................. + eor v29.16B, v6.16B, v0.16B // ........................................*................. + pmull v0.1q, v3.1d, v14.1d // .........................................*................ + eor v4.16B, v11.16B, v8.16B // .........................................*................ + eor v8.16B, v29.16B, v1.16B // ..........................................*............... + pmull2 v3.1q, v30.2d, v16.2d // ..........................................*............... + eor v2.16B, v4.16B, v2.16B // ...........................................*.............. + pmull2 v6.1q, v10.2d, v14.2d // ...........................................*.............. 
+ pmull v29.1q, v30.1d, v16.1d // ............................................*............. + ext v11.16B, v30.16B, v30.16B, #8 // ............................................*............. + eor v10.16B, v2.16B, v3.16B // .............................................*............ + eor v1.16B, v0.16B, v6.16B // .............................................*............ + eor v3.16B, v8.16B, v29.16B // ..............................................*........... + ldr q8, [sp, #STACK_BASE_AES_ST + 48] // ..............................................*........... // @slothy:reads=stack_3 + ldr q29, [sp, #STACK_BASE_AES_ST + 16] // ...............................................e.......... // @slothy:reads=stack_1 + eor v11.16B, v11.16B, v30.16B // ...............................................*.......... + pmull v2.1q, v10.1d, v7.1d // ................................................*......... + eor v6.16B, v1.16B, v31.16B // ................................................*......... + ext v0.16B, v10.16B, v10.16B, #8 // .................................................*........ + pmull2 v30.1q, v11.2d, v17.2d // .................................................*........ + eor v31.16B, v3.16B, v10.16B // ..................................................*....... + aesr v8.16b, v18.16b // ..................................................*....... + eor v1.16B, v6.16B, v30.16B // ...................................................*...... + ldr q6, [sp, #STACK_BASE_AES_ST] // ...................................................e...... // @slothy:reads=stack_0 + eor v11.16B, v0.16B, v2.16B // ....................................................*..... + aesr v8.16b, v19.16b // ....................................................*..... + aesr v29.16b, v18.16b // .....................................................e.... + eor v30.16B, v1.16B, v31.16B // .....................................................*.... 
+ ldr q10, [x0], #(4*16) // ......................................................e... + aesr v8.16b, v20.16b // ......................................................*... + aesr v6.16b, v18.16b // .......................................................e.. + eor v2.16B, v30.16B, v11.16B // .......................................................*.. + aesr v8.16b, v21.16b // ........................................................*. + ldr q0, [x0, #-48] // ........................................................e. + pmull v4.1q, v2.1d, v7.1d // .........................................................* + + // ---------------------------------------------------------- cycle (expected) -----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------- + // ldr q29, [x0, #(3*16)] // .....................~....................................'....................*....................................'................... + // add w14, w13, #3 // ...e......................................................'..~......................................................'..~................ + // rev w14, w14 // ~.........................................................*.........................................................~................... + // orr x14, x12, x14, lsl #32 // .~........................................................'*........................................................'~.................. + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 3*16)] // ...~......................................................'..*......................................................'..~................ + // ldr q0, [sp, #(STACK_BASE_AES_ST + 3*16)] // ..............................................~...........'.............................................*...........'................... 
+ // aesr v0.16b, v18.16b // ..................................................~.......'.................................................*.......'................... + // aesr v0.16b, v19.16b // ....................................................~.....'...................................................*.....'................... + // aesr v0.16b, v20.16b // ......................................................~...'.....................................................*...'................... + // aesr v0.16b, v21.16b // ........................................................~.'.......................................................*.'................... + // aesr v0.16b, v22.16b // ..~.......................................................'.~.......................................................'.l................. + // aesr v0.16b, v23.16b // ....~.....................................................'...~.....................................................'...l............... + // aesr v0.16b, v24.16b // ......~...................................................'.....~...................................................'.....l............. + // aesr v0.16b, v25.16b // ........~.................................................'.......~.................................................'.......l........... + // aesr v0.16b, v26.16b // ..........~...............................................'.........~...............................................'.........l......... + // aese v0.16b, v27.16b // ............~.............................................'...........~.............................................'...........l....... + // eor v0.16b, v0.16b, v28.16b // ..............~...........................................'.............~...........................................'.............l..... 
+ // eor v0.16b, v0.16b, v29.16b // ................~.........................................'...............~.........................................'...............l... + // str q0, [x2, #(3*16)] // ...................~......................................'..................~......................................'..................l + // rev64 v29.16b, v29.16b // ..........................~...............................'.........................*...............................'................... + // pmull v8.1q, v29.1d, v12.1d // ...................................~......................'..................................*......................'................... + // pmull2 v9.1q, v29.2d, v12.2d // .....................................~....................'....................................*....................'................... + // mov d11, v29.d[1] // ............................~.............................'...........................*.............................'................... + // eor v11.8b, v11.8b, v29.8b // ..............................~...........................'.............................*...........................'................... + // pmull v10.1q, v11.1d, v14.1d // .........................................~................'........................................*................'................... + // ldr q29, [x0, #(2*16)] // ........~.................................................'.......*.................................................'.......~........... + // add w14, w13, #2 // ....e.....................................................'...~.....................................................'...~............... + // rev w14, w14 // .....e....................................................'....~....................................................'....~.............. 
+ // orr x14, x12, x14, lsl #32 // ......e...................................................'.....~...................................................'.....~............. + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 2*16)] // .......e..................................................'......~..................................................'......~............ + // ldr q0, [sp, #(STACK_BASE_AES_ST + 2*16)] // .........e................................................'........~................................................'........~.......... + // aesr v0.16b, v18.16b // ..............e...........................................'.............~...........................................'.............~..... + // aesr v0.16b, v19.16b // ................e.........................................'...............~.........................................'...............~... + // aesr v0.16b, v20.16b // ..................e.......................................'.................~.......................................'.................~. + // aesr v0.16b, v21.16b // ....................e.....................................'...................~.....................................'................... + // aesr v0.16b, v22.16b // ......................e...................................'.....................~...................................'................... + // aesr v0.16b, v23.16b // ........................e.................................'.......................~.................................'................... + // aesr v0.16b, v24.16b // ..........................e...............................'.........................~...............................'................... + // aesr v0.16b, v25.16b // ............................e.............................'...........................~.............................'................... 
+ // aesr v0.16b, v26.16b // ..............................e...........................'.............................~...........................'................... + // aese v0.16b, v27.16b // ................................e.........................'...............................~.........................'................... + // eor v0.16b, v0.16b, v28.16b // ....................................e.....................'...................................~.....................'................... + // eor v0.16b, v0.16b, v29.16b // ..................~.......................................'.................*.......................................'.................~. + // str q0, [x2, #(2*16)] // ....................~.....................................'...................*.....................................'................... + // rev64 v29.16b, v29.16b // ................................~.........................'...............................*.........................'................... + // pmull v11.1q, v29.1d, v13.1d // ......................................~...................'.....................................*...................'................... + // eor v8.16b, v8.16b, v11.16b // ........................................~.................'.......................................*.................'................... + // pmull2 v11.1q, v29.2d, v13.2d // .......................................~..................'......................................*..................'................... + // eor v9.16b, v9.16b, v11.16b // .........................................~................'........................................*................'................... + // ext v11.16b, v29.16b, v29.16b, #8 // ..................................~.......................'.................................*.......................'................... 
+ // eor v11.16b, v11.16b, v29.16b // ......................................~...................'.....................................*...................'................... + // pmull2 v11.1q, v11.2d, v14.2d // ...........................................~..............'..........................................*..............'................... + // eor v10.16b, v10.16b, v11.16b // .............................................~............'............................................*............'................... + // ldr q29, [x0, #(1*16)] // ........................................................e.'.......................................................~.'................... + // add w14, w13, #1 // e.........................................................~.........................................................~................... + // rev w14, w14 // .e........................................................'~........................................................'~.................. + // orr x14, x12, x14, lsl #32 // ..e.......................................................'.~.......................................................'.~................. + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 1*16)] // .....e....................................................'....~....................................................'....~.............. + // ldr q0, [sp, #(STACK_BASE_AES_ST + 1*16)] // ...............................................e..........'..............................................~..........'................... + // aesr v0.16b, v18.16b // .....................................................e....'....................................................~....'................... + // aesr v0.16b, v19.16b // .~........................................................'*........................................................'~.................. 
+ // aesr v0.16b, v20.16b // .....~....................................................'....*....................................................'....~.............. + // aesr v0.16b, v21.16b // .......~..................................................'......*..................................................'......~............ + // aesr v0.16b, v22.16b // .........~................................................'........*................................................'........~.......... + // aesr v0.16b, v23.16b // ...........~..............................................'..........*..............................................'..........~........ + // aesr v0.16b, v24.16b // .............~............................................'............*............................................'............~...... + // aesr v0.16b, v25.16b // ...............~..........................................'..............*..........................................'..............~.... + // aesr v0.16b, v26.16b // .................~........................................'................*........................................'................~.. + // aese v0.16b, v27.16b // .....................~....................................'....................*....................................'................... + // eor v0.16b, v0.16b, v28.16b // .......................~..................................'......................*..................................'................... + // eor v0.16b, v0.16b, v29.16b // .........................~................................'........................*................................'................... + // str q0, [x2, #(1*16)] // ...........................~..............................'..........................*..............................'................... 
+ // rev64 v29.16b, v29.16b // ..~.......................................................'.*.......................................................'.~................. + // pmull v11.1q, v29.1d, v15.1d // ..................................~.......................'.................................*.......................'................... + // eor v8.16b, v8.16b, v11.16b // ..........................................~...............'.........................................*...............'................... + // pmull2 v11.1q, v29.2d, v15.2d // ....................................~.....................'...................................*.....................'................... + // eor v9.16b, v9.16b, v11.16b // ...........................................~..............'..........................................*..............'................... + // mov d11, v29.d[1] // ...........~..............................................'..........*..............................................'..........~........ + // eor v11.8b, v11.8b, v29.8b // .................~........................................'................*........................................'................~.. + // pmull v11.1q, v11.1d, v17.1d // ........................................~.................'.......................................*.................'................... + // eor v10.16b, v10.16b, v11.16b // ................................................~.........'...............................................*.........'................... + // ldr q29, [x0], #(4*16) // ......................................................e...'.....................................................~...'................... + // add w14, w13, #0 // .e........................................................'~........................................................'~.................. 
+ // rev w14, w14 // ..e.......................................................'.~.......................................................'.~................. + // orr x14, x12, x14, lsl #32 // ...e......................................................'..~......................................................'..~................ + // stp x11, x14, [sp, #(STACK_BASE_AES_ST + 0*16)] // ....e.....................................................'...~.....................................................'...~............... + // ldr q0, [sp, #(STACK_BASE_AES_ST + 0*16)] // ...................................................e......'..................................................~......'................... + // aesr v0.16b, v18.16b // .......................................................e..'......................................................~..'................... + // aesr v0.16b, v19.16b // ~.........................................................*.........................................................~................... + // aesr v0.16b, v20.16b // ...~......................................................'..*......................................................'..~................ + // aesr v0.16b, v21.16b // ...................~......................................'..................*......................................'................... + // aesr v0.16b, v22.16b // .......................~..................................'......................*..................................'................... + // aesr v0.16b, v23.16b // .........................~................................'........................*................................'................... + // aesr v0.16b, v24.16b // ...........................~..............................'..........................*..............................'................... 
+ // aesr v0.16b, v25.16b // .............................~............................'............................*............................'................... + // aesr v0.16b, v26.16b // ...............................~..........................'..............................*..........................'................... + // aese v0.16b, v27.16b // .................................~........................'................................*........................'................... + // eor v0.16b, v0.16b, v28.16b // ...................................~......................'..................................*......................'................... + // eor v0.16b, v0.16b, v29.16b // .....................................~....................'....................................*....................'................... + // str q0, [x2], #(4*16) // .......................................~..................'......................................*..................'................... + // rev64 v29.16b, v29.16b // ......................~...................................'.....................*...................................'................... + // eor v29.16b, v29.16b, v30.16b // ........................~.................................'.......................*.................................'................... + // pmull v11.1q, v29.1d, v16.1d // ............................................~.............'...........................................*.............'................... + // eor v8.16b, v8.16b, v11.16b // ..............................................~...........'.............................................*...........'................... + // pmull2 v11.1q, v29.2d, v16.2d // ..........................................~...............'.........................................*...............'................... 
+ // eor v9.16b, v9.16b, v11.16b // .............................................~............'............................................*............'................... + // ext v11.16b, v29.16b, v29.16b, #8 // ............................................~.............'...........................................*.............'................... + // eor v11.16b, v11.16b, v29.16b // ...............................................~..........'..............................................*..........'................... + // pmull2 v11.1q, v11.2d, v17.2d // .................................................~........'................................................*........'................... + // eor v10.16b, v10.16b, v11.16b // ...................................................~......'..................................................*......'................... + // eor v0.16b, v8.16b, v9.16b // ..................................................~.......'.................................................*.......'................... + // pmull v1.1q, v9.1d, v7.1d // ................................................~.........'...............................................*.........'................... + // ext v9.16b, v9.16b, v9.16b, #8 // .................................................~........'................................................*........'................... + // eor v10.16b, v10.16b, v0.16b // .....................................................~....'....................................................*....'................... + // eor v1.16b, v9.16b, v1.16b // ....................................................~.....'...................................................*.....'................... + // eor v10.16b, v10.16b, v1.16b // .......................................................~..'......................................................*..'................... 
+        // pmull v9.1q, v10.1d, v7.1d                   // .........................................................~'........................................................*'...................
+        // eor v8.16b, v8.16b, v9.16b                   // .....~....................................................'....~....................................................'....l..............
+        // ext v10.16b, v10.16b, v10.16b, #8            // ~.........................................................~.........................................................l...................
+        // eor v30.16b, v8.16b, v10.16b                 // .......~..................................................'......~..................................................'......l............
+        // ext v30.16b, v30.16b, v30.16b, #8            // ..........~...............................................'.........~...............................................'.........l.........
+        // add w13, w13, #UNROLL                        // ....e.....................................................'...~.....................................................'...~...............
+
+        sub     count, count, #1                        // one kernel iteration consumed
+        cbnz    count, Lloop_unrolled_start             // repeat while count != 0, otherwise fall through into the postamble
+Lloop_unrolled_start_postamble: // end of loop kernel; straight-line tail reached by fall-through once count hits 0
+        // Instructions:    103
+        // Expected cycles: 54
+        // Expected IPC:    1.91
+        // AES: v8 (entering at round key v22), v6 and v29 (entering at v19) are finished here; v6 is then reloaded from the stack slot written by the stp below and run through round keys v18..v27, with v28 XORed in last.
+        // Cycle bound:     54.0
+        // IPC bound:       1.91
+        // GHASH: pmull/pmull2 products against v12..v17 are XOR-combined, reduced with two pmulls by v7, and the result (halves swapped by the final ext) is left in v30.
+        // Wall time:     13.16s
+        // User time:     13.16s
+        // Output: one block is stored to [x2, #-16] before x2 advances by 4*16; the others go to the old x2 (post-increment store) and to #-48, #-32, #-16 relative to the new x2.
+                                                        // ----------------- cycle (expected) ------------------>
+                                                        // 0                        25                       50
+                                                        // |------------------------|------------------------|---
+        aesr    v6.16b, v19.16b                         // *.....................................................
+        rev     w29, w7                                 // *.....................................................
+        aesr    v8.16b, v22.16b                         // .*....................................................
+        ext     v1.16B, v2.16B, v2.16B, #8              // .*....................................................
+        orr     x29, x12, x29, lsl #32                  // .*....................................................
+        aesr    v6.16b, v20.16b                         // ..*...................................................
+        eor     v30.16B, v3.16B, v4.16B                 // ..*...................................................
+        stp     x11, x29, [sp, #STACK_BASE_AES_ST + 48] // ..*................................................... // @slothy:writes=stack_3
+        aesr    v8.16b, v23.16b                         // ...*..................................................
+        ldr     q31, [x0, #-16]                         // ...*..................................................
+        eor     v30.16B, v30.16B, v1.16B                // ....*.................................................
+        aesr    v6.16b, v21.16b                         // ....*.................................................
+        aesr    v8.16b, v24.16b                         // .....*................................................
+        rev64   v4.16B, v10.16B                         // .....*................................................
+        aesr    v6.16b, v22.16b                         // ......*...............................................
+        ext     v3.16B, v30.16B, v30.16B, #8            // ......*...............................................
+        aesr    v8.16b, v25.16b                         // .......*..............................................
+        rev64   v30.16B, v31.16B                        // .......*..............................................
+        aesr    v6.16b, v23.16b                         // ........*.............................................
+        mov     d2, v30.d[1]                            // .........*............................................
+        aesr    v8.16b, v26.16b                         // .........*............................................
+        aesr    v6.16b, v24.16b                         // ..........*...........................................
+        eor     v4.16B, v4.16B, v3.16B                  // ..........*...........................................
+        eor     v11.8B, v2.8B, v30.8B                   // ...........*..........................................
+        aese    v8.16b, v27.16b                         // ...........*..........................................
+        aesr    v6.16b, v25.16b                         // ............*.........................................
+        ext     v2.16B, v4.16B, v4.16B, #8              // ............*.........................................
+        pmull   v3.1q, v11.1d, v14.1d                   // .............*........................................
+        eor     v11.16B, v8.16B, v28.16B                // .............*........................................
+        aesr    v6.16b, v26.16b                         // ..............*.......................................
+        eor     v2.16B, v2.16B, v4.16B                  // ..............*.......................................
+        aesr    v29.16b, v19.16b                        // ...............*......................................
+        eor     v11.16B, v11.16B, v5.16B                // ...............*......................................
+        aese    v6.16b, v27.16b                         // ................*.....................................
+        ldr     q5, [x0, #-32]                          // ................*.....................................
+        aesr    v29.16b, v20.16b                        // .................*....................................
+        str     q11, [x2, #-16]                         // .................*....................................
+        pmull2  v1.1q, v2.2d, v17.2d                    // ..................*...................................
+        eor     v2.16B, v6.16B, v28.16B                 // ..................*...................................
+        ldr     q6, [sp, #STACK_BASE_AES_ST + 48]       // ...................*.................................. // @slothy:reads=stack_3
+        pmull   v11.1q, v30.1d, v12.1d                  // ...................*..................................
+        eor     v10.16B, v2.16B, v10.16B                // ....................*.................................
+        aesr    v29.16b, v21.16b                        // ....................*.................................
+        pmull2  v8.1q, v4.2d, v16.2d                    // .....................*................................
+        rev64   v2.16B, v5.16B                          // ......................*...............................
+        aesr    v29.16b, v22.16b                        // ......................*...............................
+        eor     v5.16B, v9.16B, v5.16B                  // .......................*..............................
+        aesr    v6.16b, v18.16b                         // .......................*..............................
+        str     q10, [x2], #(4*16)                      // ........................*.............................
+        aesr    v29.16b, v23.16b                        // ........................*.............................
+        str     q5, [x2, #-32]                          // .........................*............................
+        pmull   v10.1q, v2.1d, v13.1d                   // .........................*............................
+        pmull   v5.1q, v4.1d, v16.1d                    // ..........................*...........................
+        ext     v4.16B, v2.16B, v2.16B, #8              // ..........................*...........................
+        pmull2  v30.1q, v30.2d, v12.2d                  // ...........................*..........................
+        eor     v9.16B, v11.16B, v10.16B                // ...........................*..........................
+        rev64   v11.16B, v0.16B                         // ............................*.........................
+        aesr    v6.16b, v19.16b                         // ............................*.........................
+        eor     v10.16B, v4.16B, v2.16B                 // .............................*........................
+        pmull2  v4.1q, v2.2d, v13.2d                    // .............................*........................
+        aesr    v6.16b, v20.16b                         // ..............................*.......................
+        mov     d2, v11.d[1]                            // ..............................*.......................
+        eor     v30.16B, v30.16B, v4.16B                // ...............................*......................
+        pmull2  v4.1q, v11.2d, v15.2d                   // ...............................*......................
+        eor     v2.8B, v2.8B, v11.8B                    // ................................*.....................
+        pmull   v11.1q, v11.1d, v15.1d                  // ................................*.....................
+        pmull2  v10.1q, v10.2d, v14.2d                  // .................................*....................
+        eor     v4.16B, v30.16B, v4.16B                 // .................................*....................
+        eor     v30.16B, v9.16B, v11.16B                // ..................................*...................
+        aesr    v6.16b, v21.16b                         // ..................................*...................
+        eor     v10.16B, v3.16B, v10.16B                // ...................................*..................
+        pmull   v2.1q, v2.1d, v17.1d                    // ...................................*..................
+        eor     v5.16B, v30.16B, v5.16B                 // ....................................*.................
+        aesr    v6.16b, v22.16b                         // ....................................*.................
+        aesr    v29.16b, v24.16b                        // .....................................*................
+        eor     v9.16B, v10.16B, v2.16B                 // .....................................*................
+        eor     v10.16B, v4.16B, v8.16B                 // ......................................*...............
+        aesr    v6.16b, v23.16b                         // ......................................*...............
+        eor     v8.16B, v9.16B, v1.16B                  // .......................................*..............
+        aesr    v29.16b, v25.16b                        // .......................................*..............
+        aesr    v6.16b, v24.16b                         // ........................................*.............
+        eor     v11.16B, v5.16B, v10.16B                // ........................................*.............
+        ext     v9.16B, v10.16B, v10.16B, #8            // .........................................*............
+        pmull   v10.1q, v10.1d, v7.1d                   // .........................................*............
+        eor     v11.16B, v8.16B, v11.16B                // ..........................................*...........
+        aesr    v6.16b, v25.16b                         // ..........................................*...........
+        aesr    v29.16b, v26.16b                        // ...........................................*..........
+        eor     v10.16B, v9.16B, v10.16B                // ...........................................*..........
+        aesr    v6.16b, v26.16b                         // ............................................*.........
+        eor     v11.16B, v11.16B, v10.16B               // .............................................*........
+        aese    v29.16b, v27.16b                        // .............................................*........
+        aese    v6.16b, v27.16b                         // ..............................................*.......
+        ext     v4.16B, v11.16B, v11.16B, #8            // ...............................................*......
+        pmull   v11.1q, v11.1d, v7.1d                   // ...............................................*......
+        eor     v30.16B, v6.16B, v28.16B                // ................................................*.....
+        eor     v9.16B, v5.16B, v11.16B                 // .................................................*....
+        eor     v8.16B, v29.16B, v28.16B                // .................................................*....
+        eor     v30.16B, v30.16B, v31.16B               // ..................................................*...
+        eor     v5.16B, v9.16B, v4.16B                  // ...................................................*..
+        eor     v9.16B, v8.16B, v0.16B                  // ...................................................*..
+        str     q30, [x2, #-16]                         // ....................................................*.
+        ext     v30.16B, v5.16B, v5.16B, #8             // .....................................................*
+        str     q9, [x2, #-48]                          // .....................................................*
+        // Commented-out listing follows: the postamble instructions in a different order, each tagged with the cycle it occupies above. Register names differ from the emitted code (NOTE(review): presumably the pre-scheduling input order - confirm against SLOTHY's output conventions).
+                                                        // ----------------- cycle (expected) ------------------>
+                                                        // 0                        25                       50
+                                                        // |------------------------|------------------------|---
+        // ext v11.16B, v2.16B, v2.16B, #8              // .*....................................................
+        // aesr v6.16b, v19.16b                         // *.....................................................
+        // rev w27, w7                                  // *.....................................................
+        // aesr v29.16b, v19.16b                        // ...............*......................................
+        // orr x17, x12, x27, lsl #32                   // .*....................................................
+        // aesr v8.16b, v22.16b                         // .*....................................................
+        // rev64 v2.16B, v0.16B                         // ............................*.........................
+        // aesr v6.16b, v20.16b                         // ..*...................................................
+        // stp x11, x17, [sp, #STACK_BASE_AES_ST + 48]  // ..*...................................................
+        // aesr v8.16b, v23.16b                         // ...*..................................................
+        // aesr v29.16b, v20.16b                        // .................*....................................
+        // eor v1.16B, v3.16B, v4.16B                   // ..*...................................................
+        // aesr v8.16b, v24.16b                         // .....*................................................
+        // eor v11.16B, v1.16B, v11.16B                 // ....*.................................................
+        // aesr v29.16b, v21.16b                        // ....................*.................................
+        // ldr q1, [x0, #-32]                           // ................*.....................................
+ // aesr v8.16b, v25.16b // .......*.............................................. + // aesr v29.16b, v22.16b // ......................*............................... + // aesr v8.16b, v26.16b // .........*............................................ + // ext v30.16B, v11.16B, v11.16B, #8 // ......*............................................... + // aesr v29.16b, v23.16b // ........................*............................. + // mov d11, v2.d[1] // ..............................*....................... + // aese v8.16b, v27.16b // ...........*.......................................... + // aesr v29.16b, v24.16b // .....................................*................ + // eor v4.16B, v8.16B, v28.16B // .............*........................................ + // aesr v29.16b, v25.16b // .......................................*.............. + // eor v8.16B, v4.16B, v5.16B // ...............*...................................... + // aesr v29.16b, v26.16b // ...........................................*.......... + // eor v4.8B, v11.8B, v2.8B // ................................*..................... + // eor v5.16B, v9.16B, v1.16B // .......................*.............................. + // aesr v6.16b, v21.16b // ....*................................................. + // str q8, [x2, #-16] // .................*.................................... + // str q5, [x2, #32] // .........................*............................ + // aese v29.16b, v27.16b // .............................................*........ + // ldr q5, [x0, #-16] // ...*.................................................. + // rev64 v11.16B, v10.16B // .....*................................................ + // eor v29.16B, v29.16B, v28.16B // .................................................*.... + // aesr v6.16b, v22.16b // ......*............................................... + // eor v30.16B, v11.16B, v30.16B // ..........*........................................... 
+ // eor v29.16B, v29.16B, v0.16B // ...................................................*.. + // aesr v6.16b, v23.16b // ........*............................................. + // rev64 v8.16B, v5.16B // .......*.............................................. + // aesr v6.16b, v24.16b // ..........*........................................... + // str q29, [x2, #16] // .....................................................* + // mov d9, v8.d[1] // .........*............................................ + // aesr v6.16b, v25.16b // ............*......................................... + // eor v3.8B, v9.8B, v8.8B // ...........*.......................................... + // aesr v6.16b, v26.16b // ..............*....................................... + // rev64 v29.16B, v1.16B // ......................*............................... + // aese v6.16b, v27.16b // ................*..................................... + // pmull v1.1q, v2.1d, v15.1d // ................................*..................... + // ext v0.16B, v29.16B, v29.16B, #8 // ..........................*........................... + // eor v11.16B, v6.16B, v28.16B // ..................*................................... + // pmull v6.1q, v8.1d, v12.1d // ...................*.................................. + // pmull2 v2.1q, v2.2d, v15.2d // ...............................*...................... + // eor v31.16B, v11.16B, v10.16B // ....................*................................. + // pmull2 v11.1q, v8.2d, v12.2d // ...........................*.......................... + // eor v10.16B, v0.16B, v29.16B // .............................*........................ + // pmull v0.1q, v29.1d, v13.1d // .........................*............................ + // str q31, [x2], #(4*16) // ........................*............................. + // pmull2 v8.1q, v29.2d, v13.2d // .............................*........................ 
+ // pmull v31.1q, v4.1d, v17.1d // ...................................*.................. + // eor v29.16B, v6.16B, v0.16B // ...........................*.......................... + // pmull v0.1q, v3.1d, v14.1d // .............*........................................ + // eor v4.16B, v11.16B, v8.16B // ...............................*...................... + // eor v8.16B, v29.16B, v1.16B // ..................................*................... + // pmull2 v3.1q, v30.2d, v16.2d // .....................*................................ + // eor v2.16B, v4.16B, v2.16B // .................................*.................... + // pmull2 v6.1q, v10.2d, v14.2d // .................................*.................... + // pmull v29.1q, v30.1d, v16.1d // ..........................*........................... + // ext v11.16B, v30.16B, v30.16B, #8 // ............*......................................... + // eor v10.16B, v2.16B, v3.16B // ......................................*............... + // eor v1.16B, v0.16B, v6.16B // ...................................*.................. + // eor v3.16B, v8.16B, v29.16B // ....................................*................. + // ldr q8, [sp, #STACK_BASE_AES_ST + 48] // ...................*.................................. + // eor v11.16B, v11.16B, v30.16B // ..............*....................................... + // pmull v2.1q, v10.1d, v7.1d // .........................................*............ + // eor v6.16B, v1.16B, v31.16B // .....................................*................ + // ext v0.16B, v10.16B, v10.16B, #8 // .........................................*............ + // pmull2 v30.1q, v11.2d, v17.2d // ..................*................................... + // eor v31.16B, v3.16B, v10.16B // ........................................*............. + // aesr v8.16b, v18.16b // .......................*.............................. 
+ // eor v1.16B, v6.16B, v30.16B // .......................................*.............. + // eor v11.16B, v0.16B, v2.16B // ...........................................*.......... + // aesr v8.16b, v19.16b // ............................*......................... + // eor v30.16B, v1.16B, v31.16B // ..........................................*........... + // aesr v8.16b, v20.16b // ..............................*....................... + // eor v2.16B, v30.16B, v11.16B // .............................................*........ + // aesr v8.16b, v21.16b // ..................................*................... + // pmull v4.1q, v2.1d, v7.1d // ...............................................*...... + // ext v11.16B, v2.16B, v2.16B, #8 // ...............................................*...... + // aesr v8.16b, v22.16b // ....................................*................. + // aesr v8.16b, v23.16b // ......................................*............... + // eor v1.16B, v3.16B, v4.16B // .................................................*.... + // aesr v8.16b, v24.16b // ........................................*............. + // eor v11.16B, v1.16B, v11.16B // ...................................................*.. + // aesr v8.16b, v25.16b // ..........................................*........... + // aesr v8.16b, v26.16b // ............................................*......... + // ext v30.16B, v11.16B, v11.16B, #8 // .....................................................* + // aese v8.16b, v27.16b // ..............................................*....... + // eor v4.16B, v8.16B, v28.16B // ................................................*..... + // eor v8.16B, v4.16B, v5.16B // ..................................................*... + // str q8, [x2, #-16] // ....................................................*. 
+
+        b       Lloop_unrolled_start_end                // skip the stand-alone four-block pass below
+Lloop_unrolled_start_iter_1: // straight-line pass over one group of four 16-byte blocks, handled in the order 3, 2, 1, 0
+        // Per block k: build a 16-byte AES input on the stack from x11 and (x12 | bswap32(w13 + k) << 32), run it through round keys v18..v27, XOR in v28 and the input block, and store the result at output offset k*16.
+        // NOTE(review): w13 is presumably the block counter and x11/x12 the fixed IV words; their setup and the branch that targets this label are outside this excerpt - confirm.
+        ldr     q29, [x0, #(3*16)]                      // input block 3 (x0 = input)
+        add     w14, w13, #3                            // w14 = w13 + 3
+        rev     w14, w14                                // byte-swap the 32-bit value
+        orr     x14, x12, x14, lsl #32                  // x14 = x12 | (x14 << 32)
+        stp     x11, x14, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:writes=stack_3
+        ldr     q0, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:reads=stack_3
+        aesr    v0.16b, v18.16b                         // aesr is a macro defined outside this excerpt (presumably AESE+AESMC - confirm)
+        aesr    v0.16b, v19.16b
+        aesr    v0.16b, v20.16b
+        aesr    v0.16b, v21.16b
+        aesr    v0.16b, v22.16b
+        aesr    v0.16b, v23.16b
+        aesr    v0.16b, v24.16b
+        aesr    v0.16b, v25.16b
+        aesr    v0.16b, v26.16b
+        aese    v0.16b, v27.16b                         // last round uses plain aese
+        eor     v0.16b, v0.16b, v28.16b                 // XOR in v28
+        eor     v0.16b, v0.16b, v29.16b                 // XOR with the input block
+        str     q0, [x2, #(3*16)]                       // output block 3 (x2 = output)
+        // GHASH, block 3: byte-reverse each 64-bit half, then initialise the lo (v8), hi (v9) and mid (v10) accumulators.
+        rev64   v29.16b, v29.16b
+        // Low product: low 64 bits of the block x low 64 bits of v12
+        pmull   v8.1q, v29.1d, v12.1d
+        // High product: high 64 bits of the block x high 64 bits of v12
+        pmull2  v9.1q, v29.2d, v12.2d
+        // Middle product: (high half ^ low half of the block) x low 64 bits of v14
+        mov     d11, v29.d[1]
+        eor     v11.8b, v11.8b, v29.8b
+        pmull   v10.1q, v11.1d, v14.1d
+        // ---- block 2: same AES steps with counter offset 2 ----
+        ldr     q29, [x0, #(2*16)]
+        add     w14, w13, #2
+        rev     w14, w14
+        orr     x14, x12, x14, lsl #32
+        stp     x11, x14, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:writes=stack_2
+        ldr     q0, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:reads=stack_2
+        aesr    v0.16b, v18.16b
+        aesr    v0.16b, v19.16b
+        aesr    v0.16b, v20.16b
+        aesr    v0.16b, v21.16b
+        aesr    v0.16b, v22.16b
+        aesr    v0.16b, v23.16b
+        aesr    v0.16b, v24.16b
+        aesr    v0.16b, v25.16b
+        aesr    v0.16b, v26.16b
+        aese    v0.16b, v27.16b
+        eor     v0.16b, v0.16b, v28.16b
+        eor     v0.16b, v0.16b, v29.16b
+        str     q0, [x2, #(2*16)]
+        // GHASH, block 2: products against v13 (middle term against the high 64 bits of v14) are XORed into v8/v9/v10 via scratch v11.
+        rev64   v29.16b, v29.16b
+        // Low product
+        pmull   v11.1q, v29.1d, v13.1d
+        eor     v8.16b, v8.16b, v11.16b
+        // High product
+        pmull2  v11.1q, v29.2d, v13.2d
+        eor     v9.16b, v9.16b, v11.16b
+        // Middle product: ext swaps the halves so both lanes hold high ^ low; pmull2 then uses the upper lane
+        ext     v11.16b, v29.16b, v29.16b, #8
+        eor     v11.16b, v11.16b, v29.16b
+        pmull2  v11.1q, v11.2d, v14.2d
+        eor     v10.16b, v10.16b, v11.16b
+        // ---- block 1: same AES steps with counter offset 1 ----
+        ldr     q29, [x0, #(1*16)]
+        add     w14, w13, #1
+        rev     w14, w14
+        orr     x14, x12, x14, lsl #32
+        stp     x11, x14, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:writes=stack_1
+        ldr     q0, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:reads=stack_1
+        aesr    v0.16b, v18.16b
+        aesr    v0.16b, v19.16b
+        aesr    v0.16b, v20.16b
+        aesr    v0.16b, v21.16b
+        aesr    v0.16b, v22.16b
+        aesr    v0.16b, v23.16b
+        aesr    v0.16b, v24.16b
+        aesr    v0.16b, v25.16b
+        aesr    v0.16b, v26.16b
+        aese    v0.16b, v27.16b
+        eor     v0.16b, v0.16b, v28.16b
+        eor     v0.16b, v0.16b, v29.16b
+        str     q0, [x2, #(1*16)]
+        // GHASH, block 1: products against v15 (middle term against the low 64 bits of v17) are XORed into v8/v9/v10.
+        rev64   v29.16b, v29.16b
+        // Low product
+        pmull   v11.1q, v29.1d, v15.1d
+        eor     v8.16b, v8.16b, v11.16b
+        // High product
+        pmull2  v11.1q, v29.2d, v15.2d
+        eor     v9.16b, v9.16b, v11.16b
+        // Middle product
+        mov     d11, v29.d[1]
+        eor     v11.8b, v11.8b, v29.8b
+        pmull   v11.1q, v11.1d, v17.1d
+        eor     v10.16b, v10.16b, v11.16b
+        // ---- block 0: counter offset 0; the load and the store post-increment x0 and x2 by 4*16 ----
+        ldr     q29, [x0], #(4*16)
+        add     w14, w13, #0
+        rev     w14, w14
+        orr     x14, x12, x14, lsl #32
+        stp     x11, x14, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:writes=stack_0
+        ldr     q0, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:reads=stack_0
+        aesr    v0.16b, v18.16b
+        aesr    v0.16b, v19.16b
+        aesr    v0.16b, v20.16b
+        aesr    v0.16b, v21.16b
+        aesr    v0.16b, v22.16b
+        aesr    v0.16b, v23.16b
+        aesr    v0.16b, v24.16b
+        aesr    v0.16b, v25.16b
+        aesr    v0.16b, v26.16b
+        aese    v0.16b, v27.16b
+        eor     v0.16b, v0.16b, v28.16b
+        eor     v0.16b, v0.16b, v29.16b
+        str     q0, [x2], #(4*16)
+        // GHASH, block 0: products against v16 (middle term against the high 64 bits of v17), followed by the reduction of v8/v9/v10 into v30.
+        rev64   v29.16b, v29.16b
+        eor     v29.16b, v29.16b, v30.16b               // fold in v30 (presumably the running tag - confirm) before multiplying
+        // Low product
+        pmull   v11.1q, v29.1d, v16.1d
+        eor     v8.16b, v8.16b, v11.16b
+        // High product
+        pmull2  v11.1q, v29.2d, v16.2d
+        eor     v9.16b, v9.16b, v11.16b
+        // Middle product
+        ext     v11.16b, v29.16b, v29.16b, #8
+        eor     v11.16b, v11.16b, v29.16b
+        pmull2  v11.1q, v11.2d, v17.2d
+        eor     v10.16b, v10.16b, v11.16b
+        eor     v0.16b, v8.16b, v9.16b                  // reduction starts: v0 = lo ^ hi
+        pmull   v1.1q, v9.1d, v7.1d                     // first multiply by v7 (presumably the reduction constant - confirm)
+        ext     v9.16b, v9.16b, v9.16b, #8
+        eor     v10.16b, v10.16b, v0.16b
+        eor     v1.16b, v9.16b, v1.16b
+        eor     v10.16b, v10.16b, v1.16b
+        pmull   v9.1q, v10.1d, v7.1d                    // second multiply by v7
+        eor     v8.16b, v8.16b, v9.16b
+        ext     v10.16b, v10.16b, v10.16b, #8
+        eor     v30.16b, v8.16b, v10.16b
+        ext     v30.16b, v30.16b, v30.16b, #8           // result left in v30 with its 64-bit halves swapped
+
+        add     w13, w13, #UNROLL                       // advance w13 by UNROLL for the blocks just processed
+Lloop_unrolled_start_iter_1_end:
+Lloop_unrolled_start_end:
+Lloop_unrolled_end:
+
+        cbz     remainder, Lloop_1x_end                 // no leftover blocks: skip the one-block loop
+Lloop_1x_start:
+        // Remainder loop: one 16-byte block per pass, repeated until `remainder` reaches 0 (a zero remainder is filtered out by the cbz just before this label).
+        ldr     plain_q, [input], #16                   // load one input block, advance input by 16
+        aes_full_block aes_st, plain, res, 0            // macro defined outside this excerpt; presumably res = AES keystream for ctr+0 XOR plain - confirm
+        str     res_q, [output], #16                    // store one output block, advance output by 16
+        // Fold the input block into the tag; both ghash macros are defined outside this excerpt.
+        ghash_init_with_tag_0 plain, Ht1, Ht12, tag
+        ghash_finalize tag
+
+        add     ctr, ctr, #1                            // next counter value
+        sub     remainder, remainder, #1
+        cbnz    remainder, Lloop_1x_start               // more single blocks to do
+Lloop_1x_end:
+
+        // Return number of bytes processed
+        mov     x0, byte_len
+        // Store new authentication tag (each 64-bit half byte-reversed first)
+        rev64   tag.16b, tag.16b
+        str     tag_q, [tag_ptr]
+
+        // Store updated counter: only a byte-swapped 32-bit word at ivec+12 is written; the two commented lines are a disabled full-vector variant
+        // rev32 rtmp_ctr.16b, rtmp_ctr.16b
+        // str rtmp_ctr_q, [ivec]
+        rev     ctr_tmp_w, ctr
+        str     ctr_tmp_w, [ivec, #12]
+        // restore_vregs / restore_gprs are macros defined outside this excerpt; presumably they reload the registers saved on entry - confirm.
+        restore_vregs
+        restore_gprs
+
+Ldec_postamble_end:
+        add     sp, sp, #STACK_SIZE                     // release STACK_SIZE bytes of stack
+
+        ret
+
+#endif // __ARM_MAX_ARCH__ >= 8
+#endif // !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__)
+#if defined(__ELF__)
+// See https://www.airs.com/blog/archives/518.
+.section .note.GNU-stack,"",%progbits
+#endif
\ No newline at end of file