diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index fe32c6c968d..47326731afe 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -183,9 +183,12 @@ if(ARCH STREQUAL "aarch64")
       test/trampoline-armv8.${ASM_EXT}
       cipher_extra/chacha20_poly1305_armv8.${ASM_EXT}
-      fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-128.S
-      fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-192.S
-      fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-256.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-128.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-192.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-256.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S
+      fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S
     )
 endif()
diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S
new file mode 100644
index 00000000000..06ef696de17
--- /dev/null
+++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-128.S
@@ -0,0 +1,828 @@
+// Copyright (c) 2022, ARM Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+// Author: Hanno Becker
+//
+// This file was derived from the assembly generated from aesv8-gcm-armv8.pl,
+// written by Fangming Fang for the OpenSSL project,
+// and derived from https://github.com/ARM-software/AArch64cryptolib, original
+// author Samuel Lee .
+//
+// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing
+// the logic of the computation. It is meant as the input to manual audits /
+// formal verification, as well as automated micro-optimization such as done
+// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy).
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#include
+
+#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__)
+#if defined(__ELF__)
+#include
+#include
+.arch armv8-a+crypto
+.text
+.globl aes_gcm_dec_kernel_slothy_base_128
+.hidden aes_gcm_dec_kernel_slothy_base_128
+.type aes_gcm_dec_kernel_slothy_base_128,%function
+#elif defined(__APPLE__)
+#if defined(BORINGSSL_PREFIX)
+#include
+#endif
+#include
+.text
+.globl _aes_gcm_dec_kernel_slothy_base_128
+.private_extern _aes_gcm_dec_kernel_slothy_base_128
+#else
+#error Unknown configuration
+#endif
+
+#if __ARM_MAX_ARCH__ >= 8
+
+.align 4
+_aes_gcm_dec_kernel_slothy_base_128:
+aes_gcm_dec_kernel_slothy_base_128:
+        AARCH64_SIGN_LINK_REGISTER
+        stp     x29, x30, [sp, #-128]!
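+        // Arguments, as used by the code below (same register assignment as the
+        // aes_gcm_dec_kernel this file was derived from):
+        //   x0  in     - ciphertext pointer
+        //   x1  len    - input length in bits
+        //   x2  out    - plaintext pointer
+        //   x3  Xi     - current GHASH accumulator, read here and written back on exit
+        //   x4  ivec   - counter block; its 32-bit counter word is updated on exit
+        //   x5  key    - AES key schedule (number of rounds at offset 240)
+        //   x6  Htable - hash keys h1..h4 and the folded values h2k|h1k, h4k|h3k
+        // On return x0 holds the input length in bytes (kept in x15 below).
+        // The 128-byte frame reserved above holds the AAPCS64 callee-saved
+        // registers x19-x24 and d8-d15, saved below.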
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 
- round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + ldr q16, [x6, #16] // load h2k | h1k + ldr q17, [x6, #64] // load h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR 
block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR 
block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES 
block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - 
round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // 
GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + 
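+        // Tail handling: x5 now holds the number of bytes left to process (1..64;
+        // the setup above guarantees at least one byte is left for the tail). The
+        // compares against #48/#32/#16 below select how many full 16-byte blocks
+        // remain; each full block is decrypted, stored, and folded into the GHASH
+        // accumulators v11/v9/v10, with the running tag fed in once via v8 and then
+        // suppressed. The last, possibly partial, block is masked in
+        // Ldec_blocks_less_than_1 and merged with the existing output bytes so they
+        // are not overwritten.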
mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block 
- high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S new file mode 100644 index 00000000000..89468b97e96 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-192.S @@ -0,0 +1,828 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_192 +.hidden aes_gcm_dec_kernel_slothy_base_192 +.type aes_gcm_dec_kernel_slothy_base_192,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_192 +.private_extern _aes_gcm_dec_kernel_slothy_base_192 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +.align 4 +_aes_gcm_dec_kernel_slothy_base_192: +aes_gcm_dec_kernel_slothy_base_192: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr 
q14, [x6, #48] // load h3l | h3h + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES 
block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + ldr q16, [x6, #16] // load h2k | h1k + ldr q17, [x6, #64] // load h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + 
aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + 
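+        // GHASH math used throughout this loop: each ciphertext block is multiplied
+        // by the matching power of H with three pmull/pmull2 products (low, high,
+        // and a Karatsuba-style mid term built from the XORed 64-bit halves),
+        // accumulated in v11/v9/v10. The MODULO steps then fold the 256-bit product
+        // back to 128 bits using the constant 0xc2 shifted into the top byte,
+        // i.e. reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in
+        // bit-reflected form.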
eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // 
LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + 
eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // 
AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store 
result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, 
v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S new file mode 100644 index 00000000000..d3008cd8667 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-dec-slothy-256.S @@ -0,0 +1,828 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_256 +.hidden aes_gcm_dec_kernel_slothy_base_256 +.type aes_gcm_dec_kernel_slothy_base_256,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_256 +.private_extern _aes_gcm_dec_kernel_slothy_base_256 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +.align 4 +_aes_gcm_dec_kernel_slothy_base_256: +aes_gcm_dec_kernel_slothy_base_256: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
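+ // The stp above reserved 128 bytes of stack and saved x29/x30;
+ // callee-saved x19-x24 and d8-d15 are stored below before the key schedule and counter are loaded.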
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 
- round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + ldr q16, [x6, #16] // load h2k | h1k + ldr q17, [x6, #64] // load h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR 
block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR 
block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES 
block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - 
round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // 
GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + 
mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block 
- high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
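+// The GNU-stack note marks the stack as non-executable on ELF targets.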
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_basic.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-128.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_basic.S rename to crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-128.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_basic.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-192.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_basic.S rename to crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-192.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_basic.S b/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-256.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_basic.S rename to crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-enc-slothy-256.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_basic.S similarity index 79% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_basic.S index d23c2e41840..377083c7000 100644 --- a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_dual_acc.S +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_basic.S @@ -39,17 +39,17 @@ #include .arch armv8-a+crypto .text -.globl aes_gcm_enc_kernel_slothy_base_192 -.hidden aes_gcm_enc_kernel_slothy_base_192 -.type aes_gcm_enc_kernel_slothy_base_192,%function +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function #elif defined(__APPLE__) #if defined(BORINGSSL_PREFIX) #include #endif #include .text -.globl _aes_gcm_enc_kernel_slothy_base_192 -.private_extern _aes_gcm_enc_kernel_slothy_base_192 +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 #else #error Unknown configuration #endif @@ -74,17 +74,11 @@ full_blocks .req x7 remainder .req x9 unroll .req x10 -aes_st0 .req v0 -aes_st0_q .req q0 +aes_st .req v0 +aes_st_q .req q0 -aes_st1 .req v2 -aes_st1_q .req q2 - -res0 .req v0 -res0_q .req q0 - -res1 .req v2 -res1_q .req q2 +res .req v0 +res_q .req q0 ghash_hi .req v9 ghash_lo .req v8 @@ -144,9 +138,6 @@ rk8q .req q26 rk9q .req q27 rk10q .req q28 -rk11q .req q15 -rk12q .req q16 - rk0 .req v18 rk1 .req v19 rk2 .req v20 @@ -159,9 +150,6 @@ rk8 .req v26 rk9 .req v27 rk10 .req v28 -rk11 .req v15 -rk12 .req v16 - plain .req v29 plain_q .req q29 @@ -217,10 +205,14 @@ tag_q .req q11 ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +// Derive number of iterations of unrolled loop and single-block loop .macro prepare_loop_counts mov unroll, #UNROLL + // Number of AES Blocks (16b each) lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each msub remainder, count, unroll, full_blocks .endm @@ -231,17 +223,20 @@ tag_q .req q11 .macro load_iv ldr rtmp_ctr_q, [ivec] - mov constant_temp, #0x100000000 // set up counter increment + // set up counter increment + mov constant_temp, #0x100000000 movi 
rctr_inc.16b, #0x0 fmov rctr_inc.d[1], constant_temp rev32 rtmp_ctr.16b, rtmp_ctr.16b .endm +// Increase AES counter .macro aes_ctr_inc add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s .endm +// Increase AES counter and initialize new AES state .macro next_ctr_init_aes aes_st rev32 \aes_st\().16b, rtmp_ctr.16b aes_ctr_inc @@ -283,14 +278,13 @@ tag_q .req q11 .endm .macro aesr_final aes_st, plain, out - aese \aes_st\().16b, rk11.16b - eor3 \out\().16b, \plain\().16b, rk12.16b, \aes_st\().16b + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b .endm .macro aes_full_block aes_st, input, output next_ctr_init_aes \aes_st - aesr_0_8 \aes_st\(), rk - aesr_9_10 \aes_st\(), rk + aesr_0_8 \aes_st\(), rk aesr_final \aes_st, \input, \output .endm @@ -310,8 +304,6 @@ tag_q .req q11 load_round_key 8 load_round_key 9 load_round_key 10 - load_round_key 11 - load_round_key 12 .endm /********************************************************************/ @@ -408,52 +400,6 @@ tag_q .req q11 /* Macros for GHASH udpate */ /********************************************************************/ -.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - eor \inputa\().16b, \inputa\().16b, tag.16b - - // Low product - pmull ghash_lo.1q, \inputa\().1d, \Ha\().1d - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_hi.1q, \inputa\().2d, \Ha\().2d - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - -.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - - // Low product - pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d - eor ghash_mid.16b, ghash_mid.16b, \inputa\().16b - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - .macro ghash_init_0 input, Hk, Hk_mid, tag rev64 \input\().16b, \input\().16b eor \input\().16b, \input\().16b, \tag\().16b @@ -541,19 +487,19 @@ tag_q .req q11 /********************************************************************/ .align 4 -_aes_gcm_enc_kernel_slothy_base_192: -aes_gcm_enc_kernel_slothy_base_192: +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,_BORINGSSL_function_hit@PAGE add x9, x9, _BORINGSSL_function_hit@PAGEOFF mov w10, #1 - strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel #endif - AARCH64_SIGN_LINK_REGISTER + 
AARCH64_VALID_CALL_TARGET sub sp, sp, #STACK_SIZE -Lenc_preamble_start: +Ldec_preamble_start: save_gprs save_vregs @@ -566,33 +512,37 @@ Lenc_preamble_start: prepare_loop_counts prepare_ghash -Lenc_preamble_end: +Ldec_preamble_end: cbz count, Lloop_unrolled_end Lloop_unrolled_start: + ldr plain_q, [input], #(4*16) - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 plain, Ht4, Ht34, tag ldr plain_q, [input, #(-3*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] - load_htable_34 - ghash_init_pair res0, res1, Ht4, Ht3, Ht34 + ghash_acc_0 plain, Ht3, Ht34 ldr plain_q, [input, #(-2*16)] - aes_full_block aes_st0, plain, res0 - str res0_q, [output, #(-2*16)] - - ldr plain_q, [input, #(-1*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] load_htable_12 - ghash_acc_pair res0, res1, Ht2, Ht1, Ht12 + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] + ghash_acc_0 plain, Ht1, Ht12 ghash_finalize tag sub count, count, #1 @@ -605,10 +555,10 @@ Lloop_unrolled_end: Lloop_1x_start: ldr plain_q, [input], #16 - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #16 - ghash_init_0 res0, Ht1, Ht12, tag + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 plain, Ht1, Ht12, tag ghash_finalize tag sub remainder, remainder, #1 @@ -627,10 +577,9 @@ Lloop_1x_end: restore_vregs restore_gprs -Lenc_postamble_end: +Ldec_postamble_end: add sp, sp, #STACK_SIZE - AARCH64_VALIDATE_LINK_REGISTER ret #endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_keep_htable.S new file mode 100644 index 00000000000..250722b10ff --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_keep_htable.S @@ -0,0 +1,587 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. 
It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st0 .req v0 +aes_st0_q .req q0 +aes_st1 .req v1 +aes_st1_q .req q1 +aes_st2 .req v2 +aes_st2_q .req q2 +aes_st3 .req v3 +aes_st3_q .req q3 + +res0 .req v0 +res0_q .req q0 +res1 .req v1 +res1_q .req q1 +res2 .req v2 +res2_q .req q2 +res3 .req v3 +res3_q .req q3 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +Ht5q .req Ht3q +Ht6q .req Ht4q +Ht56q .req Ht34q + +Ht5 .req Ht3 +Ht6 .req Ht4 +Ht56 .req Ht34 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro 
restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + mov constant_temp, #0x100000000 // set up counter increment + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +.macro next_ctr_init_aes_st aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_x4 data0, data1, data2, data3, key + aesr \data0\(), \key\() + aesr \data1\(), \key\() + aesr \data2\(), \key\() + aesr \data3\(), \key\() +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_0_8_x4 data0, data1, data2, data3, key + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()0.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()1.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()2.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()3.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()4.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()5.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()6.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()7.16b + aesr_x4 \data0\().16b, \data1\().16b, \data2\().16b, \data3\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro load_round_key i + ldr rk\()\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + 
ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro prepare_loop_counts + mov unroll, #UNROLL + lsr full_blocks, byte_len, #4 + udiv count, full_blocks, unroll 
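+ // Tail-loop iterations: msub computes full_blocks - count*unroll, i.e. full_blocks mod UNROLL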
+ msub remainder, count, unroll, full_blocks +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_SIGN_LINK_REGISTER + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + + load_htable_34 + load_htable_12 + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + next_ctr_init_aes_st aes_st0 + next_ctr_init_aes_st aes_st1 + next_ctr_init_aes_st aes_st2 + next_ctr_init_aes_st aes_st3 + + aesr_0_8 aes_st0, rk + aesr_0_8 aes_st1, rk + aesr_0_8 aes_st2, rk + aesr_0_8 aes_st3, rk + + ldr plain_q, [input], #(4*16) + aesr_final aes_st0, plain, res0 + str res0_q, [output], #(4*16) + + ghash_init_1 plain, Ht4, Ht34, tag + + ldr plain_q, [input, #(-3*16)] + aesr_final aes_st1, plain, res1 + str res1_q, [output, #(-3*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input, #(-2*16)] + aesr_final aes_st2, plain, res2 + str res2_q, [output, #(-2*16)] + + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aesr_final aes_st3, plain, res3 + str res3_q, [output, #(-1*16)] + + ghash_acc_0 plain, Ht1, Ht12 + ghash_finalize tag + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + next_ctr_init_aes_st aes_st0 + aesr_0_8 aes_st0, rk + + ldr plain_q, [input], #16 + aesr_final aes_st0, plain, res0 + str res0_q, [output], #16 + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem2.S new file mode 100644 index 00000000000..2e352527c7b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem2.S @@ -0,0 +1,598 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + 
STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 0*16)] // @slothy:writes=stack_0 + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 1*16)] // @slothy:writes=stack_1 + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 2*16)] // @slothy:writes=stack_2 + stp ivec_0_63, ivec_64_96, [sp, #(STACK_BASE_AES_ST + 3*16)] // @slothy:writes=stack_3 + + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + rev ctr_tmp_w, ctr + str ctr_tmp_w, [sp, #(STACK_BASE_AES_ST + \loc*16 + 12)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc + add ctr, ctr, #1 +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese 
\aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + 
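+ // Karatsuba-style cross term: (input_hi ^ input_lo) multiplied by Hk_mid, which is expected
+ // to hold the XOR of the two 64-bit halves of the corresponding H power. The surplus
+ // hi*hi and lo*lo contributions are cancelled in ghash_finalize (ghash_mid ^= ghash_lo ^ ghash_hi).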
mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 plain, Ht4, Ht34, tag + + ldr plain_q, [input, #(-3*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(-3*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input, #(-2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(-1*16)] + + ghash_acc_0 plain, Ht1, Ht12 + + ghash_finalize tag + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + 
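+ // Note: only the 32-bit big-endian counter word at bytes 12..15 of the IV block was updated
+ // above; the scalar store replaces the commented-out full-vector write-back of rtmp_ctr.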
restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag.S new file mode 100644 index 00000000000..5516f11a5d9 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag.S @@ -0,0 +1,650 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
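+//
+// Variant note (late tag): compared to the preceding x4 decryption variants in this patch, the
+// four blocks of each unrolled iteration are processed in reverse order and the running tag is
+// only folded into the GHASH accumulation of the last block handled (the one multiplied by H^4)
+// via the *_with_tag macros, which shortens the dependency chain on the tag. Counter blocks are
+// built in scalar registers and staged through the stack (see next_ctr_init_aes) rather than
+// kept in a NEON counter register.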
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, 
#(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro 
load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, 
ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + load_htable_12 + + ldr plain_q, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + ghash_init_0 plain, Ht1, Ht12 + + ldr plain_q, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 plain, Ht2, Ht12 + + load_htable_34 + + ldr plain_q, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(4*16) + + ghash_acc_with_tag_1 plain, Ht4, Ht34, tag + ghash_finalize tag + + add ctr, ctr, #UNROLL + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, 
Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_with_tag_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + add ctr, ctr, #1 + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S new file mode 100644 index 00000000000..b8b2e0f480b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/dec/aesv8-gcm-armv8-dec-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S @@ -0,0 +1,633 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
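+//
+// Variant note (late tag, keep htable): same block schedule as the late-tag kernel, but
+// H^1..H^4 and the combined entries Ht12/Ht34 are given dedicated vector registers (q12-q17),
+// so the H-table is loaded once in the preamble instead of being reloaded inside the loops.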
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_dec_kernel_slothy_base_128 +.hidden aes_gcm_dec_kernel_slothy_base_128 +.type aes_gcm_dec_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_dec_kernel_slothy_base_128 +.private_extern _aes_gcm_dec_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x16 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of 
iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \aes_st\().16b, rk10.16b, \plain\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro 
load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, 
\input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_dec_kernel_slothy_base_128: +aes_gcm_dec_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_dec_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Ldec_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + + load_htable_12 + load_htable_34 + +Ldec_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + ldr plain_q, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + ghash_init_0 plain, Ht1, Ht12 + + ldr plain_q, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 plain, Ht2, Ht12 + + ldr plain_q, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + ghash_acc_0 plain, Ht3, Ht34 + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(4*16) + + ghash_acc_with_tag_1 plain, Ht4, Ht34, tag + ghash_finalize tag + + add ctr, ctr, #UNROLL + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + + ghash_init_with_tag_0 plain, Ht1, Ht12, tag + ghash_finalize tag + + add ctr, ctr, #1 + 
sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Ldec_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_basic.S similarity index 81% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_basic.S index 4a74ddb91e8..d0d57944901 100644 --- a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc_keep_htable.S +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_basic.S @@ -74,17 +74,11 @@ full_blocks .req x7 remainder .req x9 unroll .req x10 -aes_st0 .req v0 -aes_st0_q .req q0 +aes_st .req v0 +aes_st_q .req q0 -aes_st1 .req v2 -aes_st1_q .req q2 - -res0 .req v0 -res0_q .req q0 - -res1 .req v2 -res1_q .req q2 +res .req v0 +res_q .req q0 ghash_hi .req v9 ghash_lo .req v8 @@ -108,13 +102,29 @@ Ht1 .req v12 Ht2 .req v13 Ht12 .req v14 -Ht3q .req q15 -Ht4q .req q16 -Ht34q .req q17 +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q -Ht3 .req v15 -Ht4 .req v16 -Ht34 .req v17 +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 rk0q .req q18 rk1q .req q19 @@ -195,10 +205,14 @@ tag_q .req q11 ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +// Derive number of iterations of unrolled loop and single-block loop .macro prepare_loop_counts mov unroll, #UNROLL + // Number of AES Blocks (16b each) lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each msub remainder, count, unroll, full_blocks .endm @@ -209,17 +223,20 @@ tag_q .req q11 .macro load_iv ldr rtmp_ctr_q, [ivec] - mov constant_temp, #0x100000000 // set up counter increment + // set up counter increment + mov constant_temp, #0x100000000 movi rctr_inc.16b, #0x0 fmov rctr_inc.d[1], constant_temp rev32 rtmp_ctr.16b, rtmp_ctr.16b .endm +// Increase AES counter .macro aes_ctr_inc add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s .endm +// Increase AES counter and initialize new AES state .macro next_ctr_init_aes aes_st rev32 \aes_st\().16b, rtmp_ctr.16b aes_ctr_inc @@ -272,7 +289,7 @@ tag_q .req q11 .endm .macro load_round_key i - ldr rk\()\i\()q, [key, #((\i)*16)] + ldr rk\i\()q, [key, #((\i)*16)] .endm .macro load_round_keys @@ -311,6 +328,22 @@ tag_q .req q11 ldr \dst_q, [Htable, #80] .endm +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + 
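+// The offsets above and below follow a repeating 48-byte pattern per pair of powers:
+// H^(2i-1) at (i-1)*48, the combined entry for the pair at (i-1)*48 + 16, and H^(2i) at
+// (i-1)*48 + 32. The combined entries (H12, H34, H56, H78) are assumed to hold the XORed
+// 64-bit halves used for the Karatsuba-style middle products in the GHASH macros.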
.macro load_h12 dst, dst_q ldr \dst_q, [Htable, #16] .endm @@ -319,13 +352,24 @@ tag_q .req q11 ldr \dst_q, [Htable, #64] .endm +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + .macro load_full_htable load_h1 Ht1, Ht1q load_h2 Ht2, Ht2q load_h3 Ht3, Ht3q load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q load_h12 Ht12, Ht12q load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q .endm .macro load_htable_12 @@ -340,56 +384,22 @@ tag_q .req q11 load_h34 Ht34, Ht34q .endm -/********************************************************************/ -/* Macros for GHASH udpate */ -/********************************************************************/ - -.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - eor \inputa\().16b, \inputa\().16b, tag.16b - - // Low product - pmull ghash_lo.1q, \inputa\().1d, \Ha\().1d - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_hi.1q, \inputa\().2d, \Ha\().2d - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q .endm -.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - - // Low product - pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d - eor ghash_mid.16b, ghash_mid.16b, \inputa\().16b - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q .endm +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + .macro ghash_init_0 input, Hk, Hk_mid, tag rev64 \input\().16b, \input\().16b eor \input\().16b, \input\().16b, \tag\().16b @@ -486,7 +496,7 @@ aes_gcm_enc_kernel_slothy_base_128: strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel #endif - AARCH64_SIGN_LINK_REGISTER + AARCH64_VALID_CALL_TARGET sub sp, sp, #STACK_SIZE Lenc_preamble_start: @@ -499,9 +509,6 @@ Lenc_preamble_start: load_tag load_iv - load_htable_34 - load_htable_12 - prepare_loop_counts prepare_ghash @@ -511,24 +518,30 @@ Lenc_preamble_end: Lloop_unrolled_start: ldr plain_q, [input], #(4*16) - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + 
ghash_init_1 res, Ht4, Ht34, tag ldr plain_q, [input, #(-3*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] - ghash_init_pair res0, res1, Ht4, Ht3, Ht34 + ghash_acc_0 res, Ht3, Ht34 ldr plain_q, [input, #(-2*16)] - aes_full_block aes_st0, plain, res0 - str res0_q, [output, #(-2*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 res, Ht2, Ht12 ldr plain_q, [input, #(-1*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] - ghash_acc_pair res0, res1, Ht2, Ht1, Ht12 + ghash_acc_0 res, Ht1, Ht12 ghash_finalize tag @@ -536,13 +549,15 @@ Lloop_unrolled_start: cbnz count, Lloop_unrolled_start Lloop_unrolled_end: + load_htable_12 + cbz remainder, Lloop_1x_end Lloop_1x_start: ldr plain_q, [input], #16 - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #16 - ghash_init_0 res0, Ht1, Ht12, tag + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 res, Ht1, Ht12, tag ghash_finalize tag @@ -565,7 +580,6 @@ Lloop_1x_end: Lenc_postamble_end: add sp, sp, #STACK_SIZE - AARCH64_VALIDATE_LINK_REGISTER ret #endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-128.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-128.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_dual_acc_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_ilp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_ilp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_keep_htable_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_keep_htable_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_late_tag.S similarity index 100% rename from 
crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_full.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_full.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_full.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_full.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag.S similarity index 100% rename from 
crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_basic.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_basic.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_ilp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x6_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x6_ilp.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_basic.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_basic.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc_late_tag.S new file mode 100644 index 00000000000..f78f8146a43 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_dual_acc_late_tag.S @@ -0,0 +1,716 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
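+//
+// Register/argument usage, as encoded by the .req aliases below (a summary,
+// not an authoritative interface description): x0 = input, x1 = length in
+// bits, x2 = output, x3 = pointer to the running GHASH tag, x4 = IV/counter
+// block, x5 = AES round keys, x6 = table of precomputed GHASH H powers. On
+// return, x0 holds the number of bytes processed.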
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel_slothy_base_128 +.hidden aes_gcm_enc_kernel_slothy_base_128 +.type aes_gcm_enc_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_enc_kernel_slothy_base_128 +.private_extern _aes_gcm_enc_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st0 .req v0 +aes_st0_q .req q0 +aes_st1 .req v1 +aes_st1_q .req q1 +aes_st2 .req v2 +aes_st2_q .req q2 +aes_st3 .req v3 +aes_st3_q .req q3 +aes_st4 .req v20 +aes_st4_q .req q20 +aes_st5 .req v21 +aes_st5_q .req q21 +aes_st6 .req v22 +aes_st6_q .req q22 +aes_st7 .req v23 +aes_st7_q .req q23 + +res0 .req v4 +res0_q .req q4 +res1 .req v5 +res1_q .req q5 +res2 .req v6 +res2_q .req q6 +res3 .req v24 +res3_q .req q24 +res4 .req v25 +res4_q .req q25 +res5 .req v26 +res5_q .req q26 +res6 .req v27 +res6_q .req q27 +res7 .req v28 +res7_q .req q28 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 + +ghash_hi0 .req ghash_hi +ghash_lo0 .req ghash_lo +ghash_mid0 .req ghash_mid + +ghash_hi1 .req v15 +ghash_lo1 .req v16 +ghash_mid1 .req v17 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +Ht5q .req Ht1q +Ht6q .req Ht2q +Ht56q .req Ht12q + +Ht5 .req Ht1 +Ht6 .req Ht2 +Ht56 .req Ht12 + +Ht7q .req Ht1q +Ht8q .req Ht2q +Ht78q .req Ht12q + +Ht7 .req Ht1 +Ht8 .req Ht2 +Ht78 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q18 //q20 +rk3q .req q19 //q21 +rk4q .req q18 //q22 +rk5q .req q19 //q23 +rk6q .req q18 //q24 +rk7q .req q19 //q25 +rk8q .req q18 //q26 +rk9q .req q19 //q27 +rk10q .req q18 //q28 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v18 //v20 +rk3 .req v19 //v21 +rk4 .req v18 //v22 +rk5 .req v19 //v23 +rk6 .req v18 //v24 +rk7 .req v19 //v25 +rk8 .req v18 //v26 +rk9 .req v19 //v27 +rk10 .req v18 //v28 + +plain0 .req res0 +plain0_q .req res0_q +plain1 .req res1 +plain1_q .req res1_q +plain2 .req res2 +plain2_q .req res2_q +plain3 .req res3 +plain3_q .req res3_q +plain4 .req res4 +plain4_q .req res4_q +plain5 .req res5 +plain5_q .req res5_q +plain6 .req res6 +plain6_q .req res6_q +plain7 .req res7 +plain7_q .req res7_q + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 8 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, 
d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro prepare_loop_counts + mov unroll, #UNROLL + lsr full_blocks, byte_len, #4 + udiv count, full_blocks, unroll + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + mov constant_temp, #0x100000000 // set up counter increment + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +.macro next_ctr_init_aes aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +.macro next_ctr_init_aes_x8 aes_st + next_ctr_init_aes \aes_st\()0 + next_ctr_init_aes \aes_st\()1 + next_ctr_init_aes \aes_st\()2 + next_ctr_init_aes \aes_st\()3 + next_ctr_init_aes \aes_st\()4 + next_ctr_init_aes \aes_st\()5 + next_ctr_init_aes \aes_st\()6 + next_ctr_init_aes \aes_st\()7 +.endm + +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold + aese \data, \key + aesmc \data, \data +.endm + + +.macro aesr_x4 st0, st1, st2, st3, rk // @slothy:no-unfold + aesr \st0, \rk + aesr \st1, \rk + aesr \st2, \rk + aesr \st3, \rk +.endm + +.macro aese_x4 st0, st1, st2, st3, rk // @slothy:no-unfold + aese \st0, \rk + aese \st1, \rk + aese \st2, \rk + aese \st3, \rk +.endm + +.macro aesr_x8 i + load_round_key \i + aesr_x4 aes_st0.16b, aes_st1.16b, aes_st2.16b, aes_st3.16b, rk\i\().16b + aesr_x4 aes_st4.16b, aes_st5.16b, aes_st6.16b, aes_st7.16b, rk\i\().16b +.endm + +.macro aese_x8 i + load_round_key \i + aese_x4 aes_st0.16b, aes_st1.16b, aes_st2.16b, aes_st3.16b, rk\i\().16b + aese_x4 aes_st4.16b, aes_st5.16b, aes_st6.16b, aes_st7.16b, rk\i\().16b +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk9.16b + eor3 \out\().16b, \plain\().16b, rk10.16b, \aes_st\().16b +.endm + +// Load i-th round key +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be 
synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid, i + // Low product + pmull ghash_lo\i\().1q, \inputa\().1d, \Ha\().1d + pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d + eor ghash_lo\i\().16b, ghash_lo\i\().16b, ghash_tmp.16b + // High product + pmull2 ghash_hi\i\().1q, \inputa\().2d, \Ha\().2d + pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d + eor ghash_hi\i\().16b, ghash_hi\i\().16b, ghash_tmp.16b + // Middle product + trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d + trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d + eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b + pmull2 ghash_mid\i\().1q, ghash_tmp.2d, \Hk_mid\().2d + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid\i\().16b, ghash_mid\i\().16b, ghash_tmp.16b +.endm + +.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid, i + // Low product + pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d + eor ghash_lo\i\().16b, ghash_lo\i\().16b, ghash_tmp.16b + pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d + eor ghash_lo\i\().16b, ghash_lo\i\().16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d + eor ghash_hi\i\().16b, ghash_hi\i\().16b, ghash_tmp.16b + pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d + eor ghash_hi\i\().16b, ghash_hi\i\().16b, ghash_tmp.16b + // Middle product + trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d + trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d + eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b + pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid\i\().16b, ghash_mid\i\().16b, \inputa\().16b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid\i\().16b, ghash_mid\i\().16b, ghash_tmp.16b +.endm + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov 
ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro store_0_8 res + stp \res\()0_q, \res\()1_q, [output], #(8*16) + stp \res\()2_q, \res\()3_q, [output, #(-6*16)] + stp \res\()4_q, \res\()5_q, [output, #(-4*16)] + stp \res\()6_q, \res\()7_q, [output, #(-2*16)] + rev64 \res\()0.16b, \res\()0.16b + rev64 \res\()1.16b, \res\()1.16b + rev64 \res\()2.16b, \res\()2.16b + rev64 \res\()3.16b, \res\()3.16b + rev64 \res\()4.16b, \res\()4.16b + rev64 \res\()5.16b, \res\()5.16b + rev64 \res\()6.16b, \res\()6.16b + rev64 \res\()7.16b, \res\()7.16b +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_enc_kernel_slothy_base_128: +aes_gcm_enc_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel +#endif + + AARCH64_SIGN_LINK_REGISTER + sub sp, sp, #STACK_SIZE + +Lenc_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Lenc_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + next_ctr_init_aes aes_st0 // @slothy:pre=true + next_ctr_init_aes aes_st1 // @slothy:pre=true + next_ctr_init_aes aes_st2 // @slothy:pre=true + next_ctr_init_aes aes_st3 // @slothy:pre=true + next_ctr_init_aes aes_st4 // @slothy:pre=true + next_ctr_init_aes aes_st5 // @slothy:pre=true + next_ctr_init_aes aes_st6 // @slothy:pre=true + next_ctr_init_aes aes_st7 + + aesr_x8 0 // @slothy:core=true + aesr_x8 1 // @slothy:core=true + aesr_x8 2 // @slothy:core=true + aesr_x8 3 // @slothy:core=true + aesr_x8 4 // @slothy:core=true + aesr_x8 5 // @slothy:core=true + aesr_x8 6 // @slothy:core=true + aesr_x8 7 // @slothy:core=true + aesr_x8 8 // @slothy:core=true + aese_x8 9 // @slothy:core=true + + load_round_key 10 + ldp plain0_q, plain1_q, [input], #(8*16) + eor3 res0.16b, plain0.16b, rk10.16b, aes_st0.16b // @slothy:core=true + eor3 res1.16b, plain1.16b, rk10.16b, aes_st1.16b // @slothy:core=true + ldp plain2_q, plain3_q, [input, #(-6*16)] + eor3 
res2.16b, plain2.16b, rk10.16b, aes_st2.16b // @slothy:core=true + eor3 res3.16b, plain3.16b, rk10.16b, aes_st3.16b // @slothy:core=true + ldp plain4_q, plain5_q, [input, #(-4*16)] + eor3 res4.16b, plain4.16b, rk10.16b, aes_st4.16b // @slothy:core=true + eor3 res5.16b, plain5.16b, rk10.16b, aes_st5.16b // @slothy:core=true + ldp plain6_q, plain7_q, [input, #(-2*16)] + eor3 res6.16b, plain6.16b, rk10.16b, aes_st6.16b // @slothy:core=true + eor3 res7.16b, plain7.16b, rk10.16b, aes_st7.16b // @slothy:core=true + store_0_8 res + + eor res0.16b, res0.16b, tag.16b + + load_htable_78 + ghash_init_pair res0, res1, Ht8, Ht7, Ht78, 0 // @slothy:post=true + load_htable_56 + ghash_init_pair res2, res3, Ht6, Ht5, Ht56, 1 // @slothy:post=true + load_htable_34 + ghash_acc_pair res4, res5, Ht4, Ht3, Ht34, 0 // @slothy:post=true + load_htable_12 + ghash_acc_pair res6, res7, Ht2, Ht1, Ht12, 1 // @slothy:post=true + + eor ghash_lo.16b, ghash_lo0.16b, ghash_lo1.16b + eor ghash_hi.16b, ghash_hi0.16b, ghash_hi1.16b + eor ghash_mid.16b, ghash_mid0.16b, ghash_mid1.16b + + ghash_finalize tag // @slothy:post=true + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + next_ctr_init_aes aes_st0 + + load_round_key 0 + aesr aes_st0.16b, rk0.16b + + load_round_key 1 + aesr aes_st0.16b, rk1.16b + + load_round_key 2 + aesr aes_st0.16b, rk2.16b + + load_round_key 3 + aesr aes_st0.16b, rk3.16b + + load_round_key 4 + aesr aes_st0.16b, rk4.16b + + load_round_key 5 + aesr aes_st0.16b, rk5.16b + + load_round_key 6 + aesr aes_st0.16b, rk6.16b + + load_round_key 7 + aesr aes_st0.16b, rk7.16b + + load_round_key 8 + aesr aes_st0.16b, rk8.16b + + load_round_key 9 + aese aes_st0.16b, rk9.16b + + load_round_key 10 + ldr plain0_q, [input], #16 + eor3 res0.16b, plain0.16b, rk10.16b, aes_st0.16b + str res0_q, [output], #16 + + ghash_init_0 res0, Ht1, Ht12, tag + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Lenc_postamble_end: + add sp, sp, #STACK_SIZE + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_manual_eor3.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_manual_eor3.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_ilp_rotate_manual_eor3.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_ilp_rotate_manual_eor3.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_reload_ldp_stp_simpler_manual_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S new file mode 100644 index 00000000000..6e891d62a9c --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S @@ -0,0 +1,752 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). 
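+//
+// Variant notes (a reading of the code below, matching the file name, not a
+// specification): the IV and counter are kept in scalar registers and fresh
+// AES counter blocks are materialized through per-block stack slots; the
+// final round key (rk10) is held in scalar registers and XORed into the
+// plaintext on the scalar side; the running tag is folded in only at the
+// last GHASH accumulation of each unrolled iteration ("late tag"); and the
+// unrolled loop is manually rotated so that one iteration's GHASH
+// finalization sits next to the first AES blocks of the next iteration.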
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel_slothy_base_128 +.hidden aes_gcm_enc_kernel_slothy_base_128 +.type aes_gcm_enc_kernel_slothy_base_128,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_enc_kernel_slothy_base_128 +.private_extern _aes_gcm_enc_kernel_slothy_base_128 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x16 +unroll .req x10 + +ctr_tmp .req x14 +ctr_tmp_w .req w14 + +ivec_0_63 .req x11 +ivec_64_96 .req x12 +ivec_64_96_w .req w12 + +ctr .req w13 +ctr_x .req x13 + +aes_st .req v0 +aes_st_q .req q0 +aes_st_d .req d0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req q15 +Ht4q .req q16 +Ht34q .req q17 + +Ht3 .req v15 +Ht4 .req v16 +Ht34 .req v17 + +Ht5q .req q15 +Ht6q .req q16 +Ht56q .req q17 + +Ht5 .req v15 +Ht6 .req v16 +Ht56 .req v17 + +Ht7q .req q15 +Ht8q .req q16 +Ht78q .req q17 + +Ht7 .req v15 +Ht8 .req v16 +Ht78 .req v17 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk10_lo .req x20 +rk10_hi .req x21 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 +rk9 .req v27 +rk10 .req v28 + +plain .req v29 +plain_q .req q29 + +plain_lo .req x22 +plain_hi .req x23 + +tag .req v30 +tag_q .req q30 + +#define UNROLL 8 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS + UNROLL*16) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) +#define STACK_BASE_AES_ST (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, 
x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldp ivec_0_63, ivec_64_96, [ivec] + lsr ctr_x, ivec_64_96, #32 + rev ctr, ctr + orr ivec_64_96_w, ivec_64_96_w, ivec_64_96_w // clear top 32 bit +.endm + +.macro next_ctr_init_aes aes_st, loc + add ctr_tmp_w, ctr, #\loc + rev ctr_tmp_w, ctr_tmp_w + orr ctr_tmp, ivec_64_96, ctr_tmp, lsl #32 + stp ivec_0_63, ctr_tmp, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \aes_st\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out, loc + aese \aes_st\().16b, rk9.16b + eor \plain\()_lo, \plain\()_lo, rk10_lo + eor \plain\()_hi, \plain\()_hi, rk10_hi + stp \plain\()_lo, \plain\()_hi, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:writes=stack_\loc + ldr \plain\()_q, [sp, #(STACK_BASE_AES_ST + \loc*16)] // @slothy:reads=stack_\loc + eor \out\().16b, \plain\().16b, \aes_st\().16b +.endm + +.macro aes_full_block aes_st, input, output, loc + next_ctr_init_aes \aes_st, \loc + aesr_0_8 \aes_st\(), rk + aesr_final \aes_st, \input, \output, \loc +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_key_scalar i + ldp rk\i\()_lo, rk\i\()_hi, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key_scalar 10 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, 
[Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h5 dst, dst_q + ldr \dst_q, [Htable, #96] +.endm + +.macro load_h6 dst, dst_q + ldr \dst_q, [Htable, #128] +.endm + +.macro load_h7 dst, dst_q + ldr \dst_q, [Htable, #144] +.endm + +.macro load_h8 dst, dst_q + ldr \dst_q, [Htable, #176] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_h56 dst, dst_q + ldr \dst_q, [Htable, #112] +.endm + +.macro load_h78 dst, dst_q + ldr \dst_q, [Htable, #160] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_56 + load_h5 Ht5, Ht5q + load_h6 Ht6, Ht6q + load_h56 Ht56, Ht56q +.endm + +.macro load_htable_78 + load_h7 Ht7, Ht7q + load_h8 Ht8, Ht8q + load_h78 Ht78, Ht78q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_init_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull 
ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_with_tag_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_with_tag_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_enc_kernel_slothy_base_128: +aes_gcm_enc_kernel_slothy_base_128: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Lenc_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Lenc_preamble_end: + + cbz count, Lloop_unrolled_end + + ldp plain_lo, plain_hi, [input, #(7*16)] + aes_full_block aes_st, plain, res, 7 + str res_q, [output, #(7*16)] + + load_htable_12 + ghash_init_0 res, Ht1, Ht12 + + ldp plain_lo, plain_hi, [input, #(6*16)] + aes_full_block aes_st, plain, res, 6 + str res_q, [output, #(6*16)] + + ghash_acc_1 res, Ht2, Ht12 + + ldp plain_lo, plain_hi, [input, #(5*16)] + aes_full_block 
aes_st, plain, res, 5 + str res_q, [output, #(5*16)] + + load_htable_34 + ghash_acc_0 res, Ht3, Ht34 + + sub count, count, #1 + cbz count, Lloop_unrolled_core_end +Lloop_unrolled_start: + + ldp plain_lo, plain_hi, [input, #(4*16)] + aes_full_block aes_st, plain, res, 4 + str res_q, [output, #(4*16)] + + ghash_acc_1 res, Ht4, Ht34 + + ldp plain_lo, plain_hi, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + load_htable_56 + ghash_acc_0 res, Ht5, Ht56 + + ldp plain_lo, plain_hi, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 res, Ht6, Ht56 + + ldp plain_lo, plain_hi, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + load_htable_78 + ghash_acc_0 res, Ht7, Ht78 + + ldp plain_lo, plain_hi, [input], #(8*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(8*16) + + ghash_acc_with_tag_1 res, Ht8, Ht78, tag + + ghash_finalize tag + + add ctr, ctr, #UNROLL + + ldp plain_lo, plain_hi, [input, #(7*16)] + aes_full_block aes_st, plain, res, 7 + str res_q, [output, #(7*16)] + + load_htable_12 + ghash_init_0 res, Ht1, Ht12 + + ldp plain_lo, plain_hi, [input, #(6*16)] + aes_full_block aes_st, plain, res, 6 + str res_q, [output, #(6*16)] + + ghash_acc_1 res, Ht2, Ht12 + + ldp plain_lo, plain_hi, [input, #(5*16)] + aes_full_block aes_st, plain, res, 5 + str res_q, [output, #(5*16)] + + load_htable_34 + ghash_acc_0 res, Ht3, Ht34 + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_core_end: + + ldp plain_lo, plain_hi, [input, #(4*16)] + aes_full_block aes_st, plain, res, 4 + str res_q, [output, #(4*16)] + + ghash_acc_1 res, Ht4, Ht34 + + ldp plain_lo, plain_hi, [input, #(3*16)] + aes_full_block aes_st, plain, res, 3 + str res_q, [output, #(3*16)] + + load_htable_56 + ghash_acc_0 res, Ht5, Ht56 + + ldp plain_lo, plain_hi, [input, #(2*16)] + aes_full_block aes_st, plain, res, 2 + str res_q, [output, #(2*16)] + + ghash_acc_1 res, Ht6, Ht56 + + ldp plain_lo, plain_hi, [input, #(1*16)] + aes_full_block aes_st, plain, res, 1 + str res_q, [output, #(1*16)] + + load_htable_78 + ghash_acc_0 res, Ht7, Ht78 + + ldp plain_lo, plain_hi, [input], #(8*16) + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #(8*16) + + ghash_acc_with_tag_1 res, Ht8, Ht78, tag + + ghash_finalize tag // @slothy:late + + add ctr, ctr, #UNROLL + +Lloop_unrolled_end: + + load_htable_12 + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldp plain_lo, plain_hi, [input], #16 + aes_full_block aes_st, plain, res, 0 + str res_q, [output], #16 + ghash_init_with_tag_0 res, Ht1, Ht12, tag + + ghash_finalize tag + + add ctr, ctr, #1 + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + + // Store updated counter + // rev32 rtmp_ctr.16b, rtmp_ctr.16b + // str rtmp_ctr_q, [ivec] + rev ctr_tmp_w, ctr + str ctr_tmp_w, [ivec, #12] + + restore_vregs + restore_gprs + +Lenc_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_basic.S new file mode 100644 index 00000000000..6d450e4d22b --- /dev/null +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_basic.S @@ -0,0 +1,548 @@ +// Copyright (c) 2022, ARM Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// Author: Hanno Becker +// +// This file was derived from the assembly generated from aesv8-gcm-armv8.pl, +// written by Fangming Fang for the OpenSSL project, +// and derived from https://github.com/ARM-software/AArch64cryptolib, original +// author Samuel Lee . +// +// The code below is a 'clean' AArch64 implementation of AES-GCM emphasizing +// the logic of the computation. It is meant as the input to manual audits / +// formal verification, as well as automated micro-optimization such as done +// by the SLOTHY superoptimizer (https://github.com/slothy-optimizer/slothy). + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) +#if defined(__ELF__) +#include +#include +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel_slothy_base_192 +.hidden aes_gcm_enc_kernel_slothy_base_192 +.type aes_gcm_enc_kernel_slothy_base_192,%function +#elif defined(__APPLE__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include +.text +.globl _aes_gcm_enc_kernel_slothy_base_192 +.private_extern _aes_gcm_enc_kernel_slothy_base_192 +#else +#error Unknown configuration +#endif + +#if __ARM_MAX_ARCH__ >= 8 + +// Arguments +input .req x0 +len_bits .req x1 +output .req x2 +tag_ptr .req x3 +ivec .req x4 +key .req x5 +Htable .req x6 + +byte_len .req x15 + +constant_temp .req x25 + +count .req x1 +full_blocks .req x7 +remainder .req x9 +unroll .req x10 + +aes_st .req v0 +aes_st_q .req q0 + +res .req v0 +res_q .req q0 + +ghash_hi .req v9 +ghash_lo .req v8 +ghash_mid .req v10 +ghash_mid_d .req d10 + +ghash_tmp .req v11 +ghash_tmp_d .req d11 + +ghash_mod .req v7 +ghash_mod_d .req d7 + +modulo_tmp0 .req v0 +modulo_tmp1 .req v1 + +Ht1q .req q12 +Ht2q .req q13 +Ht12q .req q14 + +Ht1 .req v12 +Ht2 .req v13 +Ht12 .req v14 + +Ht3q .req Ht1q +Ht4q .req Ht2q +Ht34q .req Ht12q + +Ht3 .req Ht1 +Ht4 .req Ht2 +Ht34 .req Ht12 + +rk0q .req q18 +rk1q .req q19 +rk2q .req q20 +rk3q .req q21 +rk4q .req q22 +rk5q .req q23 +rk6q .req q24 +rk7q .req q25 +rk8q .req q26 +rk9q .req q27 +rk10q .req q28 + +rk11q .req q15 +rk12q .req q16 +rk13q .req q17 +rk14q .req q2 + +rk0 .req v18 +rk1 .req v19 +rk2 .req v20 +rk3 .req v21 +rk4 .req v22 +rk5 .req v23 +rk6 .req v24 +rk7 .req v25 +rk8 .req v26 
+rk9 .req v27 +rk10 .req v28 + +rk11 .req v15 +rk12 .req v16 +rk13 .req v17 +rk14 .req v2 + +plain .req v29 +plain_q .req q29 + +rctr_inc .req v30 +rtmp_ctr .req v31 +rtmp_ctr_q .req q31 + +tag .req v11 +tag_q .req q11 + +#define UNROLL 4 + +#define STACK_SIZE_GPRS (6*16) +#define STACK_SIZE_VREGS (4*16) +#define STACK_SIZE (STACK_SIZE_GPRS + STACK_SIZE_VREGS) + +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (STACK_SIZE_GPRS) + +/********************************************************************/ +/* Generic preamble/postamble macros */ +/********************************************************************/ + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +// Derive number of iterations of unrolled loop and single-block loop +.macro prepare_loop_counts + mov unroll, #UNROLL + // Number of AES Blocks (16b each) + lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop + udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each + msub remainder, count, unroll, full_blocks +.endm + +/********************************************************************/ +/* AES related macros */ +/********************************************************************/ + +.macro load_iv + ldr rtmp_ctr_q, [ivec] + + // set up counter increment + mov constant_temp, #0x100000000 + movi rctr_inc.16b, #0x0 + fmov rctr_inc.d[1], constant_temp + + rev32 rtmp_ctr.16b, rtmp_ctr.16b +.endm + +// Increase AES counter +.macro aes_ctr_inc + add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s +.endm + +// Increase AES counter and initialize new AES state +.macro next_ctr_init_aes aes_st + rev32 \aes_st\().16b, rtmp_ctr.16b + aes_ctr_inc +.endm + +// A single AES round +// Prevent SLOTHY from unfolding because uArchs tend to fuse AESMC+AESE +.macro aesr data, key // @slothy:no-unfold=true + aese \data, \key + aesmc \data, \data +.endm + +.macro aesr_0_8 data, key + aesr \data\().16b, \key\()0.16b + aesr \data\().16b, \key\()1.16b + aesr \data\().16b, \key\()2.16b + aesr \data\().16b, \key\()3.16b + aesr \data\().16b, \key\()4.16b + aesr \data\().16b, \key\()5.16b + aesr \data\().16b, \key\()6.16b + aesr \data\().16b, \key\()7.16b + aesr \data\().16b, \key\()8.16b +.endm + +.macro aesr_9_10 data, key + aesr \data\().16b, \key\()9.16b + aesr \data\().16b, \key\()10.16b +.endm + +.macro aesr_11_12 data, key + aesr \data\().16b, \key\()11.16b + aesr \data\().16b, \key\()12.16b +.endm + +// Destructs inA +.macro eor3 out, inA, inB, inC + eor \inA, \inA, \inB + 
eor \out, \inA, \inC +.endm + +.macro aesr_final aes_st, plain, out + aese \aes_st\().16b, rk11.16b + eor3 \out\().16b, \plain\().16b, rk12.16b, \aes_st\().16b +.endm + +.macro aes_full_block aes_st, input, output + next_ctr_init_aes \aes_st + aesr_0_8 \aes_st\(), rk + aesr_9_10 \aes_st\(), rk + aesr_final \aes_st, \input, \output +.endm + +.macro load_round_key i + ldr rk\i\()q, [key, #((\i)*16)] +.endm + +.macro load_round_keys + load_round_key 0 + load_round_key 1 + load_round_key 2 + load_round_key 3 + load_round_key 4 + load_round_key 5 + load_round_key 6 + load_round_key 7 + load_round_key 8 + load_round_key 9 + load_round_key 10 + load_round_key 11 + load_round_key 12 +.endm + +/********************************************************************/ +/* Loading of H-table (precomputed H-powers for GHASH) */ +/********************************************************************/ + +// This has to be synchronized with the H-table generation + +.macro load_h1 dst, dst_q + ldr \dst_q, [Htable] +.endm + +.macro load_h2 dst, dst_q + ldr \dst_q, [Htable, #32] +.endm + +.macro load_h3 dst, dst_q + ldr \dst_q, [Htable, #48] +.endm + +.macro load_h4 dst, dst_q + ldr \dst_q, [Htable, #80] +.endm + +.macro load_h12 dst, dst_q + ldr \dst_q, [Htable, #16] +.endm + +.macro load_h34 dst, dst_q + ldr \dst_q, [Htable, #64] +.endm + +.macro load_full_htable + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h12 Ht12, Ht12q + load_h34 Ht34, Ht34q +.endm + +.macro load_htable_12 + load_h1 Ht1, Ht1q + load_h2 Ht2, Ht2q + load_h12 Ht12, Ht12q +.endm + +.macro load_htable_34 + load_h3 Ht3, Ht3q + load_h4 Ht4, Ht4q + load_h34 Ht34, Ht34q +.endm + +/********************************************************************/ +/* Macros for GHASH udpate */ +/********************************************************************/ + +.macro ghash_init_0 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_mid.1q, ghash_tmp.1d, \Hk_mid\().1d +.endm + +.macro ghash_init_1 input, Hk, Hk_mid, tag + rev64 \input\().16b, \input\().16b + eor \input\().16b, \input\().16b, \tag\().16b + // Low product + pmull ghash_lo.1q, \input\().1d, \Hk\().1d + // High product + pmull2 ghash_hi.1q, \input\().2d, \Hk\().2d + // Middle product + ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d +.endm + +.macro ghash_acc_0 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + mov ghash_tmp_d, \input\().d[1] + eor ghash_tmp.8b, ghash_tmp.8b, \input\().8b + pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_acc_1 input, Hk, Hk_mid + rev64 \input\().16b, \input\().16b + + // Low product + pmull ghash_tmp.1q, \input\().1d, \Hk\().1d + eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b + // High product + pmull2 ghash_tmp.1q, \input\().2d, \Hk\().2d + eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b + // Middle product + 
ext ghash_tmp.16b, \input\().16b, \input\().16b, #8 + eor ghash_tmp.16b, ghash_tmp.16b, \input\().16b + pmull2 ghash_tmp.1q, ghash_tmp.2d, \Hk_mid\().2d + eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b +.endm + +.macro ghash_finalize tag + eor modulo_tmp0.16b, ghash_lo.16b, ghash_hi.16b + pmull modulo_tmp1.1q, ghash_hi.1d, ghash_mod.1d + ext ghash_hi.16b, ghash_hi.16b, ghash_hi.16b, #8 + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp0.16b + eor modulo_tmp1.16b, ghash_hi.16b, modulo_tmp1.16b + eor ghash_mid.16b, ghash_mid.16b, modulo_tmp1.16b + pmull ghash_hi.1q, ghash_mid.1d, ghash_mod.1d + eor ghash_lo.16b, ghash_lo.16b, ghash_hi.16b + ext ghash_mid.16b, ghash_mid.16b, ghash_mid.16b, #8 + eor \tag\().16b, ghash_lo.16b, ghash_mid.16b + ext \tag\().16b, \tag\().16b, \tag\().16b, #8 +.endm + +.macro load_tag + ldr tag_q, [tag_ptr] + rev64 tag.16b, tag.16b +.endm + +.macro prepare_ghash + // Prepare constant for modular reduction + movi ghash_mod.8b, #0xc2 + shl ghash_mod_d, ghash_mod_d, #56 +.endm + +/********************************************************************/ +/* Core routine */ +/********************************************************************/ + +.align 4 +_aes_gcm_enc_kernel_slothy_base_192: +aes_gcm_enc_kernel_slothy_base_192: +#ifdef BORINGSSL_DISPATCH_TEST + adrp x9,_BORINGSSL_function_hit@PAGE + add x9, x9, _BORINGSSL_function_hit@PAGEOFF + mov w10, #1 + strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel +#endif + + AARCH64_VALID_CALL_TARGET + sub sp, sp, #STACK_SIZE + +Lenc_preamble_start: + save_gprs + save_vregs + + lsr byte_len, len_bits, #3 + + load_round_keys + load_tag + load_iv + + prepare_loop_counts + prepare_ghash + +Lenc_preamble_end: + + cbz count, Lloop_unrolled_end +Lloop_unrolled_start: + + ldr plain_q, [input], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 res, Ht4, Ht34, tag + + ldr plain_q, [input, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] + + ghash_acc_0 res, Ht3, Ht34 + + ldr plain_q, [input, #(-2*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 res, Ht2, Ht12 + + ldr plain_q, [input, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] + + ghash_acc_0 res, Ht1, Ht12 + + ghash_finalize tag + + sub count, count, #1 + cbnz count, Lloop_unrolled_start +Lloop_unrolled_end: + + load_htable_12 + + cbz remainder, Lloop_1x_end +Lloop_1x_start: + + ldr plain_q, [input], #16 + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 res, Ht1, Ht12, tag + + ghash_finalize tag + + sub remainder, remainder, #1 + cbnz remainder, Lloop_1x_start +Lloop_1x_end: + + // Return number of bytes processed + mov x0, byte_len + // Store new authentication tag + rev64 tag.16b, tag.16b + str tag_q, [tag_ptr] + // Store updated counter + rev32 rtmp_ctr.16b, rtmp_ctr.16b + str rtmp_ctr_q, [ivec] + + restore_vregs + restore_gprs + +Lenc_postamble_end: + add sp, sp, #STACK_SIZE + + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__APPLE__) +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-192.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-192.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-192_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_basic.S similarity index 78% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_basic.S index 1a67fb97175..14274282d61 100644 --- a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_dual_acc.S +++ b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_basic.S @@ -74,17 +74,11 @@ full_blocks .req x7 remainder .req x9 unroll .req x10 -aes_st0 .req v0 -aes_st0_q .req q0 +aes_st .req v0 +aes_st_q .req q0 -aes_st1 .req v2 -aes_st1_q .req q2 - -res0 .req v0 -res0_q .req q0 - -res1 .req v2 -res1_q .req q2 +res .req v0 +res_q .req q0 ghash_hi .req v9 ghash_lo .req v8 @@ -116,22 +110,6 @@ Ht3 .req Ht1 Ht4 .req Ht2 Ht34 .req Ht12 -Ht5q .req Ht1q -Ht6q .req Ht2q -Ht56q .req Ht12q - -Ht5 .req Ht1 -Ht6 .req Ht2 -Ht56 .req Ht12 - -Ht7q .req Ht1q -Ht8q .req Ht2q -Ht78q .req Ht12q - -Ht7 .req Ht1 -Ht8 .req Ht2 -Ht78 .req Ht12 - rk0q .req q18 rk1q .req q19 rk2q .req q20 @@ -147,7 +125,7 @@ rk10q .req q28 rk11q .req q15 rk12q .req q16 rk13q .req q17 -rk14q .req q1 +rk14q .req q2 rk0 .req v18 rk1 .req v19 @@ -164,7 +142,7 @@ rk10 .req v28 rk11 .req v15 rk12 .req v16 rk13 .req v17 -rk14 .req v1 +rk14 .req v2 plain .req v29 plain_q .req q29 @@ -221,10 +199,14 @@ tag_q .req q11 ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +// Derive number of iterations of unrolled loop and single-block loop .macro prepare_loop_counts mov unroll, #UNROLL + // Number of AES Blocks (16b each) lsr full_blocks, byte_len, #4 + // Number of iterations of the unrolled loop udiv count, full_blocks, unroll + // Number of iterations for the tail loop handling 1 block each msub remainder, count, unroll, full_blocks .endm @@ -235,17 +217,20 @@ tag_q .req q11 .macro load_iv ldr rtmp_ctr_q, [ivec] - mov constant_temp, #0x100000000 // set up counter increment + // set up counter increment + mov constant_temp, #0x100000000 movi rctr_inc.16b, #0x0 fmov rctr_inc.d[1], 
constant_temp rev32 rtmp_ctr.16b, rtmp_ctr.16b .endm +// Increase AES counter .macro aes_ctr_inc add rtmp_ctr.4s, rtmp_ctr.4s, rctr_inc.4s .endm +// Increase AES counter and initialize new AES state .macro next_ctr_init_aes aes_st rev32 \aes_st\().16b, rtmp_ctr.16b aes_ctr_inc @@ -295,7 +280,7 @@ tag_q .req q11 next_ctr_init_aes \aes_st aesr_0_8 \aes_st\(), rk aesr_9_10 \aes_st\(), rk - aesr_11_12 \aes_st\(), rk + aesr_11_12 \aes_st\(), rk aesr_final \aes_st, \input, \output .endm @@ -343,22 +328,6 @@ tag_q .req q11 ldr \dst_q, [Htable, #80] .endm -.macro load_h5 dst, dst_q - ldr \dst_q, [Htable, #96] -.endm - -.macro load_h6 dst, dst_q - ldr \dst_q, [Htable, #128] -.endm - -.macro load_h7 dst, dst_q - ldr \dst_q, [Htable, #144] -.endm - -.macro load_h8 dst, dst_q - ldr \dst_q, [Htable, #176] -.endm - .macro load_h12 dst, dst_q ldr \dst_q, [Htable, #16] .endm @@ -367,24 +336,13 @@ tag_q .req q11 ldr \dst_q, [Htable, #64] .endm -.macro load_h56 dst, dst_q - ldr \dst_q, [Htable, #112] -.endm - -.macro load_h78 dst, dst_q - ldr \dst_q, [Htable, #160] -.endm - .macro load_full_htable load_h1 Ht1, Ht1q load_h2 Ht2, Ht2q load_h3 Ht3, Ht3q load_h4 Ht4, Ht4q - load_h5 Ht5, Ht5q - load_h6 Ht6, Ht6q load_h12 Ht12, Ht12q load_h34 Ht34, Ht34q - load_h56 Ht56, Ht56q .endm .macro load_htable_12 @@ -399,68 +357,10 @@ tag_q .req q11 load_h34 Ht34, Ht34q .endm -.macro load_htable_56 - load_h5 Ht5, Ht5q - load_h6 Ht6, Ht6q - load_h56 Ht56, Ht56q -.endm - -.macro load_htable_78 - load_h7 Ht7, Ht7q - load_h8 Ht8, Ht8q - load_h78 Ht78, Ht78q -.endm - /********************************************************************/ /* Macros for GHASH udpate */ /********************************************************************/ -.macro ghash_init_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - eor \inputa\().16b, \inputa\().16b, tag.16b - - // Low product - pmull ghash_lo.1q, \inputa\().1d, \Ha\().1d - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_hi.1q, \inputa\().2d, \Ha\().2d - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 ghash_mid.1q, ghash_tmp.2d, \Hk_mid\().2d - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - -.macro ghash_acc_pair inputa, inputb, Ha, Hb, Hk_mid - rev64 \inputa\().16b, \inputa\().16b - rev64 \inputb\().16b, \inputb\().16b - - // Low product - pmull ghash_tmp.1q, \inputa\().1d, \Ha\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - pmull ghash_tmp.1q, \inputb\().1d, \Hb\().1d - eor ghash_lo.16b, ghash_lo.16b, ghash_tmp.16b - // High product - pmull2 ghash_tmp.1q, \inputa\().2d, \Ha\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - pmull2 ghash_tmp.1q, \inputb\().2d, \Hb\().2d - eor ghash_hi.16b, ghash_hi.16b, ghash_tmp.16b - // Middle product - trn1 ghash_tmp.2d, \inputb\().2d, \inputa\().2d - trn2 \inputb\().2d, \inputb\().2d, \inputa\().2d - eor ghash_tmp.16b, ghash_tmp.16b, \inputb\().16b - pmull2 \inputa\().1q, ghash_tmp.2d, \Hk_mid\().2d - eor ghash_mid.16b, ghash_mid.16b, \inputa\().16b - pmull ghash_tmp.1q, ghash_tmp.1d, \Hk_mid\().1d - eor ghash_mid.16b, ghash_mid.16b, ghash_tmp.16b -.endm - .macro ghash_init_0 input, Hk, Hk_mid, tag rev64 
\input\().16b, \input\().16b eor \input\().16b, \input\().16b, \tag\().16b @@ -557,7 +457,7 @@ aes_gcm_enc_kernel_slothy_base_256: strb w10, [x9,#2] // kFlag_aes_gcm_enc_kernel #endif - AARCH64_SIGN_LINK_REGISTER + AARCH64_VALID_CALL_TARGET sub sp, sp, #STACK_SIZE Lenc_preamble_start: @@ -578,29 +478,31 @@ Lenc_preamble_end: cbz count, Lloop_unrolled_end Lloop_unrolled_start: - load_round_key 14 - ldr plain_q, [input], #(4*16) - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #(4*16) + aes_full_block aes_st, plain, res + str res_q, [output], #(4*16) + + load_htable_34 + ghash_init_1 res, Ht4, Ht34, tag ldr plain_q, [input, #(-3*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-3*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-3*16)] - load_htable_34 - ghash_init_pair res0, res1, Ht4, Ht3, Ht34 + ghash_acc_0 res, Ht3, Ht34 ldr plain_q, [input, #(-2*16)] - aes_full_block aes_st0, plain, res0 - str res0_q, [output, #(-2*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-2*16)] + + load_htable_12 + ghash_acc_1 res, Ht2, Ht12 ldr plain_q, [input, #(-1*16)] - aes_full_block aes_st1, plain, res1 - str res1_q, [output, #(-1*16)] + aes_full_block aes_st, plain, res + str res_q, [output, #(-1*16)] - load_htable_12 - ghash_acc_pair res0, res1, Ht2, Ht1, Ht12 + ghash_acc_0 res, Ht1, Ht12 ghash_finalize tag @@ -613,12 +515,10 @@ Lloop_unrolled_end: cbz remainder, Lloop_1x_end Lloop_1x_start: - load_round_key 14 - ldr plain_q, [input], #16 - aes_full_block aes_st0, plain, res0 - str res0_q, [output], #16 - ghash_init_0 res0, Ht1, Ht12, tag + aes_full_block aes_st, plain, res + str res_q, [output], #16 + ghash_init_0 res, Ht1, Ht12, tag ghash_finalize tag @@ -641,7 +541,6 @@ Lloop_1x_end: Lenc_postamble_end: add sp, sp, #STACK_SIZE - AARCH64_VALIDATE_LINK_REGISTER ret #endif diff --git a/crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-256.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/aesv8-gcm-armv8-slothy-256.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/clean/aesv8-gcm-armv8-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/clean/enc/aesv8-gcm-armv8-enc-base-256_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_basic.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_basic.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_dual_acc_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_dual_acc_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_ilp.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_ilp.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_ilp.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_ilp.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable_rotate.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_keep_htable_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_keep_htable_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_full.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_full.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_full.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_full.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv.S rename 
to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv2_mem_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem2_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag.S similarity index 100% rename from 
crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_keep_htable_scalar_rk_v1.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_ilp_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_ilp_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_ilp_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_ilp_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_late_tag.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_late_tag.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_late_tag.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_late_tag.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk.S diff --git 
a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-128_x8_scalar_iv_mem_late_tag_scalar_rk_manual_rotate.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_basic.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-192_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_basic.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_basic.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_basic.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_basic.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_dual_acc.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_dual_acc.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_dual_acc.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_dual_acc.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_reload_round_keys_partial.S b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_reload_round_keys_partial.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_reload_round_keys_partial.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_reload_round_keys_partial.S diff --git a/crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S 
b/crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S similarity index 100% rename from crypto/fipsmodule/modes/asm/slothy/opt/aesv8-gcm-armv8-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S rename to crypto/fipsmodule/modes/asm/slothy/opt/enc/aesv8-gcm-armv8-enc-opt-256_x4_scalar_iv_mem_late_tag_scalar_rk.S diff --git a/crypto/fipsmodule/modes/asm/slothy/optimize.sh b/crypto/fipsmodule/modes/asm/slothy/optimize.sh index e41823c6a2d..cfbb6dfabae 100755 --- a/crypto/fipsmodule/modes/asm/slothy/optimize.sh +++ b/crypto/fipsmodule/modes/asm/slothy/optimize.sh @@ -24,6 +24,17 @@ if [ "$SZ" = "" ]; then echo "No keysize specified -- defaulting to 128 bit" fi +if [ "$ENC" = "" ]; then + echo "Environment variable ENC not set. Defaulting to ENC=1 (encryption)." + ENC=1 +fi + +if [ "$ENC" = "1" ]; then + ENCDEC="enc" +else + ENCDEC="dec" +fi + if [ "$AWS_LC_BASE" = "" ]; then # Oof... bit gross AWS_LC_BASE=$(dirname $(dirname $(dirname $(dirname $(dirname $(pwd)))))) @@ -32,9 +43,9 @@ fi BUILD_DIR=build_release -CLEAN_STEM=aesv8-gcm-armv8-base-${SZ} -OPT_STEM=aesv8-gcm-armv8-opt-${SZ} -TMP_STEM=aesv8-gcm-armv8-tmp-${SZ} +CLEAN_STEM=aesv8-gcm-armv8-${ENCDEC}-base-${SZ} +OPT_STEM=aesv8-gcm-armv8-${ENCDEC}-opt-${SZ} +TMP_STEM=aesv8-gcm-armv8-${ENCDEC}-tmp-${SZ} UARCH=${UARCH:=N1} if [ $UARCH = "N1" ]; then @@ -46,46 +57,23 @@ else exit 1 fi -if [ $SZ = "128" ]; then - VARIANTS_ALL=" - x4_basic - x4_late_tag - x4_ilp - x4_dual_acc - x4_dual_acc_keep_htable - x4_keep_htable - x4_keep_htable_rotate - x4_reload_round_keys_partial - x4_reload_round_keys_full - x4_scalar_iv - x4_scalar_iv_mem - x4_scalar_iv_mem_late_tag - x4_scalar_iv_mem_late_tag_keep_htable - x6_basic - x8_basic - x6_ilp - x8_ilp - x8_ilp_dual_acc - x8_ilp_rotate - x8_ilp_rotate_dual_acc - x8_ilp_rotate_manual_eor3 - x8_reload - x8_reload_ldp_stp - x8_reload_ldp_stp_dual_acc - x8_reload_ldp_stp_simpler - x8_reload_ldp_stp_simpler_manual_rotate - " -elif [ $SZ = "192" ]; then - VARIANTS_ALL=" - x4_basic - x4_reload_round_keys_partial - " -else - VARIANTS_ALL=" - x4_basic - x4_reload_round_keys_partial - " -fi +list_variants() { + SZ=$1 + UNROLL=$2 + DIR=$3 + VARIANTS=$((ls -1 ./${DIR}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ') 2>/dev/null || echo "") + echo $VARIANTS +} + +VARIANTS_ALL="" +for UNROLL in x4 x6 x8 +do + for V in $(list_variants $SZ $UNROLL "clean/${ENCDEC}"); + do + VARIANTS_ALL="$VARIANTS_ALL + ${UNROLL}_$V" + done +done VERBOSE=${VERBOSE:=0} TIMEOUT=${TIMEOUT:=1200} # 20min timeout by default @@ -181,8 +169,8 @@ optimize_generic() { optimize_variant() { echo "Optimizing variant $1 ..." - INFILE=$CLEAN_DIR/${CLEAN_STEM}_$1.S - OUTFILE=$OPT_DIR/${OPT_STEM}_$1.S + INFILE=$CLEAN_DIR/${ENCDEC}/${CLEAN_STEM}_$1.S + OUTFILE=$OPT_DIR/${ENCDEC}/${OPT_STEM}_$1.S TMP0=$TMP_DIR/${TMP_STEM}_$1_0.S TMP1=$TMP_DIR/${TMP_STEM}_$1_1.S TMP2=$TMP_DIR/${TMP_STEM}_$1_2.S diff --git a/crypto/fipsmodule/modes/asm/slothy/stats.sh b/crypto/fipsmodule/modes/asm/slothy/stats.sh index b283b6d26fb..27b2b5b5d98 100755 --- a/crypto/fipsmodule/modes/asm/slothy/stats.sh +++ b/crypto/fipsmodule/modes/asm/slothy/stats.sh @@ -36,13 +36,25 @@ if [ "$UNROLL_ARG" = "" ]; then echo "No unrolling level specified UNROLL_ARG -- defaulting to ${UNROLL_ARG}" fi +if [ "$ENC" = "" ]; then + echo "Environment variable ENC not set. Defaulting to ENC=1 (encryption)." 
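+    # ENC=1 selects the encryption kernels; any other value selects the decryption kernels.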
+ ENC=1 +fi + +if [ "$ENC" = "1" ]; then + ENCDEC="enc" +else + ENCDEC="dec" +fi + + LOOP_LABEL="Lloop_unrolled_start:" list_variants() { SZ=$1 UNROLL=$2 DIR=$3 - VARIANTS=$(ls -1 ./${DIR}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ' ) + VARIANTS=$(ls -1 ./${DIR}/${ENCDEC}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ' ) echo $VARIANTS } @@ -53,7 +65,7 @@ get_benchmark_for() { UNROLL=$2 DIR=$3 VARIANT=$4 - ID="${DIR}/${SZ}_${UNROLL}_${VARIANT}" + ID="${DIR}/${ENCDEC}/${SZ}_${UNROLL}_${VARIANT}" cat $BENCHMARKS \ | grep "Testing variant: ${ID}" -A 10 \ | grep "MB/s" \ @@ -78,8 +90,8 @@ get_slothy_stats_for() { else TY="base" fi - FILEBASE="aesv8-gcm-armv8" - FILE="${DIR}/${FILEBASE}-${TY}-${SZ}_${UNROLL}_${VARIANT}.S" + FILEBASE="aesv8-gcm-armv8-enc" + FILE="${DIR}/${ENCDEC}/${FILEBASE}-${TY}-${SZ}_${UNROLL}_${VARIANT}.S" cat $FILE \ | grep "${LOOP_LABEL}" -A 10 \ | sed -n 's/[^0-9]*\([0-9][0-9.]*\).*/\1/p' \ @@ -91,7 +103,7 @@ get_stats_for() { UNROLL=$2 DIR=$3 VARIANT=$4 - ID="${DIR}/${SZ}_${UNROLL}_${VARIANT}" + ID="${DIR}/${ENCDEC}/${SZ}_${UNROLL}_${VARIANT}" BENCH=$(get_benchmark_for $1 $2 $3 $4) if [ "$DIR" = "opt" ]; then SLOTHY=$(get_slothy_stats_for $1 $2 $3 $4) diff --git a/crypto/fipsmodule/modes/asm/slothy/test.sh b/crypto/fipsmodule/modes/asm/slothy/test.sh index fa5408e9f14..fce132b022a 100755 --- a/crypto/fipsmodule/modes/asm/slothy/test.sh +++ b/crypto/fipsmodule/modes/asm/slothy/test.sh @@ -3,7 +3,7 @@ # Build and test AES-GCM variants # # Usage: -# > [BENCH=0/1] [AWS_LC_BASE=PATH] [BUILD_DIR=DIRNAME] [VERBOSE=0/1] [OPT=0/1] test.sh [variant] +# > [ENC=0/1] [BENCH=0/1] [AWS_LC_BASE=PATH] [BUILD_DIR=DIRNAME] [VERBOSE=0/1] [OPT=0/1] test.sh [variant] # # This script tests that the assembly files in clean/ or opt/ can be used as drop-in # replacements for the default aesv8-gcm-armv8-base-{128,192,256} @@ -34,6 +34,17 @@ else OPT_STR="opt" fi +if [ "$ENC" = "" ]; then + echo "Environment variable ENC not set. Defaulting to ENC=1 (encryption)." + ENC=1 +fi + +if [ "$ENC" = "1" ]; then + ENCDEC="enc" +else + ENCDEC="dec" +fi + if [ "$VERBOSE" = "" ]; then VERBOSE=0 echo "Environment variable VERBOSE not set. Defaulting to VERBOSE=0 (silent mode)." @@ -48,16 +59,17 @@ TIMEOUT=5 # Run tests for 5 seconds -- they often hang upon a bug KEEP_GOING=${KEEP_GOING:=0} ASM_DIR=../ -AES_SLOTHY_ASM=aesv8-gcm-armv8-slothy-${SZ}.S if [ "$OPT" = "0" ]; then - DIR=./clean - FILE_STEM=aesv8-gcm-armv8-base-${SZ} + DIR=./clean/${ENCDEC} + FILE_STEM=aesv8-gcm-armv8-${ENCDEC}-base-${SZ} else - DIR=./opt - FILE_STEM=aesv8-gcm-armv8-opt-${SZ} + DIR=./opt/${ENCDEC} + FILE_STEM=aesv8-gcm-armv8-${ENCDEC}-opt-${SZ} fi +AES_SLOTHY_ASM=aesv8-gcm-armv8-${ENCDEC}-slothy-${SZ}.S + set_variant() { cp $DIR/${FILE_STEM}_$1.S $ASM_DIR/$AES_SLOTHY_ASM } @@ -84,7 +96,7 @@ bench_variant() { } do_variant() { - echo "* Testing variant: ${OPT_STR}/${SZ}_$1" + echo "* Testing variant: ${OPT_STR}/${ENCDEC}/${SZ}_$1" printf " - Copy... 
" set_variant $1 printf "OK!\n" @@ -115,50 +127,26 @@ do_variant() { fi } +list_variants() { + SZ=$1 + UNROLL=$2 + DIR=$3 + VARIANTS=$((ls -1 ./${DIR}/*${SZ}*${UNROLL}*.S | sed -n 's/.*'"${UNROLL}"'_\(.*\)\.S/\1/p' | tr '\n' ' ') 2>/dev/null || echo "") + echo $VARIANTS +} -if [ $SZ = "128" ]; then - VARIANTS=" - x4_basic - x4_late_tag - x4_ilp - x4_dual_acc - x4_dual_acc_keep_htable - x4_keep_htable - x4_keep_htable_rotate - x4_reload_round_keys_partial - x4_reload_round_keys_full - x4_scalar_iv - x4_scalar_iv_mem - x4_scalar_iv_mem_late_tag - x4_scalar_iv_mem_late_tag_keep_htable - x6_basic - x8_basic - x6_ilp - x8_ilp - x8_ilp_dual_acc - x8_ilp_rotate - x8_ilp_rotate_dual_acc - x8_ilp_rotate_manual_eor3 - x8_reload - x8_reload_ldp_stp - x8_reload_ldp_stp_dual_acc - x8_reload_ldp_stp_simpler - x8_reload_ldp_stp_simpler_manual_rotate - " -elif [ $SZ = "192" ]; then - VARIANTS=" - x4_basic - x4_reload_round_keys_partial - " -else - VARIANTS=" - x4_basic - x4_reload_round_keys_partial - " -fi +VARIANTS="" +for UNROLL in x4 x6 x8 +do + for V in $(list_variants $SZ $UNROLL $DIR); + do + VARIANTS="$VARIANTS + ${UNROLL}_$V" + done +done if [ "$1" = "--help" ]; then - echo "Usage: [VERBOSE=0/1] [OPT=0/1] test.sh [variant]" + echo "Usage: [ENC=0/1] [BENCH=0/1] [AWS_LC_BASE=PATH] [BUILD_DIR=DIRNAME] [VERBOSE=0/1] [OPT=0/1] test.sh [variant]" echo "Valid values for 'variant' are:" for var in $VARIANTS; do echo "* $var" diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c index 64717bf4cfe..756aa995633 100644 --- a/crypto/fipsmodule/modes/gcm.c +++ b/crypto/fipsmodule/modes/gcm.c @@ -161,9 +161,9 @@ static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, // in the case of the EVP API. // In the case of the AEAD API, it can be used for all input lengths // but we are not identifying which API calls the code below. - #define USE_SLOTHY_AES_GCM_128 1 + #define USE_SLOTHY_AES_GCM_ENC_128 -#if defined(USE_SLOTHY_AES_GCM_128) +#if defined(USE_SLOTHY_AES_GCM_ENC_128) if (key->rounds == 10) { aes_gcm_enc_kernel_slothy_base_128(in, len_blocks * 8, out, Xi, ivec, key, Htable); } @@ -212,6 +212,20 @@ static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, // in the case of the EVP API. // In the case of the AEAD API, it can be used for all input lengths // but we are not identifying which API calls the code below. 
+ #define USE_SLOTHY_AES_GCM_DEC_128 + +#if defined(USE_SLOTHY_AES_GCM_DEC_128) + if (key->rounds == 10) { + aes_gcm_dec_kernel_slothy_base_128(in, len_blocks * 8, out, Xi, ivec, key, Htable); + } + else if (key->rounds == 12) { + aes_gcm_dec_kernel_slothy_base_192(in, len_blocks * 8, out, Xi, ivec, key, Htable); + } + else if (key->rounds == 14) { + aes_gcm_dec_kernel_slothy_base_256(in, len_blocks * 8, out, Xi, ivec, key, Htable); + } + else +#endif if (CRYPTO_is_ARMv8_GCM_8x_capable() && len >= 256) { switch(key->rounds) { case 10: diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h index 0f91fc335db..2380376d072 100644 --- a/crypto/fipsmodule/modes/internal.h +++ b/crypto/fipsmodule/modes/internal.h @@ -371,6 +371,19 @@ void aes_gcm_enc_kernel_slothy_base_256(const uint8_t *in, uint64_t in_bits, voi void *Xi, uint8_t *ivec, const AES_KEY *key, const u128 Htable[16]); +void aes_gcm_dec_kernel_slothy_base_128(const uint8_t *in, uint64_t in_bits, void *out, + void *Xi, uint8_t *ivec, const AES_KEY *key, + const u128 Htable[16]); + +void aes_gcm_dec_kernel_slothy_base_192(const uint8_t *in, uint64_t in_bits, void *out, + void *Xi, uint8_t *ivec, const AES_KEY *key, + const u128 Htable[16]); + +void aes_gcm_dec_kernel_slothy_base_256(const uint8_t *in, uint64_t in_bits, void *out, + void *Xi, uint8_t *ivec, const AES_KEY *key, + const u128 Htable[16]); + + // These functions are defined in aesv8-gcm-armv8-unroll8.pl. // They take input length in BITS and return number of BYTES processed. size_t aesv8_gcm_8x_enc_128(const uint8_t *in, size_t bit_len, uint8_t *out,