diff --git a/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S b/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S
index 2bc4d332c9..b3588047d8 100644
--- a/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S
+++ b/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S
@@ -312,17 +312,13 @@ _aesv8_gcm_8x_enc_128:
 L128_enc_main_loop: //main loop start
  rev32 v5.16b, v30.16b //CTR block 8k+13
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  add v30.4s, v30.4s, v31.4s //CTR block 8k+13
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  rev64 v8.16b, v8.16b //GHASH block 8k
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev32 v6.16b, v30.16b //CTR block 8k+14
  add v30.4s, v30.4s, v31.4s //CTR block 8k+14
@@ -353,9 +349,7 @@ L128_enc_main_loop: //main loop start
  eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h3l | h3h
- ext v25.16b, v25.16b, v25.16b, #8
  aese v5.16b, v26.16b
  aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
@@ -450,9 +444,7 @@ L128_enc_main_loop: //main loop start
  aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h1l | h1h
- ext v22.16b, v22.16b, v22.16b, #8
  pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
  pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
@@ -666,15 +658,11 @@ L128_enc_main_loop: //main loop start
 L128_enc_prepretail: //PREPRETAIL
  rev32 v5.16b, v30.16b //CTR block 8k+13
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  rev64 v8.16b, v8.16b //GHASH block 8k
  rev64 v9.16b, v9.16b //GHASH block 8k+1
@@ -752,9 +740,7 @@ L128_enc_prepretail: //PREPRETAIL
  pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  ldp q28, q26, [x11, #32] //load rk2, rk3
  aese v5.16b, v27.16b
@@ -806,9 +792,7 @@ L128_enc_prepretail: //PREPRETAIL
  ldp q27, q28, [x11, #64] //load rk4, rk5
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h1l | h1h
- ext v22.16b, v22.16b, v22.16b, #8
  trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
  aese v0.16b, v26.16b
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
@@ -986,16 +970,12 @@ L128_enc_tail: //TAIL
  mov v29.16b, v27.16b
  ldp q20, q21, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  .long 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
  ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
  ldp q22, q23, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
- ext v23.16b, v23.16b, v23.16b, #8
  ldp q24, q25, [x6, #160] //load h8k | h7k
- ext v25.16b, v25.16b, v25.16b, #8
  cmp x5, #112
  b.gt L128_enc_blocks_more_than_7
@@ -1162,7 +1142,6 @@ L128_enc_blocks_more_than_3: //blocks left > 3
  st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v8.16b, v9.16b //GHASH final-3 block
@@ -1199,7 +1178,6 @@ L128_enc_blocks_more_than_2: //blocks left > 2
  ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  movi v16.8b, #0 //supress further partial tag feed in
  eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
@@ -1219,7 +1197,6 @@ L128_enc_blocks_more_than_1: //blocks left > 1
  st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  rev64 v8.16b, v9.16b //GHASH final-1 block
  ldr q9, [x0], #16 //AES final block - load plaintext
@@ -1281,7 +1258,6 @@ L128_enc_blocks_less_than_1: //blocks left <= 1
  eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
@@ -1619,9 +1595,7 @@ _aesv8_gcm_8x_dec_128:
 L128_dec_main_loop: //main loop start
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  rev64 v8.16b, v8.16b //GHASH block 8k
@@ -1629,9 +1603,7 @@ L128_dec_main_loop: //main loop start
  rev64 v14.16b, v14.16b //GHASH block 8k+6
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v8.16b, v8.16b, v19.16b //PRE 1
  rev32 v5.16b, v30.16b //CTR block 8k+13
@@ -1697,9 +1669,7 @@ L128_dec_main_loop: //main loop start
  pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
  aese v6.16b, v27.16b
  aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
@@ -1727,9 +1697,7 @@ L128_dec_main_loop: //main loop start
  aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
  eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
  aese v1.16b, v28.16b
@@ -1977,17 +1945,13 @@ L128_dec_prepretail: //PREPRETAIL
  ldp q26, q27, [x11, #0] //load rk0, rk1
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  eor v8.16b, v8.16b, v19.16b //PRE 1
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  add v30.4s, v30.4s, v31.4s //CTR block 8k+13
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  rev64 v13.16b, v13.16b //GHASH block 8k+5
  rev64 v12.16b, v12.16b //GHASH block 8k+4
@@ -2060,9 +2024,7 @@ L128_dec_prepretail: //PREPRETAIL
  pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  aese v1.16b, v27.16b
  aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
  pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
@@ -2075,9 +2037,7 @@ L128_dec_prepretail: //PREPRETAIL
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  .long 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
  aese v0.16b, v28.16b
@@ -2283,16 +2243,12 @@ L128_dec_tail: //TAIL
  cmp x5, #112
  ldp q24, q25, [x6, #160] //load h8k | h7k
- ext v25.16b, v25.16b, v25.16b, #8
  ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
  ldp q20, q21, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
  ldp q22, q23, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
- ext v23.16b, v23.16b, v23.16b, #8
  .long 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
  b.gt L128_dec_blocks_more_than_7
@@ -2461,7 +2417,6 @@ L128_dec_blocks_more_than_3: //blocks left > 3
  ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  ldr q24, [x6, #64] //load h4k | h3k
  eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
@@ -2488,7 +2443,6 @@ L128_dec_blocks_more_than_2: //blocks left > 2
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  movi v16.8b, #0 //supress further partial tag feed in
  ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
@@ -2513,7 +2467,6 @@ L128_dec_blocks_more_than_1: //blocks left > 1
  rev64 v8.16b, v9.16b //GHASH final-1 block
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
@@ -2560,7 +2513,6 @@ L128_dec_blocks_less_than_1: //blocks left <= 1
  mov v0.d[0], x13 //ctr0b is mask for last block
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
  and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
@@ -2963,16 +2915,12 @@ L192_enc_main_loop: //main loop start
  rev32 v5.16b, v30.16b //CTR block 8k+13
  add v30.4s, v30.4s, v31.4s //CTR block 8k+13
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
  rev64 v8.16b, v8.16b //GHASH block 8k
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  rev32 v6.16b, v30.16b //CTR block 8k+14
@@ -3074,9 +3022,7 @@ L192_enc_main_loop: //main loop start
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
  eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
  pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
@@ -3122,9 +3068,7 @@ L192_enc_main_loop: //main loop start
  aese v4.16b, v28.16b
  aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  ldp q26, q27, [x11, #96] //load rk6, rk7
  aese v2.16b, v28.16b
@@ -3351,9 +3295,7 @@ L192_enc_prepretail: //PREPRETAIL
  add v30.4s, v30.4s, v31.4s //CTR block 8k+13
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v8.16b, v8.16b //GHASH block 8k
  ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
@@ -3365,9 +3307,7 @@ L192_enc_prepretail: //PREPRETAIL
  rev64 v11.16b, v11.16b //GHASH block 8k+3
  rev64 v10.16b, v10.16b //GHASH block 8k+2
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v8.16b, v8.16b, v19.16b //PRE 1
  rev32 v7.16b, v30.16b //CTR block 8k+15
@@ -3475,17 +3415,13 @@ L192_enc_prepretail: //PREPRETAIL
  aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  aese v3.16b, v26.16b
  aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
  pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v4.16b, v26.16b
  aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
  rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
@@ -3695,19 +3631,15 @@ L192_enc_prepretail: //PREPRETAIL
 L192_enc_tail: //TAIL
  ldp q20, q21, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
  ldr q8, [x0], #16 //AES block 8k+8 - l3ad plaintext
  ldp q24, q25, [x6, #160] //load h8k | h7k
- ext v25.16b, v25.16b, v25.16b, #8
  mov v29.16b, v26.16b
  ldp q22, q23, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
- ext v23.16b, v23.16b, v23.16b, #8
  cmp x5, #112
  .long 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
@@ -3883,7 +3815,6 @@ L192_enc_blocks_more_than_3: //blocks left > 3
  ldr q9, [x0], #16 //AES final-2 block - load plaintext
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
@@ -3906,7 +3837,6 @@ L192_enc_blocks_more_than_2: //blocks left > 2
  rev64 v8.16b, v9.16b //GHASH final-2 block
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
@@ -3929,7 +3859,6 @@ L192_enc_blocks_more_than_2: //blocks left > 2
 L192_enc_blocks_more_than_1: //blocks left > 1
  ldr q22, [x6, #32] //load h1l | h1h
- ext v22.16b, v22.16b, v22.16b, #8
  st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
  rev64 v8.16b, v9.16b //GHASH final-1 block
@@ -3975,7 +3904,6 @@ L192_enc_blocks_less_than_1: //blocks left <= 1
  mov v0.d[1], x14
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
  mov v0.d[0], x13 //ctr0b is mask for last block
@@ -4380,9 +4308,7 @@ L192_dec_main_loop: //main loop start
  add v30.4s, v30.4s, v31.4s //CTR block 8k+13
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v12.16b, v12.16b //GHASH block 8k+4
  rev64 v11.16b, v11.16b //GHASH block 8k+3
@@ -4420,9 +4346,7 @@ L192_dec_main_loop: //main loop start
  aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
  pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v0.16b, v27.16b
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
@@ -4477,9 +4401,7 @@ L192_dec_main_loop: //main loop start
  aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  aese v5.16b, v28.16b
  aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
  aese v2.16b, v26.16b
@@ -4538,9 +4460,7 @@ L192_dec_main_loop: //main loop start
  aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v3.16b, v28.16b
  aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
  aese v5.16b, v28.16b
@@ -4765,9 +4685,7 @@ L192_dec_prepretail: //PREPRETAIL
  add v30.4s, v30.4s, v31.4s //CTR block 8k+13
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v8.16b, v8.16b //GHASH block 8k
  ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
@@ -4780,9 +4698,7 @@ L192_dec_prepretail: //PREPRETAIL
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  rev32 v7.16b, v30.16b //CTR block 8k+15
  aese v0.16b, v26.16b
@@ -4889,16 +4805,12 @@ L192_dec_prepretail: //PREPRETAIL
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
  pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
  aese v2.16b, v26.16b
  aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
@@ -5111,17 +5023,13 @@ L192_dec_tail: //TAIL
  sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
  ldp q20, q21, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
  ldp q24, q25, [x6, #160] //load h8k | h7k
- ext v25.16b, v25.16b, v25.16b, #8
  mov v29.16b, v26.16b
  ldp q22, q23, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
- ext v23.16b, v23.16b, v23.16b, #8
  ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
  .long 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
@@ -5283,7 +5191,6 @@ L192_dec_blocks_more_than_4: //blocks left > 4
 L192_dec_blocks_more_than_3: //blocks left > 3
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v8.16b, v9.16b //GHASH final-3 block
  ldr q9, [x0], #16 //AES final-2 block - load ciphertext
@@ -5312,7 +5219,6 @@ L192_dec_blocks_more_than_2: //blocks left > 2
  rev64 v8.16b, v9.16b //GHASH final-2 block
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
@@ -5339,7 +5245,6 @@ L192_dec_blocks_more_than_1: //blocks left > 1
  rev64 v8.16b, v9.16b //GHASH final-1 block
  ldr q9, [x0], #16 //AES final block - load ciphertext
  ldr q22, [x6, #32] //load h1l | h1h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
  movi v16.8b, #0 //supress further partial tag feed in
@@ -5383,7 +5288,6 @@ L192_dec_blocks_less_than_1: //blocks left <= 1
  csel x13, x8, x7, lt
  csel x14, x7, xzr, lt
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  mov v0.d[1], x14
  ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
@@ -5825,9 +5729,7 @@ L256_enc_main_loop: //main loop start
  rev64 v11.16b, v11.16b //GHASH block 8k+3
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  rev32 v6.16b, v30.16b //CTR block 8k+14
@@ -5837,9 +5739,7 @@ L256_enc_main_loop: //main loop start
  rev64 v12.16b, v12.16b //GHASH block 8k+4
  ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  aese v3.16b, v26.16b
  aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
@@ -5940,9 +5840,7 @@ L256_enc_main_loop: //main loop start
  pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
  eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
  .long 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
@@ -6026,9 +5924,7 @@ L256_enc_main_loop: //main loop start
  aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v2.16b, v27.16b
  aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
  .long 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
@@ -6290,16 +6186,12 @@ L256_enc_prepretail: //PREPRETAIL
  aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  aese v2.16b, v27.16b
  aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v0.16b, v27.16b
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
  aese v5.16b, v27.16b
@@ -6415,9 +6307,7 @@ L256_enc_prepretail: //PREPRETAIL
  ldp q26, q27, [x11, #96] //load rk6, rk7
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
  pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
@@ -6459,9 +6349,7 @@ L256_enc_prepretail: //PREPRETAIL
  pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
  pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  ldp q28, q26, [x11, #128] //load rk8, rk9
  aese v1.16b, v27.16b
@@ -6635,18 +6523,14 @@ L256_enc_prepretail: //PREPRETAIL
 L256_enc_tail: //TAIL
  ldp q24, q25, [x6, #160] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
  ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
  ldp q20, q21, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
  ldp q22, q23, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
- ext v23.16b, v23.16b, v23.16b, #8
  mov v29.16b, v28.16b
  cmp x5, #112
@@ -6816,7 +6700,6 @@ L256_enc_blocks_more_than_3: //blocks left > 3
  st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v8.16b, v9.16b //GHASH final-3 block
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
@@ -6842,7 +6725,6 @@ L256_enc_blocks_more_than_3: //blocks left > 3
 L256_enc_blocks_more_than_2: //blocks left > 2
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
@@ -6872,7 +6754,6 @@ L256_enc_blocks_more_than_1: //blocks left > 1
  st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  rev64 v8.16b, v9.16b //GHASH final-1 block
  ldr q9, [x0], #16 //AES final block - load plaintext
@@ -6916,7 +6797,6 @@ L256_enc_blocks_less_than_1: //blocks left <= 1
  mov v0.d[0], x13 //ctr0b is mask for last block
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
  mov v0.d[1], x14
@@ -7355,9 +7235,7 @@ L256_dec_main_loop: //main loop start
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev32 v6.16b, v30.16b //CTR block 8k+14
  add v30.4s, v30.4s, v31.4s //CTR block 8k+14
@@ -7392,9 +7270,7 @@ L256_dec_main_loop: //main loop start
  eor v8.16b, v8.16b, v19.16b //PRE 1
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v6.16b, v27.16b
  aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
@@ -7547,9 +7423,7 @@ L256_dec_main_loop: //main loop start
  .long 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v14.16b, v14.16b //GHASH block 8k+6
  eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
@@ -7560,9 +7434,7 @@ L256_dec_main_loop: //main loop start
  ldp q28, q26, [x11, #128] //load rk8, rk9
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  .long 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
  aese v7.16b, v27.16b
  aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
@@ -7790,17 +7662,13 @@ L256_dec_prepretail: //PREPRETAIL
  ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
  ldr q23, [x6, #144] //load h7l | h7h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #176] //load h8l | h8h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v9.16b, v9.16b //GHASH block 8k+1
  rev32 v7.16b, v30.16b //CTR block 8k+15
  rev64 v10.16b, v10.16b //GHASH block 8k+2
  ldr q20, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v0.16b, v26.16b
  aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
@@ -7915,9 +7783,7 @@ L256_dec_prepretail: //PREPRETAIL
  .long 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  aese v7.16b, v27.16b
  aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
@@ -7959,9 +7825,7 @@ L256_dec_prepretail: //PREPRETAIL
  ldp q26, q27, [x11, #96] //load rk6, rk7
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v15.16b, v15.16b //GHASH block 8k+7
  rev64 v13.16b, v13.16b //GHASH block 8k+5
@@ -8173,16 +8037,12 @@ L256_dec_tail: //TAIL
  ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
  ldp q24, q25, [x6, #160] //load h8k | h7k
- ext v25.16b, v25.16b, v25.16b, #8
  mov v29.16b, v28.16b
  ldp q20, q21, [x6, #96] //load h5l | h5h
- ext v20.16b, v20.16b, v20.16b, #8
  .long 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
  ldp q22, q23, [x6, #128] //load h6l | h6h
- ext v22.16b, v22.16b, v22.16b, #8
- ext v23.16b, v23.16b, v23.16b, #8
  b.gt L256_dec_blocks_more_than_7
  mov v7.16b, v6.16b
@@ -8341,7 +8201,6 @@ L256_dec_blocks_more_than_4: //blocks left > 4
 L256_dec_blocks_more_than_3: //blocks left > 3
  ldr q25, [x6, #80] //load h4l | h4h
- ext v25.16b, v25.16b, v25.16b, #8
  rev64 v8.16b, v9.16b //GHASH final-3 block
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
@@ -8371,7 +8230,6 @@ L256_dec_blocks_more_than_2: //blocks left > 2
  rev64 v8.16b, v9.16b //GHASH final-2 block
  ldr q23, [x6, #48] //load h3l | h3h
- ext v23.16b, v23.16b, v23.16b, #8
  ldr q9, [x0], #16 //AES final-1 block - load ciphertext
  eor v8.16b, v8.16b, v16.16b //feed in partial tag
@@ -8399,7 +8257,6 @@ L256_dec_blocks_more_than_1: //blocks left > 1
  ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
  ldr q22, [x6, #32] //load h2l | h2h
- ext v22.16b, v22.16b, v22.16b, #8
  eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
  ldr q9, [x0], #16 //AES final block - load ciphertext
@@ -8447,7 +8304,6 @@ L256_dec_blocks_less_than_1: //blocks left <= 1
  and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
  ldr q20, [x6] //load h1l | h1h
- ext v20.16b, v20.16b, v20.16b, #8
  bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
  rev64 v8.16b, v9.16b //GHASH final block
diff --git a/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S b/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S
index aac8e5077d..a361edf733 100644
--- a/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S
+++ b/generated-src/ios-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S
@@ -83,7 +83,6 @@ _aes_gcm_enc_kernel:
  aese v1.16b, v19.16b
  aesmc v1.16b, v1.16b // AES block 1 - round 1
  ldr q14, [x6, #48] // load h3l | h3h
- ext v14.16b, v14.16b, v14.16b, #8
  aese v3.16b, v18.16b
  aesmc v3.16b, v3.16b // AES block 3 - round 0
  aese v2.16b, v19.16b
@@ -92,14 +91,12 @@ _aes_gcm_enc_kernel:
  aese v1.16b, v20.16b
  aesmc v1.16b, v1.16b // AES block 1 - round 2
  ldr q13, [x6, #32] // load h2l | h2h
- ext v13.16b, v13.16b, v13.16b, #8
  aese v3.16b, v19.16b
  aesmc v3.16b, v3.16b // AES block 3 - round 1
  ldr q30, [x8, #192] // load rk12
  aese v2.16b, v20.16b
  aesmc v2.16b, v2.16b // AES block 2 - round 2
  ldr q15, [x6, #80] // load h4l | h4h
- ext v15.16b, v15.16b, v15.16b, #8
  aese v1.16b, v21.16b
  aesmc v1.16b, v1.16b // AES block 1 - round 3
  ldr q29, [x8, #176] // load rk11
@@ -142,7 +139,6 @@ _aes_gcm_enc_kernel:
  aese v0.16b, v24.16b
  aesmc v0.16b, v0.16b // AES block 0 - round 6
  ldr q12, [x6] // load h1l | h1h
- ext v12.16b, v12.16b, v12.16b, #8
  aese v2.16b, v24.16b
  aesmc v2.16b, v2.16b // AES block 2 - round 6
  ldr q28, [x8, #160] // load rk10
@@ -843,15 +839,12 @@ _aes_gcm_dec_kernel:
  aese v0.16b, v18.16b
  aesmc v0.16b, v0.16b // AES block 0 - round 0
  ldr q14, [x6, #48] // load h3l | h3h
- ext v14.16b, v14.16b, v14.16b, #8
  aese v3.16b, v18.16b
  aesmc v3.16b, v3.16b // AES block 3 - round 0
  ldr q15, [x6, #80] // load h4l | h4h
- ext v15.16b, v15.16b, v15.16b, #8
  aese v1.16b, v18.16b
  aesmc v1.16b, v1.16b // AES block 1 - round 0
  ldr q13, [x6, #32] // load h2l | h2h
- ext v13.16b, v13.16b, v13.16b, #8
  aese v2.16b, v18.16b
  aesmc v2.16b, v2.16b // AES block 2 - round 0
  ldr q20, [x8, #32] // load rk2
@@ -871,7 +864,6 @@ _aes_gcm_dec_kernel:
  aese v0.16b, v20.16b
  aesmc v0.16b, v0.16b // AES block 0 - round 2
  ldr q12, [x6] // load h1l | h1h
- ext v12.16b, v12.16b, v12.16b, #8
  aese v2.16b, v20.16b
  aesmc v2.16b, v2.16b // AES block 2 - round 2
  ldr q28, [x8, #160] // load rk10
diff --git a/generated-src/ios-aarch64/crypto/fipsmodule/ghashv8-armx.S b/generated-src/ios-aarch64/crypto/fipsmodule/ghashv8-armx.S
index bcba8dd2a0..20ce8085ee 100644
--- a/generated-src/ios-aarch64/crypto/fipsmodule/ghashv8-armx.S
+++ b/generated-src/ios-aarch64/crypto/fipsmodule/ghashv8-armx.S
@@ -30,13 +30,14 @@ _gcm_init_v8:
  and v16.16b,v16.16b,v17.16b
  orr v3.16b,v3.16b,v18.16b //H<<<=1
  eor v20.16b,v3.16b,v16.16b //twisted H
+ ext v20.16b, v20.16b, v20.16b, #8
  st1 {v20.2d},[x0],#16 //store Htable[0]
 
- //calculate H^2
+ //calculate H^2
  ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
- pmull v0.1q,v20.1d,v20.1d
+ pmull2 v0.1q,v20.2d,v20.2d
  eor v16.16b,v16.16b,v20.16b
- pmull2 v2.1q,v20.2d,v20.2d
+ pmull v2.1q,v20.1d,v20.1d
  pmull v1.1q,v16.1d,v16.1d
 
  ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
@@ -52,17 +53,19 @@ _gcm_init_v8:
  ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
  pmull v0.1q,v0.1d,v19.1d
  eor v18.16b,v18.16b,v2.16b
- eor v22.16b,v0.16b,v18.16b
+ eor v17.16b,v0.16b,v18.16b
 
- ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ ext v22.16b,v17.16b,v17.16b,#8 //Karatsuba pre-processing
  eor v17.16b,v17.16b,v22.16b
  ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
- st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
+ st1 {v21.2d},[x0],#16 //store Htable[1..2]
+ st1 {v22.2d},[x0],#16 //store Htable[1..2]
+
  //calculate H^3 and H^4
- pmull v0.1q,v20.1d, v22.1d
- pmull v5.1q,v22.1d,v22.1d
- pmull2 v2.1q,v20.2d, v22.2d
- pmull2 v7.1q,v22.2d,v22.2d
+ pmull2 v0.1q,v20.2d, v22.2d
+ pmull2 v5.1q,v22.2d,v22.2d
+ pmull v2.1q,v20.1d, v22.1d
+ pmull v7.1q,v22.1d,v22.1d
  pmull v1.1q,v16.1d,v17.1d
  pmull v6.1q,v17.1d,v17.1d
@@ -91,11 +94,11 @@ _gcm_init_v8:
  eor v18.16b,v18.16b,v2.16b
  eor v4.16b,v4.16b,v7.16b
- eor v23.16b, v0.16b,v18.16b //H^3
- eor v25.16b,v5.16b,v4.16b //H^4
+ eor v16.16b, v0.16b,v18.16b //H^3
+ eor v17.16b, v5.16b,v4.16b //H^4
 
- ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing
- ext v17.16b,v25.16b,v25.16b,#8
+ ext v23.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
+ ext v25.16b,v17.16b,v17.16b,#8
  ext v18.16b,v22.16b,v22.16b,#8
  eor v16.16b,v16.16b,v23.16b
  eor v17.16b,v17.16b,v25.16b
@@ -104,10 +107,10 @@ _gcm_init_v8:
  st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]
 
  //calculate H^5 and H^6
- pmull v0.1q,v22.1d, v23.1d
- pmull v5.1q,v23.1d,v23.1d
- pmull2 v2.1q,v22.2d, v23.2d
- pmull2 v7.1q,v23.2d,v23.2d
+ pmull2 v0.1q,v22.2d, v23.2d
+ pmull2 v5.1q,v23.2d,v23.2d
+ pmull v2.1q,v22.1d, v23.1d
+ pmull v7.1q,v23.1d,v23.1d
  pmull v1.1q,v16.1d,v18.1d
  pmull v6.1q,v16.1d,v16.1d
@@ -135,11 +138,12 @@ _gcm_init_v8:
  pmull v5.1q,v5.1d,v19.1d
  eor v18.16b,v18.16b,v2.16b
  eor v4.16b,v4.16b,v7.16b
- eor v26.16b,v0.16b,v18.16b //H^5
- eor v28.16b,v5.16b,v4.16b //H^6
- ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing
- ext v17.16b,v28.16b,v28.16b,#8
+ eor v16.16b,v0.16b,v18.16b //H^5
+ eor v17.16b,v5.16b,v4.16b //H^6
+
+ ext v26.16b, v16.16b, v16.16b,#8 //Karatsuba pre-processing
+ ext v28.16b, v17.16b, v17.16b,#8
  ext v18.16b,v22.16b,v22.16b,#8
  eor v16.16b,v16.16b,v26.16b
  eor v17.16b,v17.16b,v28.16b
@@ -148,10 +152,10 @@ _gcm_init_v8:
  st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]
 
  //calculate H^7 and H^8
- pmull v0.1q,v22.1d,v26.1d
- pmull v5.1q,v22.1d,v28.1d
- pmull2 v2.1q,v22.2d,v26.2d
- pmull2 v7.1q,v22.2d,v28.2d
+ pmull2 v0.1q,v22.2d,v26.2d
+ pmull2 v5.1q,v22.2d,v28.2d
+ pmull v2.1q,v22.1d,v26.1d
+ pmull v7.1q,v22.1d,v28.1d
  pmull v1.1q,v16.1d,v18.1d
  pmull v6.1q,v17.1d,v18.1d
@@ -179,11 +183,11 @@ _gcm_init_v8:
  pmull v5.1q,v5.1d,v19.1d
  eor v18.16b,v18.16b,v2.16b
  eor v4.16b,v4.16b,v7.16b
- eor v29.16b,v0.16b,v18.16b //H^7
- eor v31.16b,v5.16b,v4.16b //H^8
+ eor v16.16b,v0.16b,v18.16b //H^7
+ eor v17.16b,v5.16b,v4.16b //H^8
 
- ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing
- ext v17.16b,v31.16b,v31.16b,#8
+ ext v29.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
+ ext v31.16b,v17.16b,v17.16b,#8
  eor v16.16b,v16.16b,v29.16b
  eor v17.16b,v17.16b,v31.16b
  ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
@@ -199,6 +203,7 @@ _gcm_gmult_v8:
  ld1 {v17.2d},[x0] //load Xi
  movi v19.16b,#0xe1
  ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
+ ext v20.16b,v20.16b,v20.16b,#8
  shl v19.2d,v19.2d,#57
 #ifndef __AARCH64EB__
  rev64 v17.16b,v17.16b
@@ -258,8 +263,10 @@ _gcm_ghash_v8:
  //loaded twice, but last
  //copy is not processed
  ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
+ ext v20.16b,v20.16b,v20.16b,#8
  movi v19.16b,#0xe1
  ld1 {v22.2d},[x1]
+ ext v22.16b,v22.16b,v22.16b,#8
  csel x12,xzr,x12,eq //is it time to zero x12?
  ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
  ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
@@ -373,8 +380,12 @@ gcm_ghash_v8_4x:
 Lgcm_ghash_v8_4x:
  ld1 {v0.2d},[x0] //load [rotated] Xi
  ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
+ ext v20.16b,v20.16b,v20.16b,#8
+ ext v22.16b,v22.16b,v22.16b,#8
  movi v19.16b,#0xe1
  ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
+ ext v26.16b,v26.16b,v26.16b,#8
+ ext v28.16b,v28.16b,v28.16b,#8
  shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
 
  ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
diff --git a/generated-src/ios-arm/crypto/fipsmodule/ghashv8-armx.S b/generated-src/ios-arm/crypto/fipsmodule/ghashv8-armx.S
index 37ee28ea5a..f76cf65092 100644
--- a/generated-src/ios-arm/crypto/fipsmodule/ghashv8-armx.S
+++ b/generated-src/ios-arm/crypto/fipsmodule/ghashv8-armx.S
@@ -34,13 +34,14 @@ _gcm_init_v8:
  vand q8,q8,q9
  vorr q3,q3,q10 @ H<<<=1
  veor q12,q3,q8 @ twisted H
+ vext.8 q12, q12, q12, #8
  vst1.64 {q12},[r0]! @ store Htable[0]
 
- @ calculate H^2
+ @ calculate H^2
  vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
-.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
+.byte 0xa9,0x0e,0xa9,0xf2 @ pmull2 q0,q12,q12
  veor q8,q8,q12
-.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
+.byte 0xa8,0x4e,0xa8,0xf2 @ pmull q2,q12,q12
 .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
 
  vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
@@ -56,12 +57,13 @@ _gcm_init_v8:
  vext.8 q10,q0,q0,#8 @ 2nd phase
 .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
  veor q10,q10,q2
- veor q14,q0,q10
+ veor q9,q0,q10
 
- vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
+ vext.8 q14,q9,q9,#8 @ Karatsuba pre-processing
  veor q9,q9,q14
  vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
- vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
+ vst1.64 {q13},[r0]! @ store Htable[1..2]
+ vst1.64 {q14},[r0]! @ store Htable[1..2]
  bx lr
 
 .globl _gcm_gmult_v8
@@ -75,6 +77,7 @@ _gcm_gmult_v8:
  vld1.64 {q9},[r0] @ load Xi
  vmov.i8 q11,#0xe1
  vld1.64 {q12,q13},[r1] @ load twisted H, ...
+ vext.8 q12,q12,q12,#8 vshl.u64 q11,q11,#57 #ifndef __ARMEB__ vrev64.8 q9,q9 @@ -135,8 +138,10 @@ _gcm_ghash_v8: @ loaded twice, but last @ copy is not processed vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 + vext.8 q12,q12,q12,#8 vmov.i8 q11,#0xe1 vld1.64 {q14},[r1] + vext.8 q14,q14,q14,#8 moveq r12,#0 @ is it time to zero r12? vext.8 q0,q0,q0,#8 @ rotate Xi vld1.64 {q8},[r2]! @ load [rotated] I[0] diff --git a/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S b/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S index 6983125b8d..55a51263f9 100644 --- a/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S +++ b/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S @@ -312,17 +312,13 @@ aesv8_gcm_8x_enc_128: .L128_enc_main_loop: //main loop start rev32 v5.16b, v30.16b //CTR block 8k+13 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev64 v8.16b, v8.16b //GHASH block 8k ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 @@ -353,9 +349,7 @@ aesv8_gcm_8x_enc_128: eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h3l | h3h - ext v25.16b, v25.16b, v25.16b, #8 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 @@ -450,9 +444,7 @@ aesv8_gcm_8x_enc_128: aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h1l | h1h - ext v22.16b, v22.16b, v22.16b, #8 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low @@ -666,15 +658,11 @@ aesv8_gcm_8x_enc_128: .L128_enc_prepretail: //PREPRETAIL rev32 v5.16b, v30.16b //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v8.16b, v8.16b //GHASH block 8k rev64 v9.16b, v9.16b //GHASH block 8k+1 @@ -752,9 +740,7 @@ aesv8_gcm_8x_enc_128: pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v5.16b, v27.16b @@ -806,9 +792,7 @@ aesv8_gcm_8x_enc_128: ldp q27, q28, [x11, #64] //load rk4, rk5 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h1l | h1h - ext v22.16b, v22.16b, v22.16b, #8 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 @@ -986,16 +970,12 @@ aesv8_gcm_8x_enc_128: mov v29.16b, v27.16b ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 .inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h - ext 
v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 cmp x5, #112 b.gt .L128_enc_blocks_more_than_7 @@ -1162,7 +1142,6 @@ aesv8_gcm_8x_enc_128: st1 { v9.16b}, [x2], #16 //AES final-3 block - store result ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block @@ -1199,7 +1178,6 @@ aesv8_gcm_8x_enc_128: ins v27.d[0], v8.d[1] //GHASH final-2 block - mid ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 movi v16.8b, #0 //supress further partial tag feed in eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid @@ -1219,7 +1197,6 @@ aesv8_gcm_8x_enc_128: st1 { v9.16b}, [x2], #16 //AES final-1 block - store result ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load plaintext @@ -1281,7 +1258,6 @@ aesv8_gcm_8x_enc_128: eor v16.8b, v16.8b, v8.8b //GHASH final block - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid @@ -1619,9 +1595,7 @@ aesv8_gcm_8x_dec_128: .L128_dec_main_loop: //main loop start ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev64 v8.16b, v8.16b //GHASH block 8k @@ -1629,9 +1603,7 @@ aesv8_gcm_8x_dec_128: rev64 v14.16b, v14.16b //GHASH block 8k+6 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v5.16b, v30.16b //CTR block 8k+13 @@ -1697,9 +1669,7 @@ aesv8_gcm_8x_dec_128: pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 @@ -1727,9 +1697,7 @@ aesv8_gcm_8x_dec_128: aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v1.16b, v28.16b @@ -1977,17 +1945,13 @@ aesv8_gcm_8x_dec_128: ldp q26, q27, [x11, #0] //load rk0, rk1 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 eor v8.16b, v8.16b, v19.16b //PRE 1 rev64 v9.16b, v9.16b //GHASH block 8k+1 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v13.16b, v13.16b //GHASH block 8k+5 rev64 v12.16b, v12.16b //GHASH block 8k+4 @@ -2060,9 +2024,7 @@ aesv8_gcm_8x_dec_128: pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid @@ -2075,9 +2037,7 @@ aesv8_gcm_8x_dec_128: aesmc v0.16b, v0.16b //AES block 8k+8 - 
round 1 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v0.16b, v28.16b @@ -2283,16 +2243,12 @@ aesv8_gcm_8x_dec_128: cmp x5, #112 ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 .inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result b.gt .L128_dec_blocks_more_than_7 @@ -2461,7 +2417,6 @@ aesv8_gcm_8x_dec_128: ins v27.d[0], v8.d[1] //GHASH final-3 block - mid ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 ldr q24, [x6, #64] //load h4k | h3k eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid @@ -2488,7 +2443,6 @@ aesv8_gcm_8x_dec_128: eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-2 block - mid @@ -2513,7 +2467,6 @@ aesv8_gcm_8x_dec_128: rev64 v8.16b, v9.16b //GHASH final-1 block ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -2560,7 +2513,6 @@ aesv8_gcm_8x_dec_128: mov v0.d[0], x13 //ctr0b is mask for last block ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits @@ -2963,16 +2915,12 @@ aesv8_gcm_8x_enc_192: rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v8.16b, v8.16b //GHASH block 8k ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v6.16b, v30.16b //CTR block 8k+14 @@ -3074,9 +3022,7 @@ aesv8_gcm_8x_enc_192: aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid @@ -3122,9 +3068,7 @@ aesv8_gcm_8x_enc_192: aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v2.16b, v28.16b @@ -3351,9 +3295,7 @@ aesv8_gcm_8x_enc_192: add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 @@ -3365,9 +3307,7 @@ aesv8_gcm_8x_enc_192: rev64 
v11.16b, v11.16b //GHASH block 8k+3 rev64 v10.16b, v10.16b //GHASH block 8k+2 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v7.16b, v30.16b //CTR block 8k+15 @@ -3475,17 +3415,13 @@ aesv8_gcm_8x_enc_192: aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) @@ -3695,19 +3631,15 @@ aesv8_gcm_8x_enc_192: .L192_enc_tail: //TAIL ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - l3ad plaintext ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 mov v29.16b, v26.16b ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 cmp x5, #112 .inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result @@ -3883,7 +3815,6 @@ aesv8_gcm_8x_enc_192: ldr q9, [x0], #16 //AES final-2 block - load plaintext ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 ins v27.d[0], v8.d[1] //GHASH final-3 block - mid @@ -3906,7 +3837,6 @@ aesv8_gcm_8x_enc_192: rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -3929,7 +3859,6 @@ aesv8_gcm_8x_enc_192: .L192_enc_blocks_more_than_1: //blocks left > 1 ldr q22, [x6, #32] //load h1l | h1h - ext v22.16b, v22.16b, v22.16b, #8 st1 { v9.16b}, [x2], #16 //AES final-1 block - store result rev64 v8.16b, v9.16b //GHASH final-1 block @@ -3975,7 +3904,6 @@ aesv8_gcm_8x_enc_192: mov v0.d[1], x14 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[0], x13 //ctr0b is mask for last block @@ -4380,9 +4308,7 @@ aesv8_gcm_8x_dec_192: add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v12.16b, v12.16b //GHASH block 8k+4 rev64 v11.16b, v11.16b //GHASH block 8k+3 @@ -4420,9 +4346,7 @@ aesv8_gcm_8x_dec_192: aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 @@ -4477,9 +4401,7 @@ aesv8_gcm_8x_dec_192: aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v2.16b, v26.16b @@ -4538,9 +4460,7 @@ aesv8_gcm_8x_dec_192: aesmc v7.16b, v7.16b 
//AES block 8k+15 - round 4 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v5.16b, v28.16b @@ -4765,9 +4685,7 @@ aesv8_gcm_8x_dec_192: add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 @@ -4780,9 +4698,7 @@ aesv8_gcm_8x_dec_192: rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev32 v7.16b, v30.16b //CTR block 8k+15 aese v0.16b, v26.16b @@ -4889,16 +4805,12 @@ aesv8_gcm_8x_dec_192: aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 @@ -5111,17 +5023,13 @@ aesv8_gcm_8x_dec_192: sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 mov v29.16b, v26.16b ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag .inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result @@ -5283,7 +5191,6 @@ aesv8_gcm_8x_dec_192: .L192_dec_blocks_more_than_3: //blocks left > 3 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block ldr q9, [x0], #16 //AES final-2 block - load ciphertext @@ -5312,7 +5219,6 @@ aesv8_gcm_8x_dec_192: rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -5339,7 +5245,6 @@ aesv8_gcm_8x_dec_192: rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load ciphertext ldr q22, [x6, #32] //load h1l | h1h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in @@ -5383,7 +5288,6 @@ aesv8_gcm_8x_dec_192: csel x13, x8, x7, lt csel x14, x7, xzr, lt ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 mov v0.d[1], x14 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored @@ -5825,9 +5729,7 @@ aesv8_gcm_8x_enc_256: rev64 v11.16b, v11.16b //GHASH block 8k+3 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v6.16b, v30.16b //CTR block 8k+14 @@ -5837,9 +5739,7 @@ aesv8_gcm_8x_enc_256: rev64 v12.16b, v12.16b //GHASH block 8k+4 ext v19.16b, v19.16b, 
v19.16b, #8 //PRE 0 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 @@ -5940,9 +5840,7 @@ aesv8_gcm_8x_enc_256: pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high @@ -6026,9 +5924,7 @@ aesv8_gcm_8x_enc_256: aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid @@ -6290,16 +6186,12 @@ aesv8_gcm_8x_enc_256: aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v5.16b, v27.16b @@ -6415,9 +6307,7 @@ aesv8_gcm_8x_enc_256: ldp q26, q27, [x11, #96] //load rk6, rk7 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid @@ -6459,9 +6349,7 @@ aesv8_gcm_8x_enc_256: pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v1.16b, v27.16b @@ -6635,18 +6523,14 @@ aesv8_gcm_8x_enc_256: .L256_enc_tail: //TAIL ldp q24, q25, [x6, #160] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - load plaintext ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 mov v29.16b, v28.16b cmp x5, #112 @@ -6816,7 +6700,6 @@ aesv8_gcm_8x_enc_256: st1 { v9.16b}, [x2], #16 //AES final-3 block - store result ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -6842,7 +6725,6 @@ aesv8_gcm_8x_enc_256: .L256_enc_blocks_more_than_2: //blocks left > 2 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 st1 { v9.16b}, [x2], #16 //AES final-2 block - store result @@ -6872,7 +6754,6 @@ aesv8_gcm_8x_enc_256: st1 { v9.16b}, [x2], #16 //AES final-1 block - store result ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v8.16b, 
v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load plaintext @@ -6916,7 +6797,6 @@ aesv8_gcm_8x_enc_256: mov v0.d[0], x13 //ctr0b is mask for last block ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[1], x14 @@ -7355,9 +7235,7 @@ aesv8_gcm_8x_dec_256: rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 @@ -7392,9 +7270,7 @@ aesv8_gcm_8x_dec_256: eor v8.16b, v8.16b, v19.16b //PRE 1 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 @@ -7547,9 +7423,7 @@ aesv8_gcm_8x_dec_256: .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v14.16b, v14.16b //GHASH block 8k+6 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid @@ -7560,9 +7434,7 @@ aesv8_gcm_8x_dec_256: ldp q28, q26, [x11, #128] //load rk8, rk9 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 @@ -7790,17 +7662,13 @@ aesv8_gcm_8x_dec_256: ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v10.16b, v10.16b //GHASH block 8k+2 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 @@ -7915,9 +7783,7 @@ aesv8_gcm_8x_dec_256: .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 @@ -7959,9 +7825,7 @@ aesv8_gcm_8x_dec_256: ldp q26, q27, [x11, #96] //load rk6, rk7 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v15.16b, v15.16b //GHASH block 8k+7 rev64 v13.16b, v13.16b //GHASH block 8k+5 @@ -8173,16 +8037,12 @@ aesv8_gcm_8x_dec_256: ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 mov v29.16b, v28.16b ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 .inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 b.gt .L256_dec_blocks_more_than_7 mov v7.16b, v6.16b @@ -8341,7 +8201,6 @@ aesv8_gcm_8x_dec_256: .L256_dec_blocks_more_than_3: 
//blocks left > 3 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -8371,7 +8230,6 @@ aesv8_gcm_8x_dec_256: rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q9, [x0], #16 //AES final-1 block - load ciphertext eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -8399,7 +8257,6 @@ aesv8_gcm_8x_dec_256: ins v27.d[0], v8.d[1] //GHASH final-1 block - mid ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ldr q9, [x0], #16 //AES final block - load ciphertext @@ -8447,7 +8304,6 @@ aesv8_gcm_8x_dec_256: and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing rev64 v8.16b, v9.16b //GHASH final block diff --git a/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S b/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S index df1b7d5c5b..0993d113b2 100644 --- a/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S +++ b/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S @@ -83,7 +83,6 @@ aes_gcm_enc_kernel: aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ldr q14, [x6, #48] // load h3l | h3h - ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 aese v2.16b, v19.16b @@ -92,14 +91,12 @@ aes_gcm_enc_kernel: aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 ldr q13, [x6, #32] // load h2l | h2h - ext v13.16b, v13.16b, v13.16b, #8 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q15, [x6, #80] // load h4l | h4h - ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 ldr q29, [x8, #176] // load rk11 @@ -142,7 +139,6 @@ aes_gcm_enc_kernel: aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 ldr q12, [x6] // load h1l | h1h - ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 ldr q28, [x8, #160] // load rk10 @@ -843,15 +839,12 @@ aes_gcm_dec_kernel: aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 ldr q14, [x6, #48] // load h3l | h3h - ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 ldr q15, [x6, #80] // load h4l | h4h - ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - round 0 ldr q13, [x6, #32] // load h2l | h2h - ext v13.16b, v13.16b, v13.16b, #8 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q20, [x8, #32] // load rk2 @@ -871,7 +864,6 @@ aes_gcm_dec_kernel: aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q12, [x6] // load h1l | h1h - ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q28, [x8, #160] // load rk10 diff --git a/generated-src/linux-aarch64/crypto/fipsmodule/ghashv8-armx.S b/generated-src/linux-aarch64/crypto/fipsmodule/ghashv8-armx.S index cf2f72b693..cab17deced 100644 --- a/generated-src/linux-aarch64/crypto/fipsmodule/ghashv8-armx.S +++ 
b/generated-src/linux-aarch64/crypto/fipsmodule/ghashv8-armx.S @@ -30,13 +30,14 @@ gcm_init_v8: and v16.16b,v16.16b,v17.16b orr v3.16b,v3.16b,v18.16b //H<<<=1 eor v20.16b,v3.16b,v16.16b //twisted H + ext v20.16b, v20.16b, v20.16b, #8 st1 {v20.2d},[x0],#16 //store Htable[0] - //calculate H^2 + //calculate H^2 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing - pmull v0.1q,v20.1d,v20.1d + pmull2 v0.1q,v20.2d,v20.2d eor v16.16b,v16.16b,v20.16b - pmull2 v2.1q,v20.2d,v20.2d + pmull v2.1q,v20.1d,v20.1d pmull v1.1q,v16.1d,v16.1d ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing @@ -52,17 +53,19 @@ gcm_init_v8: ext v18.16b,v0.16b,v0.16b,#8 //2nd phase pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b - eor v22.16b,v0.16b,v18.16b + eor v17.16b,v0.16b,v18.16b - ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + ext v22.16b,v17.16b,v17.16b,#8 //Karatsuba pre-processing eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + st1 {v21.2d},[x0],#16 //store Htable[1] + st1 {v22.2d},[x0],#16 //store Htable[2] + //calculate H^3 and H^4 - pmull v0.1q,v20.1d, v22.1d - pmull v5.1q,v22.1d,v22.1d - pmull2 v2.1q,v20.2d, v22.2d - pmull2 v7.1q,v22.2d,v22.2d + pmull2 v0.1q,v20.2d, v22.2d + pmull2 v5.1q,v22.2d,v22.2d + pmull v2.1q,v20.1d, v22.1d + pmull v7.1q,v22.1d,v22.1d pmull v1.1q,v16.1d,v17.1d pmull v6.1q,v17.1d,v17.1d @@ -91,11 +94,11 @@ gcm_init_v8: eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v23.16b, v0.16b,v18.16b //H^3 - eor v25.16b,v5.16b,v4.16b //H^4 + eor v16.16b, v0.16b,v18.16b //H^3 + eor v17.16b, v5.16b,v4.16b //H^4 - ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing - ext v17.16b,v25.16b,v25.16b,#8 + ext v23.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing + ext v25.16b,v17.16b,v17.16b,#8 ext v18.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v23.16b eor v17.16b,v17.16b,v25.16b @@ -104,10 +107,10 @@ gcm_init_v8: st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5] //calculate H^5 and H^6 - pmull v0.1q,v22.1d, v23.1d - pmull v5.1q,v23.1d,v23.1d - pmull2 v2.1q,v22.2d, v23.2d - pmull2 v7.1q,v23.2d,v23.2d + pmull2 v0.1q,v22.2d, v23.2d + pmull2 v5.1q,v23.2d,v23.2d + pmull v2.1q,v22.1d, v23.1d + pmull v7.1q,v23.1d,v23.1d pmull v1.1q,v16.1d,v18.1d pmull v6.1q,v16.1d,v16.1d @@ -135,11 +138,12 @@ gcm_init_v8: pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v26.16b,v0.16b,v18.16b //H^5 - eor v28.16b,v5.16b,v4.16b //H^6 - ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing - ext v17.16b,v28.16b,v28.16b,#8 + eor v16.16b,v0.16b,v18.16b //H^5 + eor v17.16b,v5.16b,v4.16b //H^6 + + ext v26.16b, v16.16b, v16.16b,#8 //Karatsuba pre-processing + ext v28.16b, v17.16b, v17.16b,#8 ext v18.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v26.16b eor v17.16b,v17.16b,v28.16b @@ -148,10 +152,10 @@ gcm_init_v8: st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8] //calculate H^7 and H^8 - pmull v0.1q,v22.1d,v26.1d - pmull v5.1q,v22.1d,v28.1d - pmull2 v2.1q,v22.2d,v26.2d - pmull2 v7.1q,v22.2d,v28.2d + pmull2 v0.1q,v22.2d,v26.2d + pmull2 v5.1q,v22.2d,v28.2d + pmull v2.1q,v22.1d,v26.1d + pmull v7.1q,v22.1d,v28.1d pmull v1.1q,v16.1d,v18.1d pmull v6.1q,v17.1d,v18.1d @@ -179,11 +183,11 @@ gcm_init_v8: pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v29.16b,v0.16b,v18.16b //H^7 - eor v31.16b,v5.16b,v4.16b //H^8 + eor v16.16b,v0.16b,v18.16b //H^7 + eor v17.16b,v5.16b,v4.16b //H^8 - ext v16.16b,v29.16b,v29.16b,#8
//Karatsuba pre-processing - ext v17.16b,v31.16b,v31.16b,#8 + ext v29.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing + ext v31.16b,v17.16b,v17.16b,#8 eor v16.16b,v16.16b,v29.16b eor v17.16b,v17.16b,v31.16b ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed @@ -199,6 +203,7 @@ gcm_gmult_v8: ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + ext v20.16b,v20.16b,v20.16b,#8 shl v19.2d,v19.2d,#57 #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b @@ -258,8 +263,10 @@ gcm_ghash_v8: //loaded twice, but last //copy is not processed ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + ext v20.16b,v20.16b,v20.16b,#8 movi v19.16b,#0xe1 ld1 {v22.2d},[x1] + ext v22.16b,v22.16b,v22.16b,#8 csel x12,xzr,x12,eq //is it time to zero x12? ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi ld1 {v16.2d},[x2],#16 //load [rotated] I[0] @@ -373,8 +380,12 @@ gcm_ghash_v8_4x: .Lgcm_ghash_v8_4x: ld1 {v0.2d},[x0] //load [rotated] Xi ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + ext v20.16b,v20.16b,v20.16b,#8 + ext v22.16b,v22.16b,v22.16b,#8 movi v19.16b,#0xe1 ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + ext v26.16b,v26.16b,v26.16b,#8 + ext v28.16b,v28.16b,v28.16b,#8 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 diff --git a/generated-src/linux-arm/crypto/fipsmodule/ghashv8-armx.S b/generated-src/linux-arm/crypto/fipsmodule/ghashv8-armx.S index fab4c124c2..470a2a934e 100644 --- a/generated-src/linux-arm/crypto/fipsmodule/ghashv8-armx.S +++ b/generated-src/linux-arm/crypto/fipsmodule/ghashv8-armx.S @@ -32,13 +32,14 @@ gcm_init_v8: vand q8,q8,q9 vorr q3,q3,q10 @ H<<<=1 veor q12,q3,q8 @ twisted H + vext.8 q12, q12, q12, #8 vst1.64 {q12},[r0]! @ store Htable[0] - @ calculate H^2 + @ calculate H^2 vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 +.byte 0xa9,0x0e,0xa9,0xf2 @ pmull2 q0,q12,q12 veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 +.byte 0xa8,0x4e,0xa8,0xf2 @ pmull q2,q12,q12 .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing @@ -54,12 +55,13 @@ gcm_init_v8: vext.8 q10,q0,q0,#8 @ 2nd phase .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 veor q10,q10,q2 - veor q14,q0,q10 + veor q9,q0,q10 - vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing + vext.8 q14,q9,q9,#8 @ Karatsuba pre-processing veor q9,q9,q14 vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed - vst1.64 {q13,q14},[r0]! @ store Htable[1..2] + vst1.64 {q13},[r0]! @ store Htable[1] + vst1.64 {q14},[r0]! @ store Htable[2] bx lr .size gcm_init_v8,.-gcm_init_v8 .globl gcm_gmult_v8 @@ -71,6 +73,7 @@ gcm_gmult_v8: vld1.64 {q9},[r0] @ load Xi vmov.i8 q11,#0xe1 vld1.64 {q12,q13},[r1] @ load twisted H, ... + vext.8 q12,q12,q12,#8 vshl.u64 q11,q11,#57 #ifndef __ARMEB__ vrev64.8 q9,q9 @@ -129,8 +132,10 @@ gcm_ghash_v8: @ loaded twice, but last @ copy is not processed vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 + vext.8 q12,q12,q12,#8 vmov.i8 q11,#0xe1 vld1.64 {q14},[r1] + vext.8 q14,q14,q14,#8 moveq r12,#0 @ is it time to zero r12? vext.8 q0,q0,q0,#8 @ rotate Xi vld1.64 {q8},[r2]! @ load [rotated] I[0]
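(Editor's note; an explanatory aside, not part of the generated diff. Every hunk above follows one pattern: the kernels used to re-rotate each Htable entry with an "ext ..., #8" after every load, and gcm_init_v8 now performs that rotation once when the table is built, compensating internally by swapping its pmull/pmull2 pairs. The swap works because rotating a 128-bit vector by 8 bytes exchanges the 64-bit halves that pmull (low halves) and pmull2 (high halves) consume. A minimal C sketch of that identity follows; it assumes an AArch64 toolchain with the crypto extension (e.g. -march=armv8-a+crypto), and the variable names are illustrative only, not taken from the source.

    #include <arm_neon.h>
    #include <assert.h>
    #include <string.h>

    int main(void) {
        /* Arbitrary stand-in for a "twisted H" table entry. */
        uint64_t lanes[2] = { 0x0123456789abcdefULL, 0xf0e1d2c3b4a59687ULL };
        poly64x2_t h = vreinterpretq_p64_u64(vld1q_u64(lanes));

        /* The per-load "ext v20.16b, v20.16b, v20.16b, #8" the kernels dropped. */
        uint8x16_t hb = vreinterpretq_u8_p64(h);
        poly64x2_t h_rot = vreinterpretq_p64_u8(vextq_u8(hb, hb, 8));

        /* pmull on the rotated value equals pmull2 on the original... */
        poly128_t lo_rot = vmull_p64(vgetq_lane_p64(h_rot, 0),
                                     vgetq_lane_p64(h_rot, 0));
        poly128_t hi_orig = vmull_high_p64(h, h);
        assert(memcmp(&lo_rot, &hi_orig, 16) == 0);

        /* ...and pmull2 on the rotated value equals pmull on the original,
           which is exactly the swap applied throughout gcm_init_v8. */
        poly128_t hi_rot = vmull_high_p64(h_rot, h_rot);
        poly128_t lo_orig = vmull_p64(vgetq_lane_p64(h, 0),
                                      vgetq_lane_p64(h, 0));
        assert(memcmp(&hi_rot, &lo_orig, 16) == 0);
        return 0;
    }

The rotation thus moves out of the per-block hot paths, where it ran once per H-power load in every main-loop, PREPRETAIL and TAIL pass, and becomes a one-time cost in key setup.)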
diff --git a/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S b/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S index fc0e3ad2dd..9b8d59edfd 100644 --- a/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S +++ b/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S @@ -314,17 +314,13 @@ aesv8_gcm_8x_enc_128: L128_enc_main_loop: //main loop start rev32 v5.16b, v30.16b //CTR block 8k+13 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev64 v8.16b, v8.16b //GHASH block 8k ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 @@ -355,9 +351,7 @@ L128_enc_main_loop: //main loop start eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 @@ -452,9 +446,7 @@ L128_enc_main_loop: //main loop start aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low @@ -668,15 +660,11 @@ L128_enc_main_loop: //main loop start L128_enc_prepretail: //PREPRETAIL rev32 v5.16b, v30.16b //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v8.16b, v8.16b //GHASH block 8k rev64 v9.16b, v9.16b //GHASH block 8k+1 @@ -754,9 +742,7 @@ L128_enc_prepretail: //PREPRETAIL pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v5.16b, v27.16b @@ -808,9 +794,7 @@ L128_enc_prepretail: //PREPRETAIL ldp q27, q28, [x11, #64] //load rk4, rk5 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 @@ -988,16 +972,12 @@ L128_enc_tail: //TAIL mov v29.16b, v27.16b ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 .long 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 cmp x5, #112 b.gt L128_enc_blocks_more_than_7 @@ -1164,7 +1144,6 @@ L128_enc_blocks_more_than_3: //blocks left > 3 st1 { v9.16b}, [x2], #16 //AES final-3 block - store result ldr q25, [x6,
#80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block @@ -1201,7 +1180,6 @@ L128_enc_blocks_more_than_2: //blocks left > 2 ins v27.d[0], v8.d[1] //GHASH final-2 block - mid ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 movi v16.8b, #0 //suppress further partial tag feed in eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid @@ -1221,7 +1199,6 @@ L128_enc_blocks_more_than_1: //blocks left > 1 st1 { v9.16b}, [x2], #16 //AES final-1 block - store result ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load plaintext @@ -1283,7 +1260,6 @@ L128_enc_blocks_less_than_1: //blocks left <= 1 eor v16.8b, v16.8b, v8.8b //GHASH final block - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid @@ -1623,9 +1599,7 @@ aesv8_gcm_8x_dec_128: L128_dec_main_loop: //main loop start ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev64 v8.16b, v8.16b //GHASH block 8k @@ -1633,9 +1607,7 @@ L128_dec_main_loop: //main loop start rev64 v14.16b, v14.16b //GHASH block 8k+6 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v5.16b, v30.16b //CTR block 8k+13 @@ -1701,9 +1673,7 @@ L128_dec_main_loop: //main loop start pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 @@ -1731,9 +1701,7 @@ L128_dec_main_loop: //main loop start aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v1.16b, v28.16b @@ -1981,17 +1949,13 @@ L128_dec_prepretail: //PREPRETAIL ldp q26, q27, [x11, #0] //load rk0, rk1 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 eor v8.16b, v8.16b, v19.16b //PRE 1 rev64 v9.16b, v9.16b //GHASH block 8k+1 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v13.16b, v13.16b //GHASH block 8k+5 rev64 v12.16b, v12.16b //GHASH block 8k+4 @@ -2064,9 +2028,7 @@ L128_dec_prepretail: //PREPRETAIL pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid @@ -2079,9 +2041,7 @@ L128_dec_prepretail: //PREPRETAIL aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 .long
0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v0.16b, v28.16b @@ -2287,16 +2247,12 @@ L128_dec_tail: //TAIL cmp x5, #112 ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 .long 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result b.gt L128_dec_blocks_more_than_7 @@ -2465,7 +2421,6 @@ L128_dec_blocks_more_than_3: //blocks left > 3 ins v27.d[0], v8.d[1] //GHASH final-3 block - mid ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 ldr q24, [x6, #64] //load h4k | h3k eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid @@ -2492,7 +2447,6 @@ L128_dec_blocks_more_than_2: //blocks left > 2 eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 movi v16.8b, #0 //suppress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-2 block - mid @@ -2517,7 +2471,6 @@ L128_dec_blocks_more_than_1: //blocks left > 1 rev64 v8.16b, v9.16b //GHASH final-1 block ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -2564,7 +2517,6 @@ L128_dec_blocks_less_than_1: //blocks left <= 1 mov v0.d[0], x13 //ctr0b is mask for last block ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits @@ -2969,16 +2921,12 @@ L192_enc_main_loop: //main loop start rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v8.16b, v8.16b //GHASH block 8k ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v6.16b, v30.16b //CTR block 8k+14 @@ -3080,9 +3028,7 @@ L192_enc_main_loop: //main loop start aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid @@ -3128,9 +3074,7 @@ L192_enc_main_loop: //main loop start aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v2.16b, v28.16b @@ -3357,9 +3301,7 @@ L192_enc_prepretail: //PREPRETAIL add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 @@ -3371,9 +3313,7 @@ L192_enc_prepretail:
//PREPRETAIL rev64 v11.16b, v11.16b //GHASH block 8k+3 rev64 v10.16b, v10.16b //GHASH block 8k+2 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v7.16b, v30.16b //CTR block 8k+15 @@ -3481,17 +3421,13 @@ L192_enc_prepretail: //PREPRETAIL aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) @@ -3701,19 +3637,15 @@ L192_enc_prepretail: //PREPRETAIL L192_enc_tail: //TAIL ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - load plaintext ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 mov v29.16b, v26.16b ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 cmp x5, #112 .long 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result @@ -3889,7 +3821,6 @@ L192_enc_blocks_more_than_3: //blocks left > 3 ldr q9, [x0], #16 //AES final-2 block - load plaintext ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 ins v27.d[0], v8.d[1] //GHASH final-3 block - mid @@ -3912,7 +3843,6 @@ L192_enc_blocks_more_than_2: //blocks left > 2 rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -3935,7 +3865,6 @@ L192_enc_blocks_more_than_2: //blocks left > 2 L192_enc_blocks_more_than_1: //blocks left > 1 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 st1 { v9.16b}, [x2], #16 //AES final-1 block - store result rev64 v8.16b, v9.16b //GHASH final-1 block @@ -3981,7 +3910,6 @@ L192_enc_blocks_less_than_1: //blocks left <= 1 mov v0.d[1], x14 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[0], x13 //ctr0b is mask for last block @@ -4388,9 +4316,7 @@ L192_dec_main_loop: //main loop start add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v12.16b, v12.16b //GHASH block 8k+4 rev64 v11.16b, v11.16b //GHASH block 8k+3 @@ -4428,9 +4354,7 @@ L192_dec_main_loop: //main loop start aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 @@ -4485,9 +4409,7 @@ L192_dec_main_loop: //main loop start aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h -
ext v25.16b, v25.16b, v25.16b, #8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v2.16b, v26.16b @@ -4546,9 +4468,7 @@ L192_dec_main_loop: //main loop start aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v5.16b, v28.16b @@ -4773,9 +4693,7 @@ L192_dec_prepretail: //PREPRETAIL add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 @@ -4788,9 +4706,7 @@ L192_dec_prepretail: //PREPRETAIL rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev32 v7.16b, v30.16b //CTR block 8k+15 aese v0.16b, v26.16b @@ -4897,16 +4813,12 @@ L192_dec_prepretail: //PREPRETAIL aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 @@ -5119,17 +5031,13 @@ L192_dec_tail: //TAIL sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 mov v29.16b, v26.16b ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag .long 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result @@ -5291,7 +5199,6 @@ L192_dec_blocks_more_than_4: //blocks left > 4 L192_dec_blocks_more_than_3: //blocks left > 3 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block ldr q9, [x0], #16 //AES final-2 block - load ciphertext @@ -5320,7 +5227,6 @@ L192_dec_blocks_more_than_2: //blocks left > 2 rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -5347,7 +5253,6 @@ L192_dec_blocks_more_than_1: //blocks left > 1 rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load ciphertext ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //suppress further partial tag feed in @@ -5391,7 +5296,6 @@ L192_dec_blocks_less_than_1: //blocks left <= 1 csel x13, x8, x7, lt csel x14, x7, xzr, lt ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 mov v0.d[1], x14 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored @@ -5835,9 +5739,7 @@ L256_enc_main_loop: //main loop start rev64 v11.16b,
v11.16b //GHASH block 8k+3 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v6.16b, v30.16b //CTR block 8k+14 @@ -5847,9 +5749,7 @@ L256_enc_main_loop: //main loop start rev64 v12.16b, v12.16b //GHASH block 8k+4 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 @@ -5950,9 +5850,7 @@ L256_enc_main_loop: //main loop start pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid .long 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high @@ -6036,9 +5934,7 @@ L256_enc_main_loop: //main loop start aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 .long 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid @@ -6300,16 +6196,12 @@ L256_enc_prepretail: //PREPRETAIL aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v5.16b, v27.16b @@ -6425,9 +6317,7 @@ L256_enc_prepretail: //PREPRETAIL ldp q26, q27, [x11, #96] //load rk6, rk7 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid @@ -6469,9 +6359,7 @@ L256_enc_prepretail: //PREPRETAIL pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v1.16b, v27.16b @@ -6645,18 +6533,14 @@ L256_enc_prepretail: //PREPRETAIL L256_enc_tail: //TAIL ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - load plaintext ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 mov v29.16b, v28.16b cmp x5, #112 @@ -6826,7 +6710,6 @@ L256_enc_blocks_more_than_3: //blocks left > 3 st1 { v9.16b}, [x2], #16 //AES final-3 block - store result ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH
final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -6852,7 +6735,6 @@ L256_enc_blocks_more_than_3: //blocks left > 3 L256_enc_blocks_more_than_2: //blocks left > 2 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 st1 { v9.16b}, [x2], #16 //AES final-2 block - store result @@ -6882,7 +6764,6 @@ L256_enc_blocks_more_than_1: //blocks left > 1 st1 { v9.16b}, [x2], #16 //AES final-1 block - store result ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load plaintext @@ -6926,7 +6807,6 @@ L256_enc_blocks_less_than_1: //blocks left <= 1 mov v0.d[0], x13 //ctr0b is mask for last block ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[1], x14 @@ -7367,9 +7247,7 @@ L256_dec_main_loop: //main loop start rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 @@ -7404,9 +7282,7 @@ L256_dec_main_loop: //main loop start eor v8.16b, v8.16b, v19.16b //PRE 1 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 @@ -7559,9 +7435,7 @@ L256_dec_main_loop: //main loop start .long 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v14.16b, v14.16b //GHASH block 8k+6 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid @@ -7572,9 +7446,7 @@ L256_dec_main_loop: //main loop start ldp q28, q26, [x11, #128] //load rk8, rk9 ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 .long 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 @@ -7802,17 +7674,13 @@ L256_dec_prepretail: //PREPRETAIL ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q23, [x6, #144] //load h7l | h7h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #176] //load h8l | h8h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v10.16b, v10.16b //GHASH block 8k+2 ldr q20, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 @@ -7927,9 +7795,7 @@ L256_dec_prepretail: //PREPRETAIL .long 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 @@ -7971,9 +7837,7 @@ L256_dec_prepretail: //PREPRETAIL ldp q26, q27, [x11, #96] //load rk6, rk7 ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v15.16b, v15.16b //GHASH 
block 8k+7 rev64 v13.16b, v13.16b //GHASH block 8k+5 @@ -8185,16 +8049,12 @@ L256_dec_tail: //TAIL ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q24, q25, [x6, #160] //load h8k | h7k - ext v25.16b, v25.16b, v25.16b, #8 mov v29.16b, v28.16b ldp q20, q21, [x6, #96] //load h5l | h5h - ext v20.16b, v20.16b, v20.16b, #8 .long 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result ldp q22, q23, [x6, #128] //load h6l | h6h - ext v22.16b, v22.16b, v22.16b, #8 - ext v23.16b, v23.16b, v23.16b, #8 b.gt L256_dec_blocks_more_than_7 mov v7.16b, v6.16b @@ -8353,7 +8213,6 @@ L256_dec_blocks_more_than_4: //blocks left > 4 L256_dec_blocks_more_than_3: //blocks left > 3 ldr q25, [x6, #80] //load h4l | h4h - ext v25.16b, v25.16b, v25.16b, #8 rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -8383,7 +8242,6 @@ L256_dec_blocks_more_than_2: //blocks left > 2 rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h - ext v23.16b, v23.16b, v23.16b, #8 ldr q9, [x0], #16 //AES final-1 block - load ciphertext eor v8.16b, v8.16b, v16.16b //feed in partial tag @@ -8411,7 +8269,6 @@ L256_dec_blocks_more_than_1: //blocks left > 1 ins v27.d[0], v8.d[1] //GHASH final-1 block - mid ldr q22, [x6, #32] //load h2l | h2h - ext v22.16b, v22.16b, v22.16b, #8 eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ldr q9, [x0], #16 //AES final block - load ciphertext @@ -8459,7 +8316,6 @@ L256_dec_blocks_less_than_1: //blocks left <= 1 and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits ldr q20, [x6] //load h1l | h1h - ext v20.16b, v20.16b, v20.16b, #8 bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing rev64 v8.16b, v9.16b //GHASH final block diff --git a/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S b/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S index 250323a687..a0b342a50a 100644 --- a/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S +++ b/generated-src/win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8.S @@ -85,7 +85,6 @@ aes_gcm_enc_kernel: aese v1.16b, v19.16b aesmc v1.16b, v1.16b // AES block 1 - round 1 ldr q14, [x6, #48] // load h3l | h3h - ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 aese v2.16b, v19.16b @@ -94,14 +93,12 @@ aes_gcm_enc_kernel: aese v1.16b, v20.16b aesmc v1.16b, v1.16b // AES block 1 - round 2 ldr q13, [x6, #32] // load h2l | h2h - ext v13.16b, v13.16b, v13.16b, #8 aese v3.16b, v19.16b aesmc v3.16b, v3.16b // AES block 3 - round 1 ldr q30, [x8, #192] // load rk12 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q15, [x6, #80] // load h4l | h4h - ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v21.16b aesmc v1.16b, v1.16b // AES block 1 - round 3 ldr q29, [x8, #176] // load rk11 @@ -144,7 +141,6 @@ aes_gcm_enc_kernel: aese v0.16b, v24.16b aesmc v0.16b, v0.16b // AES block 0 - round 6 ldr q12, [x6] // load h1l | h1h - ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v24.16b aesmc v2.16b, v2.16b // AES block 2 - round 6 ldr q28, [x8, #160] // load rk10 @@ -847,15 +843,12 @@ aes_gcm_dec_kernel: aese v0.16b, v18.16b aesmc v0.16b, v0.16b // AES block 0 - round 0 ldr q14, [x6, #48] // load h3l | h3h - ext v14.16b, v14.16b, v14.16b, #8 aese v3.16b, v18.16b aesmc v3.16b, v3.16b // AES block 3 - round 0 ldr q15, [x6, #80] // load h4l | h4h - ext v15.16b, v15.16b, v15.16b, #8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b // AES block 1 - 
round 0 ldr q13, [x6, #32] // load h2l | h2h - ext v13.16b, v13.16b, v13.16b, #8 aese v2.16b, v18.16b aesmc v2.16b, v2.16b // AES block 2 - round 0 ldr q20, [x8, #32] // load rk2 @@ -875,7 +868,6 @@ aes_gcm_dec_kernel: aese v0.16b, v20.16b aesmc v0.16b, v0.16b // AES block 0 - round 2 ldr q12, [x6] // load h1l | h1h - ext v12.16b, v12.16b, v12.16b, #8 aese v2.16b, v20.16b aesmc v2.16b, v2.16b // AES block 2 - round 2 ldr q28, [x8, #160] // load rk10 diff --git a/generated-src/win-aarch64/crypto/fipsmodule/ghashv8-armx.S b/generated-src/win-aarch64/crypto/fipsmodule/ghashv8-armx.S index b352fa31a8..dd9b9dee27 100644 --- a/generated-src/win-aarch64/crypto/fipsmodule/ghashv8-armx.S +++ b/generated-src/win-aarch64/crypto/fipsmodule/ghashv8-armx.S @@ -32,13 +32,14 @@ gcm_init_v8: and v16.16b,v16.16b,v17.16b orr v3.16b,v3.16b,v18.16b //H<<<=1 eor v20.16b,v3.16b,v16.16b //twisted H + ext v20.16b, v20.16b, v20.16b, #8 st1 {v20.2d},[x0],#16 //store Htable[0] - //calculate H^2 + //calculate H^2 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing - pmull v0.1q,v20.1d,v20.1d + pmull2 v0.1q,v20.2d,v20.2d eor v16.16b,v16.16b,v20.16b - pmull2 v2.1q,v20.2d,v20.2d + pmull v2.1q,v20.1d,v20.1d pmull v1.1q,v16.1d,v16.1d ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing @@ -54,17 +55,19 @@ gcm_init_v8: ext v18.16b,v0.16b,v0.16b,#8 //2nd phase pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b - eor v22.16b,v0.16b,v18.16b + eor v17.16b,v0.16b,v18.16b - ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + ext v22.16b,v17.16b,v17.16b,#8 //Karatsuba pre-processing eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + st1 {v21.2d},[x0],#16 //store Htable[1] + st1 {v22.2d},[x0],#16 //store Htable[2] + //calculate H^3 and H^4 - pmull v0.1q,v20.1d, v22.1d - pmull v5.1q,v22.1d,v22.1d - pmull2 v2.1q,v20.2d, v22.2d - pmull2 v7.1q,v22.2d,v22.2d + pmull2 v0.1q,v20.2d, v22.2d + pmull2 v5.1q,v22.2d,v22.2d + pmull v2.1q,v20.1d, v22.1d + pmull v7.1q,v22.1d,v22.1d pmull v1.1q,v16.1d,v17.1d pmull v6.1q,v17.1d,v17.1d @@ -93,11 +96,11 @@ gcm_init_v8: eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v23.16b, v0.16b,v18.16b //H^3 - eor v25.16b,v5.16b,v4.16b //H^4 + eor v16.16b, v0.16b,v18.16b //H^3 + eor v17.16b, v5.16b,v4.16b //H^4 - ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing - ext v17.16b,v25.16b,v25.16b,#8 + ext v23.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing + ext v25.16b,v17.16b,v17.16b,#8 ext v18.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v23.16b eor v17.16b,v17.16b,v25.16b @@ -106,10 +109,10 @@ gcm_init_v8: st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5] //calculate H^5 and H^6 - pmull v0.1q,v22.1d, v23.1d - pmull v5.1q,v23.1d,v23.1d - pmull2 v2.1q,v22.2d, v23.2d - pmull2 v7.1q,v23.2d,v23.2d + pmull2 v0.1q,v22.2d, v23.2d + pmull2 v5.1q,v23.2d,v23.2d + pmull v2.1q,v22.1d, v23.1d + pmull v7.1q,v23.1d,v23.1d pmull v1.1q,v16.1d,v18.1d pmull v6.1q,v16.1d,v16.1d @@ -137,11 +140,12 @@ gcm_init_v8: pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v26.16b,v0.16b,v18.16b //H^5 - eor v28.16b,v5.16b,v4.16b //H^6 - ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing - ext v17.16b,v28.16b,v28.16b,#8 + eor v16.16b,v0.16b,v18.16b //H^5 + eor v17.16b,v5.16b,v4.16b //H^6 + + ext v26.16b, v16.16b, v16.16b,#8 //Karatsuba pre-processing + ext v28.16b, v17.16b, v17.16b,#8 ext v18.16b,v22.16b,v22.16b,#8 eor v16.16b,v16.16b,v26.16b eor
v17.16b,v17.16b,v28.16b @@ -150,10 +154,10 @@ gcm_init_v8: st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8] //calculate H^7 and H^8 - pmull v0.1q,v22.1d,v26.1d - pmull v5.1q,v22.1d,v28.1d - pmull2 v2.1q,v22.2d,v26.2d - pmull2 v7.1q,v22.2d,v28.2d + pmull2 v0.1q,v22.2d,v26.2d + pmull2 v5.1q,v22.2d,v28.2d + pmull v2.1q,v22.1d,v26.1d + pmull v7.1q,v22.1d,v28.1d pmull v1.1q,v16.1d,v18.1d pmull v6.1q,v17.1d,v18.1d @@ -181,11 +185,11 @@ gcm_init_v8: pmull v5.1q,v5.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v4.16b,v4.16b,v7.16b - eor v29.16b,v0.16b,v18.16b //H^7 - eor v31.16b,v5.16b,v4.16b //H^8 + eor v16.16b,v0.16b,v18.16b //H^7 + eor v17.16b,v5.16b,v4.16b //H^8 - ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing - ext v17.16b,v31.16b,v31.16b,#8 + ext v29.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing + ext v31.16b,v17.16b,v17.16b,#8 eor v16.16b,v16.16b,v29.16b eor v17.16b,v17.16b,v31.16b ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed @@ -203,6 +207,7 @@ gcm_gmult_v8: ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + ext v20.16b,v20.16b,v20.16b,#8 shl v19.2d,v19.2d,#57 #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b @@ -264,8 +269,10 @@ gcm_ghash_v8: //loaded twice, but last //copy is not processed ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + ext v20.16b,v20.16b,v20.16b,#8 movi v19.16b,#0xe1 ld1 {v22.2d},[x1] + ext v22.16b,v22.16b,v22.16b,#8 csel x12,xzr,x12,eq //is it time to zero x12? ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi ld1 {v16.2d},[x2],#16 //load [rotated] I[0] @@ -381,8 +388,12 @@ gcm_ghash_v8_4x: Lgcm_ghash_v8_4x: ld1 {v0.2d},[x0] //load [rotated] Xi ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + ext v20.16b,v20.16b,v20.16b,#8 + ext v22.16b,v22.16b,v22.16b,#8 movi v19.16b,#0xe1 ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + ext v26.16b,v26.16b,v26.16b,#8 + ext v28.16b,v28.16b,v28.16b,#8 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
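(Editor's note; an explanatory aside, not part of the generated diff. The "Karatsuba pre-processing" and "Karatsuba post-processing" comments in gcm_init_v8 refer to forming each 128x128-bit carry-less product from three 64x64-bit pmulls rather than four: lo = a0*b0, hi = a1*b1, plus one middle product of the XORed halves. The sketch below restates that structure with C intrinsics, under the same toolchain assumption as the earlier note; it stops at the 256-bit product, whereas the assembly continues with the reduction modulo the GHASH polynomial using the 0xc2.0 constant composed from 0xe1 in v19 (q11 on 32-bit ARM). Names are illustrative, not taken from the source.

    #include <arm_neon.h>

    /* out[0] = low and out[1] = high 128 bits of the 256-bit carry-less
       product a*b over GF(2); the GHASH reduction step is omitted. */
    static void clmul256_karatsuba(poly64x2_t a, poly64x2_t b,
                                   uint8x16_t out[2]) {
        uint8x16_t va = vreinterpretq_u8_p64(a);
        uint8x16_t vb = vreinterpretq_u8_p64(b);

        /* The pmull/pmull2 pair: a0*b0 and a1*b1. */
        uint8x16_t lo = vreinterpretq_u8_p128(
            vmull_p64(vgetq_lane_p64(a, 0), vgetq_lane_p64(b, 0)));
        uint8x16_t hi = vreinterpretq_u8_p128(vmull_high_p64(a, b));

        /* Karatsuba pre-processing: the ext #8 / eor pairs fold each
           operand's halves, producing (a0^a1) and (b0^b1). */
        poly64x2_t am = vreinterpretq_p64_u8(veorq_u8(vextq_u8(va, va, 8), va));
        poly64x2_t bm = vreinterpretq_p64_u8(veorq_u8(vextq_u8(vb, vb, 8), vb));
        uint8x16_t mid = vreinterpretq_u8_p128(
            vmull_p64(vgetq_lane_p64(am, 0), vgetq_lane_p64(bm, 0)));

        /* Karatsuba post-processing: fold mid ^ lo ^ hi into the middle
           128 bits of the 256-bit result. */
        mid = veorq_u8(mid, veorq_u8(lo, hi));
        uint8x16_t zero = vdupq_n_u8(0);
        out[0] = veorq_u8(lo, vextq_u8(zero, mid, 8)); /* ^= mid << 64 */
        out[1] = veorq_u8(hi, vextq_u8(mid, zero, 8)); /* ^= mid >> 64 */
    }

    int main(void) {
        uint64_t one[2] = { 1, 0 };  /* the polynomial 1    */
        uint64_t x64[2] = { 0, 1 };  /* the polynomial x^64 */
        poly64x2_t a = vreinterpretq_p64_u64(vld1q_u64(one));
        poly64x2_t b = vreinterpretq_p64_u64(vld1q_u64(x64));
        uint8x16_t out[2];
        clmul256_karatsuba(a, b, out); /* expect 1 * x^64 = x^64 */
        uint8_t bytes[16];
        vst1q_u8(bytes, out[0]);
        return bytes[8] == 1 ? 0 : 1;  /* bit 64 set in the low half */
    }

This three-multiply shape is also why the table interleaves the packed middle operands (the "h2k | h1k", "h4k | h3k", ... entries the tail paths load) with the powers of H: the kernels reuse the pre-XORed halves instead of re-deriving them for every block.)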