From 8d0d405eedb9e8f9d0d381a5e85235684df0a868 Mon Sep 17 00:00:00 2001 From: Jayanth Krishnamurthy Date: Mon, 19 Feb 2024 13:17:46 -0600 Subject: [PATCH] x/crypto/chacha20: cleanup chacha_ppc64le.s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adding PCALIGN before the loops - Changing WORD directive with corresponding Vector Merge EVEN/ODD word instructions - Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ - VPERMXOR instruction usage in place of VXOR instructions followed by VRLW (rotate left) for cases of rotating in multiples of 8. This replacements give performace improvement both in time and space of around 7%-8% as listed below using benchstat tool. goos: linux goarch: ppc64le pkg: golang.org/x/crypto/chacha20 cpu: POWER10 | chacha20.prev.out | chacha20.new.out | | sec/op | sec/op vs base | ChaCha20/64 171.9n ± 0% 156.6n ± 1% -8.90% (p=0.002 n=6) ChaCha20/256 165.5n ± 0% 152.4n ± 0% -7.92% (p=0.002 n=6) ChaCha20/10x25 505.8n ± 0% 504.3n ± 2% -0.32% (p=0.589 n=6) ChaCha20/4096 2.265µ ± 0% 2.052µ ± 0% -9.40% (p=0.002 n=6) ChaCha20/100x40 5.359µ ± 3% 5.018µ ± 2% -6.37% (p=0.002 n=6) ChaCha20/65536 35.71µ ± 0% 32.29µ ± 0% -9.57% (p=0.002 n=6) ChaCha20/1000x65 44.63µ ± 0% 41.05µ ± 0% -8.02% (p=0.002 n=6) geomean 2.235µ 2.073µ -7.26% | chacha20.prev.out | chacha20.new.out | | B/s | B/s vs base | ChaCha20/64 355.1Mi ± 0% 389.8Mi ± 1% +9.78% (p=0.002 n=6) ChaCha20/256 1.440Gi ± 0% 1.565Gi ± 0% +8.62% (p=0.002 n=6) ChaCha20/10x25 471.3Mi ± 0% 472.8Mi ± 2% +0.31% (p=0.589 n=6) ChaCha20/4096 1.684Gi ± 0% 1.859Gi ± 0% +10.38% (p=0.002 n=6) ChaCha20/100x40 711.8Mi ± 3% 760.3Mi ± 2% +6.80% (p=0.002 n=6) ChaCha20/65536 1.709Gi ± 0% 1.890Gi ± 0% +10.59% (p=0.002 n=6) ChaCha20/1000x65 1.356Gi ± 0% 1.475Gi ± 0% +8.72% (p=0.002 n=6) geomean 957.3Mi 1.008Gi +7.83% Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797 Reviewed-by: Lynn Boger Run-TryBot: Paul Murphy TryBot-Result: Gopher Robot Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI Run-TryBot: Lynn Boger Reviewed-by: Cherry Mui --- chacha20/chacha_ppc64le.s | 110 ++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64le.s index 66aebae258..c672ccf698 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64le.s @@ -33,6 +33,9 @@ #define CONSTBASE R16 #define BLOCKS R17 +// for VPERMXOR +#define MASK R18 + DATA consts<>+0x00(SB)/8, $0x3320646e61707865 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 DATA consts<>+0x10(SB)/8, $0x0000000000000001 @@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 DATA consts<>+0x90(SB)/8, $0x0000000100000000 DATA consts<>+0x98(SB)/8, $0x0000000300000002 -GLOBL consts<>(SB), RODATA, $0xa0 +DATA consts<>+0xa0(SB)/8, $0x5566774411223300 +DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 +DATA consts<>+0xb0(SB)/8, $0x6677445522330011 +DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +GLOBL consts<>(SB), RODATA, $0xc0 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 @@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $48, R10 MOVD $64, R11 SRD $6, LEN, BLOCKS + // for VPERMXOR + MOVD $consts<>+0xa0(SB), MASK + MOVD $16, R20 // V16 LXVW4X (CONSTBASE)(R0), VS48 ADD $80,CONSTBASE @@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // V28 LXVW4X (CONSTBASE)(R11), VS60 + // Load mask constants for VPERMXOR + LXVW4X (MASK)(R0), V20 + LXVW4X (MASK)(R20), V21 + // splat slot from V19 -> V26 VSPLTW $0, V19, V26 @@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $10, R14 MOVD R14, CTR - + PCALIGN $16 loop_outer_vsx: // V0, V1, V2, V3 LXVW4X (R0)(CONSTBASE), VS32 @@ -128,22 +142,17 @@ loop_outer_vsx: VSPLTISW $12, V28 VSPLTISW $8, V29 VSPLTISW $7, V30 - + PCALIGN $16 loop_vsx: VADDUWM V0, V4, V0 VADDUWM V1, V5, V1 VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 - VRLW V15, V27, V15 + VPERMXOR V12, V0, V21, V12 + VPERMXOR V13, V1, V21, V13 + VPERMXOR V14, V2, V21, V14 + VPERMXOR V15, V3, V21, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -165,15 +174,10 @@ loop_vsx: VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 - VRLW V15, V29, V15 + VPERMXOR V12, V0, V20, V12 + VPERMXOR V13, V1, V20, V13 + VPERMXOR V14, V2, V20, V14 + VPERMXOR V15, V3, V20, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -195,15 +199,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V27, V15 - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 + VPERMXOR V15, V0, V21, V15 + VPERMXOR V12, V1, V21, V12 + VPERMXOR V13, V2, V21, V13 + VPERMXOR V14, V3, V21, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -225,15 +224,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V29, V15 - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 + VPERMXOR V15, V0, V20, V15 + VPERMXOR V12, V1, V20, V12 + VPERMXOR V13, V2, V20, V13 + VPERMXOR V14, V3, V20, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -249,48 +243,48 @@ loop_vsx: VRLW V6, V30, V6 VRLW V7, V30, V7 VRLW V4, V30, V4 - BC 16, LT, loop_vsx + BDNZ loop_vsx VADDUWM V12, V26, V12 - WORD $0x13600F8C // VMRGEW V0, V1, V27 - WORD $0x13821F8C // VMRGEW V2, V3, V28 + VMRGEW V0, V1, V27 + VMRGEW V2, V3, V28 - WORD $0x10000E8C // VMRGOW V0, V1, V0 - WORD $0x10421E8C // VMRGOW V2, V3, V2 + VMRGOW V0, V1, V0 + VMRGOW V2, V3, V2 - WORD $0x13A42F8C // VMRGEW V4, V5, V29 - WORD $0x13C63F8C // VMRGEW V6, V7, V30 + VMRGEW V4, V5, V29 + VMRGEW V6, V7, V30 XXPERMDI VS32, VS34, $0, VS33 XXPERMDI VS32, VS34, $3, VS35 XXPERMDI VS59, VS60, $0, VS32 XXPERMDI VS59, VS60, $3, VS34 - WORD $0x10842E8C // VMRGOW V4, V5, V4 - WORD $0x10C63E8C // VMRGOW V6, V7, V6 + VMRGOW V4, V5, V4 + VMRGOW V6, V7, V6 - WORD $0x13684F8C // VMRGEW V8, V9, V27 - WORD $0x138A5F8C // VMRGEW V10, V11, V28 + VMRGEW V8, V9, V27 + VMRGEW V10, V11, V28 XXPERMDI VS36, VS38, $0, VS37 XXPERMDI VS36, VS38, $3, VS39 XXPERMDI VS61, VS62, $0, VS36 XXPERMDI VS61, VS62, $3, VS38 - WORD $0x11084E8C // VMRGOW V8, V9, V8 - WORD $0x114A5E8C // VMRGOW V10, V11, V10 + VMRGOW V8, V9, V8 + VMRGOW V10, V11, V10 - WORD $0x13AC6F8C // VMRGEW V12, V13, V29 - WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + VMRGEW V12, V13, V29 + VMRGEW V14, V15, V30 XXPERMDI VS40, VS42, $0, VS41 XXPERMDI VS40, VS42, $3, VS43 XXPERMDI VS59, VS60, $0, VS40 XXPERMDI VS59, VS60, $3, VS42 - WORD $0x118C6E8C // VMRGOW V12, V13, V12 - WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + VMRGOW V12, V13, V12 + VMRGOW V14, V15, V14 VSPLTISW $4, V27 VADDUWM V26, V27, V26 @@ -431,7 +425,7 @@ tail_vsx: ADD $-1, R11, R12 ADD $-1, INP ADD $-1, OUT - + PCALIGN $16 looptail_vsx: // Copying the result to OUT // in bytes. @@ -439,7 +433,7 @@ looptail_vsx: MOVBZU 1(INP), TMP XOR KEY, TMP, KEY MOVBU KEY, 1(OUT) - BC 16, LT, looptail_vsx + BDNZ looptail_vsx // Clear the stack values STXVW4X VS48, (R11)(R0)