crypto: arm64/aes-modes - get rid of literal load of addend vector

Replace the literal load of the addend vector with a sequence that performs each add individually. This sequence is only 2 instructions longer than the original, and 2% faster on Cortex-A53. This is an improvement by itself, but also works around a Clang issue, whose integrated assembler does not implement the GNU ARM asm syntax completely, and does not support the =literal notation for FP registers (more info at https://bugs.llvm.org/show_bug.cgi?id=38642) Cc: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2025-03-06 20:59:54 +01:00 · 2018-08-23 17:48:45 +01:00 · 2018-08-23 17:48:45 +01:00 · ed6ed11830
commit ed6ed11830
parent 00227e3a1d
1 changed files with 9 additions and 7 deletions
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@ -232,17 +232,19 @@ AES_ENTRY(aes_ctr_encrypt)
 	bmi		.Lctr1x
 	cmn		w6, #4			/* 32 bit overflow? */
 	bcs		.Lctr1x
-	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
+	add		w7, w6, #1
 	dup		v7.4s, w6
 	mov		v0.16b, v4.16b
-	add		v7.4s, v7.4s, v8.4s
+	add		w8, w6, #2
 	mov		v1.16b, v4.16b
-	rev32		v8.16b, v7.16b
+	add		w9, w6, #3
 	mov		v2.16b, v4.16b
 	rev		w7, w7
 	mov		v3.16b, v4.16b
-	mov		v1.s[3], v8.s[0]
+	rev		w8, w8
-	mov		v2.s[3], v8.s[1]
+	mov		v1.s[3], w7
-	mov		v3.s[3], v8.s[2]
+	rev		w9, w9
 	mov		v2.s[3], w8
 	mov		v3.s[3], w9
 	ld1		{v5.16b-v7.16b}, [x20], #48	/* get 3 input blocks */
 	bl		aes_encrypt_block4x
 	eor		v0.16b, v5.16b, v0.16b