From eb5eeb47403e0a91de834868e501b4d62b8d2cb9 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Thu, 14 Nov 2024 19:03:19 -0800 Subject: [PATCH] aarch64: Remove non-temporal load/stores from oryon-1's memcpy The hardware architects have a new recommendation not to use non-temporal load/stores for memcpy. This patch removes this path. I found there was no difference in the memcpy speed with/without non-temporal load/stores either. Signed-off-by: Andrew Pinski Reviewed-by: Adhemerval Zanella --- sysdeps/aarch64/multiarch/memcpy_oryon1.S | 40 ----------------------- 1 file changed, 40 deletions(-) diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S index 4efc43df28..6cae97dc96 100644 --- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S @@ -160,46 +160,6 @@ L(copy96): .p2align 6 L(copy_long): - /* On oryon1 cores, large memcpy's are helped by using ldnp/stnp. - This loop is identical to the one below it but using ldnp/stnp - instructions. For loops that are less than 32768 bytes, - the ldnp/stnp instructions will not help and will cause a slow - down so only use the ldnp/stnp loop for the largest sizes. */ - - cmp count, #32768 - b.lo L(copy_long_without_nontemp) - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldnp D_l, D_h, [src] - sub src, src, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldnp A_l, A_h, [src, 16] - stnp D_l, D_h, [dstin] - ldnp B_l, B_h, [src, 32] - ldnp C_l, C_h, [src, 48] - ldnp D_l, D_h, [src, 64] - add src, src, #64 - subs count, count, 128 + 16 /* Test and readjust count. */ - -L(nontemp_loop64): - tbz src, #6, 1f -1: - stnp A_l, A_h, [dst, 16] - ldnp A_l, A_h, [src, 16] - stnp B_l, B_h, [dst, 32] - ldnp B_l, B_h, [src, 32] - stnp C_l, C_h, [dst, 48] - ldnp C_l, C_h, [src, 48] - stnp D_l, D_h, [dst, 64] - ldnp D_l, D_h, [src, 64] - add src, src, #64 - add dst, dst, #64 - subs count, count, 64 - b.hi L(nontemp_loop64) - b L(last64) - -L(copy_long_without_nontemp): - and tmp1, dstin, 15 bic dst, dstin, 15 ldp D_l, D_h, [src]