diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S index 4efc43df28..6cae97dc96 100644 --- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S @@ -160,46 +160,6 @@ L(copy96): .p2align 6 L(copy_long): - /* On oryon1 cores, large memcpy's are helped by using ldnp/stnp. - This loop is identical to the one below it but using ldnp/stnp - instructions. For loops that are less than 32768 bytes, - the ldnp/stnp instructions will not help and will cause a slow - down so only use the ldnp/stnp loop for the largest sizes. */ - - cmp count, #32768 - b.lo L(copy_long_without_nontemp) - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldnp D_l, D_h, [src] - sub src, src, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldnp A_l, A_h, [src, 16] - stnp D_l, D_h, [dstin] - ldnp B_l, B_h, [src, 32] - ldnp C_l, C_h, [src, 48] - ldnp D_l, D_h, [src, 64] - add src, src, #64 - subs count, count, 128 + 16 /* Test and readjust count. */ - -L(nontemp_loop64): - tbz src, #6, 1f -1: - stnp A_l, A_h, [dst, 16] - ldnp A_l, A_h, [src, 16] - stnp B_l, B_h, [dst, 32] - ldnp B_l, B_h, [src, 32] - stnp C_l, C_h, [dst, 48] - ldnp C_l, C_h, [src, 48] - stnp D_l, D_h, [dst, 64] - ldnp D_l, D_h, [src, 64] - add src, src, #64 - add dst, dst, #64 - subs count, count, 64 - b.hi L(nontemp_loop64) - b L(last64) - -L(copy_long_without_nontemp): - and tmp1, dstin, 15 bic dst, dstin, 15 ldp D_l, D_h, [src]