diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
index 31782b66f9..bc8d842238 100644
--- a/sysdeps/aarch64/cpu-features.h
+++ b/sysdeps/aarch64/cpu-features.h
@@ -1,6 +1,7 @@
 /* Initialize CPU feature data.  AArch64 version.
    This file is part of the GNU C Library.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -56,6 +57,11 @@
 #define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F'			      \
 			&& MIDR_PARTNUM(midr) == 0x001)
 
+#define IS_ORYON1(midr) (MIDR_IMPLEMENTOR(midr) == 'Q'			      \
+			 && (MIDR_PARTNUM(midr) == 0x001		      \
+			     || (MIDR_PARTNUM(midr) == 0x002		      \
+				 && MIDR_VARIANT(midr) == 0)))
+
 struct cpu_features
 {
   uint64_t midr_el1;
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b7468..ef5ea9ab8c 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -5,6 +5,7 @@ sysdep_routines += \
   memcpy_a64fx \
   memcpy_generic \
   memcpy_mops \
+  memcpy_oryon1 \
   memcpy_sve \
   memcpy_thunderx \
   memcpy_thunderx2 \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index ecd0f87de6..65c56b9b41 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,6 @@
 /* Enumerate available IFUNC implementations of a function.  AARCH64 version.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -35,6 +36,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c.  */
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)
 	      IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx)
@@ -44,6 +46,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1)
 	      IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index ce53567dab..15c954778b 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -1,5 +1,6 @@
 /* Multiple versions of memcpy.  AARCH64 version.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -34,6 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_oryon1 attribute_hidden;
 
 static inline __typeof (__redirect_memcpy) *
 select_memcpy_ifunc (void)
@@ -50,6 +52,9 @@ select_memcpy_ifunc (void)
       return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
     }
 
+  if (IS_ORYON1 (midr))
+    return __memcpy_oryon1;
+
   if (IS_THUNDERX (midr))
     return __memcpy_thunderx;
 
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
new file mode 100644
index 0000000000..4efc43df28
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -0,0 +1,301 @@
+/* An oryon-1 core optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define G_l	count
+#define G_h	dst
+#define tmp1	x14
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled, and large
+   copies of more than 96 bytes which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+   In order to share code with memmove, small and medium copies read all
+   data before writing, allowing any kind of overlap.  So small, medium
+   and large backwards memmoves are handled by falling through into memcpy.
+   Overlapping large forward memmoves use a loop that copies backwards.  */
+
+ENTRY (__memmove_oryon1)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.lo	L(move_long)
+
+	/* Common case falls through into memcpy.  */
+END (__memmove_oryon1)
+
+ENTRY (__memcpy_oryon1)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 6
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 6
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 6
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 6
+L(copy_long):
+
+	/* On oryon-1 cores, large copies are helped by using ldnp/stnp.
+	   This loop is identical to the one below it except that it uses
+	   ldnp/stnp instructions.  For copies of less than 32768 bytes the
+	   ldnp/stnp instructions do not help and cause a slow down, so only
+	   use the ldnp/stnp loop for the largest sizes.  */
+
+	cmp	count, #32768
+	b.lo	L(copy_long_without_nontemp)
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldnp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldnp	A_l, A_h, [src, 16]
+	stnp	D_l, D_h, [dstin]
+	ldnp	B_l, B_h, [src, 32]
+	ldnp	C_l, C_h, [src, 48]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+
+L(nontemp_loop64):
+	tbz	src, #6, 1f
+1:
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [src, 16]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [src, 32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [src, 48]
+	stnp	D_l, D_h, [dst, 64]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hi	L(nontemp_loop64)
+	b	L(last64)
+
+L(copy_long_without_nontemp):
+
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+L(last64):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 6
+L(move_long):
+	cbz	tmp1, 3f
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+
+	nop
+1:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
+END (__memcpy_oryon1)
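
For reference, the sketch below re-derives the MIDR_EL1 check that drives the new ifunc selection: IS_ORYON1 accepts implementer 'Q' (0x51, Qualcomm) with part number 0x001, or part number 0x002 with variant 0, and select_memcpy_ifunc then returns __memcpy_oryon1 on a match.  It is a minimal standalone C sketch, not part of the patch: the field extractors are taken from the architectural MIDR_EL1 layout (implementer bits [31:24], variant [23:20], part number [15:4]) rather than from glibc's headers, and the sysfs path used to read the register is an assumption about typical arm64 Linux systems.

/* Illustrative sketch only; not part of the patch.  Mirrors the
   IS_ORYON1 condition using the architectural MIDR_EL1 field layout.  */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* MIDR_EL1 fields: implementer [31:24], variant [23:20], part [15:4].  */
#define MIDR_IMPLEMENTOR(midr)	(((midr) >> 24) & 0xff)
#define MIDR_VARIANT(midr)	(((midr) >> 20) & 0xf)
#define MIDR_PARTNUM(midr)	(((midr) >> 4) & 0xfff)

/* Same condition as IS_ORYON1 in the patch: implementer 'Q' (0x51,
   Qualcomm) and part 0x001, or part 0x002 with variant 0.  */
static int
is_oryon1 (uint64_t midr)
{
  return MIDR_IMPLEMENTOR (midr) == 'Q'
	 && (MIDR_PARTNUM (midr) == 0x001
	     || (MIDR_PARTNUM (midr) == 0x002
		 && MIDR_VARIANT (midr) == 0));
}

int
main (void)
{
  /* Assumed path: arm64 Linux usually exposes MIDR_EL1 per CPU here.  */
  const char *path
    = "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1";
  FILE *f = fopen (path, "r");
  if (f == NULL)
    {
      perror (path);
      return 1;
    }

  uint64_t midr = 0;
  int ok = fscanf (f, "%" SCNx64, &midr) == 1;
  fclose (f);
  if (!ok)
    {
      fprintf (stderr, "could not parse %s\n", path);
      return 1;
    }

  printf ("MIDR_EL1 = 0x%016" PRIx64 ": %s\n", midr,
	  is_oryon1 (midr)
	  ? "oryon-1 match; __memcpy_oryon1/__memmove_oryon1 would be used"
	  : "no oryon-1 match; another memcpy ifunc would be selected");
  return 0;
}

Compiled with e.g. cc -O2 check_oryon1.c on an AArch64 Linux machine, this prints whether cpu0's MIDR would satisfy the new check; the file name check_oryon1.c is hypothetical and used only for illustration.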