diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile index 7b87bc9055..216886c551 100644 --- a/sysdeps/loongarch/lp64/multiarch/Makefile +++ b/sysdeps/loongarch/lp64/multiarch/Makefile @@ -30,5 +30,9 @@ sysdep_routines += \ memrchr-generic \ memrchr-lsx \ memrchr-lasx \ + memset-aligned \ + memset-unaligned \ + memset-lsx \ + memset-lasx \ # sysdep_routines endif diff --git a/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h new file mode 100644 index 0000000000..e2723873bc --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h @@ -0,0 +1,24 @@ +/* Symbol redirection for loader/static initialization code. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>.
*/ + +#ifndef _DL_IFUNC_GENERIC_H +#define _DL_IFUNC_GENERIC_H + +asm ("memset = __memset_aligned"); /* Assembler-level symbol assignment: memset is set equal to __memset_aligned. */ + +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c index 8bd5489ee2..37f60dde91 100644 --- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c @@ -117,5 +117,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #endif IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic) ) + + IFUNC_IMPL (i, name, memset, /* Candidate memset variants; the two vector entries are compiled out when __loongarch_soft_float is defined. */ +#if !defined __loongarch_soft_float + IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx) + IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx) +#endif + IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned) /* The condition for this entry is the constant 1. */ + ) + return i; } diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S new file mode 100644 index 0000000000..1fce95b714 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S @@ -0,0 +1,174 @@ +/* Optimized memset aligned implementation using basic LoongArch instructions. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>.
*/ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) +# define MEMSET_NAME __memset_aligned +#else +# define MEMSET_NAME memset +#endif + +LEAF(MEMSET_NAME, 6) + move t0, a0 /* t0 is the write cursor; a0 itself is never written in this routine. */ + andi a3, a0, 0x7 /* a3 = low three bits of the destination address. */ + li.w t6, 16 + beqz a3, L(align) + bltu a2, t6, L(short_data) /* Destination not 8-byte aligned and length below 16: byte stores only. */ + +L(make_align): + li.w t8, 8 + sub.d t2, t8, a3 /* t2 = bytes needed to reach the next 8-byte boundary (1..7). */ + pcaddi t1, 11 /* t1 = address of L(al0), which is 11 instructions after this one. */ + slli.d t3, t2, 2 + sub.d t1, t1, t3 /* Step back one 4-byte st.b instruction per byte to store. */ + jr t1 /* Computed jump: do not add or remove instructions between pcaddi and L(al0). */ + +L(al7): + st.b a1, t0, 6 +L(al6): + st.b a1, t0, 5 +L(al5): + st.b a1, t0, 4 +L(al4): + st.b a1, t0, 3 +L(al3): + st.b a1, t0, 2 +L(al2): + st.b a1, t0, 1 +L(al1): + st.b a1, t0, 0 +L(al0): + add.d t0, t0, t2 + sub.d a2, a2, t2 + +L(align): + bstrins.d a1, a1, 15, 8 + bstrins.d a1, a1, 31, 16 + bstrins.d a1, a1, 63, 32 /* After these three inserts the low byte of a1 is replicated into all 8 bytes. */ + bltu a2, t6, L(less_16bytes) + + andi a4, a2, 0x3f /* a4 = length modulo 64. */ + beq a4, a2, L(less_64bytes) /* Length below 64: skip the 64-byte loop. */ + + sub.d t1, a2, a4 + move a2, a4 + add.d a5, t0, t1 /* a5 = address at which the 64-byte loop stops. */ + +L(loop_64bytes): + addi.d t0, t0, 64 + st.d a1, t0, -64 + st.d a1, t0, -56 + st.d a1, t0, -48 + st.d a1, t0, -40 + + st.d a1, t0, -32 + st.d a1, t0, -24 + st.d a1, t0, -16 + st.d a1, t0, -8 + bne t0, a5, L(loop_64bytes) + +L(less_64bytes): + srai.d a4, a2, 5 + beqz a4, L(less_32bytes) + addi.d a2, a2, -32 + st.d a1, t0, 0 + + st.d a1, t0, 8 + st.d a1, t0, 16 + st.d a1, t0, 24 + addi.d t0, t0, 32 + +L(less_32bytes): + bltu a2, t6, L(less_16bytes) + addi.d a2, a2, -16 + st.d a1, t0, 0 + st.d a1, t0, 8 + addi.d t0, t0, 16 + +L(less_16bytes): + srai.d a4, a2, 3 + beqz a4, L(less_8bytes) + addi.d a2, a2, -8 + st.d a1, t0, 0 + addi.d t0, t0, 8 + +L(less_8bytes): + beqz a2, L(less_1byte) + srai.d a4, a2, 2 + beqz a4, L(less_4bytes) + addi.d a2, a2, -4 + st.w a1, t0, 0 + addi.d t0, t0, 4 + +L(less_4bytes): + srai.d a3, a2, 1 + beqz a3, L(less_2bytes) + addi.d a2, a2, -2 + st.h a1, t0, 0 + addi.d t0, t0, 2 + +L(less_2bytes): + beqz a2, L(less_1byte) + st.b a1, t0, 0 +L(less_1byte): + jr ra + +L(short_data): + pcaddi t1, 19 /* t1 = address of L(short_0), which is 19 instructions after this one. */ + slli.d t3, a2, 2 + sub.d t1, t1, t3 /* Step back one 4-byte st.b instruction per byte of length. */ + jr t1 /* Computed jump into the byte-store ladder below. */ +L(short_15): + st.b a1, a0, 14 +L(short_14): +
st.b a1, a0, 13 +L(short_13): + st.b a1, a0, 12 +L(short_12): + st.b a1, a0, 11 +L(short_11): + st.b a1, a0, 10 +L(short_10): + st.b a1, a0, 9 +L(short_9): + st.b a1, a0, 8 +L(short_8): + st.b a1, a0, 7 +L(short_7): + st.b a1, a0, 6 +L(short_6): + st.b a1, a0, 5 +L(short_5): + st.b a1, a0, 4 +L(short_4): + st.b a1, a0, 3 +L(short_3): + st.b a1, a0, 2 +L(short_2): + st.b a1, a0, 1 +L(short_1): + st.b a1, a0, 0 +L(short_0): + jr ra +END(MEMSET_NAME) + +libc_hidden_builtin_def (MEMSET_NAME) diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S new file mode 100644 index 0000000000..041abbac87 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S @@ -0,0 +1,142 @@ +/* Optimized memset implementation using LoongArch LASX instructions. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>.
*/ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) && !defined __loongarch_soft_float + +# define MEMSET __memset_lasx + +LEAF(MEMSET, 6) + li.d t1, 32 + move a3, a0 /* a3 is the write cursor; a0 is never written in this routine. */ + xvreplgr2vr.b xr0, a1 /* Replicate the fill byte from a1 into xr0. */ + add.d a4, a0, a2 /* a4 = one past the last byte to set. */ + + bgeu t1, a2, L(less_32bytes) + li.d t3, 128 + li.d t2, 64 + blt t3, a2, L(long_bytes) /* NOTE(review): signed compare here, whereas the LSX variant selects its long path with unsigned bgeu - confirm this is intended. */ + +L(less_128bytes): + bgeu t2, a2, L(less_64bytes) + xvst xr0, a3, 0 /* 65..128 bytes: two 32-byte stores from each end; they may overlap in the middle. */ + xvst xr0, a3, 32 + xvst xr0, a4, -32 + + xvst xr0, a4, -64 + jr ra +L(less_64bytes): + xvst xr0, a3, 0 /* 33..64 bytes: one 32-byte store from each end. */ + xvst xr0, a4, -32 + + + jr ra +L(less_32bytes): + srli.d t0, a2, 4 + beqz t0, L(less_16bytes) + vst vr0, a3, 0 /* 16..32 bytes: one 16-byte store from each end. */ + + vst vr0, a4, -16 + jr ra +L(less_16bytes): + srli.d t0, a2, 3 + beqz t0, L(less_8bytes) + + vstelm.d vr0, a3, 0, 0 + vstelm.d vr0, a4, -8, 0 + jr ra +L(less_8bytes): + srli.d t0, a2, 2 + + beqz t0, L(less_4bytes) + vstelm.w vr0, a3, 0, 0 + vstelm.w vr0, a4, -4, 0 + jr ra + + +L(less_4bytes): + srli.d t0, a2, 1 + beqz t0, L(less_2bytes) + vstelm.h vr0, a3, 0, 0 + vstelm.h vr0, a4, -2, 0 + + jr ra +L(less_2bytes): + beqz a2, L(less_1bytes) + st.b a1, a3, 0 +L(less_1bytes): + jr ra + +L(long_bytes): + xvst xr0, a3, 0 /* Head store at the original, possibly unaligned, destination. */ + bstrins.d a3, zero, 4, 0 /* Clear the low five address bits. */ + addi.d a3, a3, 32 /* a3 = next 32-byte boundary above the destination. */ + sub.d a2, a4, a3 /* a2 = bytes left between a3 and the end. */ + + andi t0, a2, 0xff /* t0 = remaining length modulo 256. */ + beq t0, a2, L(long_end) /* Fewer than 256 bytes remain: skip the loop. */ + move a2, t0 + sub.d t0, a4, t0 /* t0 = address at which the 256-byte loop stops. */ + + +L(loop_256): + xvst xr0, a3, 0 + xvst xr0, a3, 32 + xvst xr0, a3, 64 + xvst xr0, a3, 96 + + xvst xr0, a3, 128 + xvst xr0, a3, 160 + xvst xr0, a3, 192 + xvst xr0, a3, 224 + + addi.d a3, a3, 256 + bne a3, t0, L(loop_256) +L(long_end): + bltu a2, t3, L(end_less_128) + addi.d a2, a2, -128 + + xvst xr0, a3, 0 + xvst xr0, a3, 32 + xvst xr0, a3, 64 + xvst xr0, a3, 96 + + + addi.d a3, a3, 128 +L(end_less_128): + bltu a2, t2, L(end_less_64) + addi.d a2, a2, -64 + xvst xr0, a3, 0 + + xvst xr0, a3, 32 + addi.d a3, a3, 64 +L(end_less_64): + bltu a2, t1, L(end_less_32) + xvst xr0, a3, 0 + +L(end_less_32): + xvst xr0, a4, -32 /* Final store is anchored at the end pointer, so it covers whatever tail is left. */ + jr ra +END(MEMSET) + +libc_hidden_builtin_def (MEMSET) +#endif diff
--git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S new file mode 100644 index 0000000000..3d3982aa5a --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S @@ -0,0 +1,135 @@ +/* Optimized memset implementation using LoongArch LSX instructions. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>.
*/ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) && !defined __loongarch_soft_float + +# define MEMSET __memset_lsx + +LEAF(MEMSET, 6) + li.d t1, 16 + move a3, a0 /* a3 is the write cursor; a0 is never written in this routine. */ + vreplgr2vr.b vr0, a1 /* Replicate the fill byte from a1 into vr0. */ + add.d a4, a0, a2 /* a4 = one past the last byte to set. */ + + bgeu t1, a2, L(less_16bytes) + li.d t3, 64 + li.d t2, 32 + bgeu a2, t3, L(long_bytes) /* Unsigned compare: 64 bytes or more take the long path. */ + +L(less_64bytes): + bgeu t2, a2, L(less_32bytes) + vst vr0, a3, 0 /* 33..63 bytes: two 16-byte stores from each end; they may overlap in the middle. */ + vst vr0, a3, 16 + vst vr0, a4, -32 + + vst vr0, a4, -16 + jr ra +L(less_32bytes): + vst vr0, a3, 0 /* 17..32 bytes: one 16-byte store from each end. */ + vst vr0, a4, -16 + + + jr ra +L(less_16bytes): + srli.d t0, a2, 3 + beqz t0, L(less_8bytes) + vstelm.d vr0, a3, 0, 0 + + vstelm.d vr0, a4, -8, 0 + jr ra +L(less_8bytes): + srli.d t0, a2, 2 + beqz t0, L(less_4bytes) + + vstelm.w vr0, a3, 0, 0 + vstelm.w vr0, a4, -4, 0 + jr ra +L(less_4bytes): + srli.d t0, a2, 1 + + beqz t0, L(less_2bytes) + vstelm.h vr0, a3, 0, 0 + vstelm.h vr0, a4, -2, 0 + jr ra + + +L(less_2bytes): + beqz a2, L(less_1bytes) + vstelm.b vr0, a3, 0, 0 +L(less_1bytes): + jr ra +L(long_bytes): + vst vr0, a3, 0 /* Head store at the original, possibly unaligned, destination. */ + + bstrins.d a3, zero, 3, 0 /* Clear the low four address bits. */ + addi.d a3, a3, 16 /* a3 = next 16-byte boundary above the destination. */ + sub.d a2, a4, a3 /* a2 = bytes left between a3 and the end. */ + andi t0, a2, 0x7f /* t0 = remaining length modulo 128. */ + + beq t0, a2, L(long_end) /* Fewer than 128 bytes remain: skip the loop. */ + move a2, t0 + sub.d t0, a4, t0 /* t0 = address at which the 128-byte loop stops. */ + +L(loop_128): + vst vr0, a3, 0 + + vst vr0, a3, 16 + vst vr0, a3, 32 + vst vr0, a3, 48 + vst vr0, a3, 64 + + + vst vr0, a3, 80 + vst vr0, a3, 96 + vst vr0, a3, 112 + addi.d a3, a3, 128 + + bne a3, t0, L(loop_128) +L(long_end): + bltu a2, t3, L(end_less_64) + addi.d a2, a2, -64 + vst vr0, a3, 0 + + vst vr0, a3, 16 + vst vr0, a3, 32 + vst vr0, a3, 48 + addi.d a3, a3, 64 + +L(end_less_64): + bltu a2, t2, L(end_less_32) + addi.d a2, a2, -32 + vst vr0, a3, 0 + vst vr0, a3, 16 + + addi.d a3, a3, 32 +L(end_less_32): + bltu a2, t1, L(end_less_16) + vst vr0, a3, 0 + +L(end_less_16): + vst vr0, a4, -16 /* Final store is anchored at the end pointer, so it covers whatever tail is left. */ + jr ra +END(MEMSET) + +libc_hidden_builtin_def (MEMSET) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S new file mode 100644
index 0000000000..f7d32039df --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S @@ -0,0 +1,162 @@ +/* Optimized memset unaligned implementation using basic LoongArch instructions. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) + +# define MEMSET_NAME __memset_unaligned + +#define ST_128(n) \ + st.d a1, a0, n; \ + st.d a1, a0, n+8 ; \ + st.d a1, a0, n+16 ; \ + st.d a1, a0, n+24 ; \ + st.d a1, a0, n+32 ; \ + st.d a1, a0, n+40 ; \ + st.d a1, a0, n+48 ; \ + st.d a1, a0, n+56 ; \ + st.d a1, a0, n+64 ; \ + st.d a1, a0, n+72 ; \ + st.d a1, a0, n+80 ; \ + st.d a1, a0, n+88 ; \ + st.d a1, a0, n+96 ; \ + st.d a1, a0, n+104; \ + st.d a1, a0, n+112; \ + st.d a1, a0, n+120; + +LEAF(MEMSET_NAME, 6) + bstrins.d a1, a1, 15, 8 + add.d t7, a0, a2 /* t7 = one past the last byte to set. */ + bstrins.d a1, a1, 31, 16 + move t0, a0 /* t0 keeps the original destination; a0 is modified on the 64-byte-and-up path. */ + + bstrins.d a1, a1, 63, 32 /* With the two inserts above, the low byte of a1 now fills all 8 bytes. */ + srai.d t8, a2, 4 + beqz t8, L(less_16bytes) + srai.d t8, a2, 6 + + bnez t8, L(more_64bytes) + srai.d t8, a2, 5 + beqz t8, L(less_32bytes) + + st.d a1, a0, 0 /* 32..63 bytes: 32 bytes from each end; the two halves may overlap. */ + st.d a1, a0, 8 + st.d a1, a0, 16 + st.d a1, a0, 24 + + st.d a1, t7, -32 + st.d a1, t7, -24 + st.d a1, t7, -16 + st.d a1, t7, -8 + + jr ra + +L(less_32bytes): + st.d a1, a0, 0 /* 16..31 bytes: 16 bytes from each end. */ + st.d a1, a0, 8
+ st.d a1, t7, -16 + st.d a1, t7, -8 + + jr ra + +L(less_16bytes): + srai.d t8, a2, 3 + beqz t8, L(less_8bytes) + st.d a1, a0, 0 + st.d a1, t7, -8 + + jr ra + +L(less_8bytes): + srai.d t8, a2, 2 + beqz t8, L(less_4bytes) + st.w a1, a0, 0 + st.w a1, t7, -4 + + jr ra + +L(less_4bytes): + srai.d t8, a2, 1 + beqz t8, L(less_2bytes) + st.h a1, a0, 0 + st.h a1, t7, -2 + + jr ra + +L(less_2bytes): + beqz a2, L(less_1bytes) + st.b a1, a0, 0 + + jr ra + +L(less_1bytes): + jr ra + +L(more_64bytes): + srli.d a0, a0, 3 + slli.d a0, a0, 3 + addi.d a0, a0, 0x8 /* a0 = next 8-byte boundary strictly above the destination. */ + st.d a1, t0, 0 /* Head store at the original destination covers the bytes below a0. */ + + sub.d t2, t0, a0 + add.d a2, t2, a2 /* a2 = bytes left between a0 and the end. */ + addi.d a2, a2, -0x80 + blt a2, zero, L(end_unalign_proc) + +L(loop_less): + ST_128(0) + addi.d a0, a0, 0x80 + addi.d a2, a2, -0x80 + bge a2, zero, L(loop_less) + +L(end_unalign_proc): + addi.d a2, a2, 0x80 /* Undo the loop bias: a2 is the remaining byte count again. */ + pcaddi t1, 20 /* t1 = address of the final "st.d a1, t7, -8", which is 20 instructions after this one. */ + andi t5, a2, 0x78 /* t5 = remaining bytes rounded down to a multiple of 8. */ + srli.d t5, t5, 1 /* Four bytes of code per eight bytes of data. */ + + sub.d t1, t1, t5 + jr t1 /* Computed jump into the store ladder below; instruction counts must not change. */ + + st.d a1, a0, 112 + st.d a1, a0, 104 + st.d a1, a0, 96 + st.d a1, a0, 88 + st.d a1, a0, 80 + st.d a1, a0, 72 + st.d a1, a0, 64 + st.d a1, a0, 56 + st.d a1, a0, 48 + st.d a1, a0, 40 + st.d a1, a0, 32 + st.d a1, a0, 24 + st.d a1, a0, 16 + st.d a1, a0, 8 + st.d a1, a0, 0 + st.d a1, t7, -8 /* Tail store anchored at the end pointer. */ + + move a0, t0 /* Put the original destination back into a0 before returning. */ + jr ra +END(MEMSET_NAME) + +libc_hidden_builtin_def (MEMSET_NAME) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/memset.c b/sysdeps/loongarch/lp64/multiarch/memset.c new file mode 100644 index 0000000000..3ff60d8ac7 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/memset.c @@ -0,0 +1,37 @@ +/* Multiple versions of memset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define memset __redirect_memset /* While string.h is included, its memset declaration is spelled __redirect_memset. */ +# include <string.h> +# undef memset + +# define SYMBOL_NAME memset +# include "ifunc-lasx.h" /* NOTE(review): presumably supplies IFUNC_SELECTOR based on SYMBOL_NAME - confirm against that header. */ + +libc_ifunc_redirected (__redirect_memset, memset, + IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (memset, __GI_memset, __redirect_memset) + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memset); +# endif + +#endif