diff --git a/sysdeps/aarch64/fpu/advsimd_utils.h b/sysdeps/aarch64/fpu/advsimd_utils.h
deleted file mode 100644
index 8a0fcc0e06..0000000000
--- a/sysdeps/aarch64/fpu/advsimd_utils.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Helpers for Advanced SIMD vector math functions.
-
- Copyright (C) 2023 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- . */
-
-#include
-
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-
-#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
-#define V_NAME_D1(fun) _ZGVnN2v_##fun
-#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
-#define V_NAME_D2(fun) _ZGVnN2vv_##fun
-
-static __always_inline float32x4_t
-v_call_f32 (float (*f) (float), float32x4_t x)
-{
- return (float32x4_t){ f (x[0]), f (x[1]), f (x[2]), f (x[3]) };
-}
-
-static __always_inline float64x2_t
-v_call_f64 (double (*f) (double), float64x2_t x)
-{
- return (float64x2_t){ f (x[0]), f (x[1]) };
-}
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
index 40831e6b0d..e8f1d10540 100644
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
@@ -17,13 +17,83 @@
License along with the GNU C Library; if not, see
. */
-#include
+#include "v_math.h"
-#include "advsimd_utils.h"
-
-VPCS_ATTR
-float64x2_t
-V_NAME_D1 (cos) (float64x2_t x)
+static const struct data
{
- return v_call_f64 (cos, x);
+ float64x2_t poly[7];
+ float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .half_pi = V2 (0x1.921fb54442d18p+0),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .shift = V2 (0x1.8p52),
+ .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f64 (x);
+ cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+ vreinterpretq_u64_f64 (d->range_val));
+ if (__glibc_unlikely (v_any_u64 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+ cmp = vcageq_f64 (d->range_val, x);
+ cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+ n = vsubq_f64 (n, d->shift);
+ n = vsubq_f64 (n, v_f64 (0.5));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (__glibc_unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
diff --git a/sysdeps/aarch64/fpu/cos_sve.c b/sysdeps/aarch64/fpu/cos_sve.c
index 55501e5000..d7a9b134da 100644
--- a/sysdeps/aarch64/fpu/cos_sve.c
+++ b/sysdeps/aarch64/fpu/cos_sve.c
@@ -17,12 +17,76 @@
License along with the GNU C Library; if not, see
. */
-#include
+#include "sv_math.h"
-#include "sve_utils.h"
-
-svfloat64_t
-SV_NAME_D1 (cos) (svfloat64_t x, svbool_t pg)
+static const struct data
{
- return sv_call_f64 (cos, x, svdup_n_f64 (0), pg);
+ double inv_pio2, pio2_1, pio2_2, pio2_3, shift;
+} data = {
+ /* Polynomial coefficients are hardwired in FTMAD instructions. */
+ .inv_pio2 = 0x1.45f306dc9c882p-1,
+ .pio2_1 = 0x1.921fb50000000p+0,
+ .pio2_2 = 0x1.110b460000000p-26,
+ .pio2_3 = 0x1.1a62633145c07p-54,
+ /* Original shift used in AdvSIMD cos,
+ plus a contribution to set the bit #0 of q
+ as expected by trigonometric instructions. */
+ .shift = 0x1.8000000000001p52
+};
+
+#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t out_of_bounds)
+{
+ return sv_call_f64 (cos, x, y, out_of_bounds);
+}
+
+/* A fast SVE implementation of cos based on trigonometric
+ instructions (FTMAD, FTSSEL, FTSMUL).
+ Maximum measured error: 2.108 ULPs.
+ SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3
+ want -0x1.fddd4c65c7f05p-3. */
+svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t r = svabs_f64_x (pg, x);
+ svbool_t out_of_bounds
+ = svcmpge_n_u64 (pg, svreinterpret_u64_f64 (r), RangeVal);
+
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svbool_t ptrue = svptrue_b64 ();
+ svfloat64_t invpio2_and_pio2_1 = svld1rq_f64 (ptrue, &d->inv_pio2);
+ svfloat64_t pio2_23 = svld1rq_f64 (ptrue, &d->pio2_2);
+
+ /* n = rint(|x|/(pi/2)). */
+ svfloat64_t q = svmla_lane_f64 (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0);
+ svfloat64_t n = svsub_n_f64_x (pg, q, d->shift);
+
+ /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
+ r = svmls_lane_f64 (r, n, invpio2_and_pio2_1, 1);
+ r = svmls_lane_f64 (r, n, pio2_23, 0);
+ r = svmls_lane_f64 (r, n, pio2_23, 1);
+
+ /* cos(r) poly approx. */
+ svfloat64_t r2 = svtsmul_f64 (r, svreinterpret_u64_f64 (q));
+ svfloat64_t y = sv_f64 (0.0);
+ y = svtmad_f64 (y, r2, 7);
+ y = svtmad_f64 (y, r2, 6);
+ y = svtmad_f64 (y, r2, 5);
+ y = svtmad_f64 (y, r2, 4);
+ y = svtmad_f64 (y, r2, 3);
+ y = svtmad_f64 (y, r2, 2);
+ y = svtmad_f64 (y, r2, 1);
+ y = svtmad_f64 (y, r2, 0);
+
+ /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+ svfloat64_t f = svtssel_f64 (r, svreinterpret_u64_f64 (q));
+ /* Apply factor. */
+ y = svmul_f64_x (pg, f, y);
+
+ if (__glibc_unlikely (svptest_any (pg, out_of_bounds)))
+ return special_case (x, y, out_of_bounds);
+ return y;
}
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index 35bb81aead..f05dd2bcda 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -17,13 +17,78 @@
License along with the GNU C Library; if not, see
. */
-#include
+#include "v_math.h"
-#include "advsimd_utils.h"
-
-VPCS_ATTR
-float32x4_t
-V_NAME_F1 (cos) (float32x4_t x)
+static const struct data
{
- return v_call_f32 (cosf, x);
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .shift = V4 (0x1.8p+23f),
+ .half_pi = V4 (0x1.921fb6p0f),
+ .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, r3, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f32 (x);
+ cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+ vreinterpretq_u32_f32 (d->range_val));
+ if (__glibc_unlikely (v_any_u32 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+ cmp = vcageq_f32 (d->range_val, x);
+ cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+ n = vsubq_f32 (n, d->shift);
+ n = vsubq_f32 (n, v_f32 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ r3 = vmulq_f32 (r2, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, y, r3);
+
+ if (__glibc_unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
diff --git a/sysdeps/aarch64/fpu/cosf_sve.c b/sysdeps/aarch64/fpu/cosf_sve.c
index 16c68f387b..577cbd864e 100644
--- a/sysdeps/aarch64/fpu/cosf_sve.c
+++ b/sysdeps/aarch64/fpu/cosf_sve.c
@@ -17,12 +17,74 @@
License along with the GNU C Library; if not, see
. */
-#include
+#include "sv_math.h"
-#include "sve_utils.h"
-
-svfloat32_t
-SV_NAME_F1 (cos) (svfloat32_t x, svbool_t pg)
+static const struct data
{
- return sv_call_f32 (cosf, x, svdup_n_f32 (0), pg);
+ float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift;
+} data = {
+ /* Polynomial coefficients are hard-wired in FTMAD instructions. */
+ .neg_pio2_1 = -0x1.921fb6p+0f,
+ .neg_pio2_2 = 0x1.777a5cp-25f,
+ .neg_pio2_3 = 0x1.ee59dap-50f,
+ .inv_pio2 = 0x1.45f306p-1f,
+ /* Original shift used in AdvSIMD cosf,
+ plus a contribution to set the bit #0 of q
+ as expected by trigonometric instructions. */
+ .shift = 0x1.800002p+23f
+};
+
+#define RangeVal 0x49800000 /* asuint32(0x1p20f). */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t out_of_bounds)
+{
+ return sv_call_f32 (cosf, x, y, out_of_bounds);
+}
+
+/* A fast SVE implementation of cosf based on trigonometric
+ instructions (FTMAD, FTSSEL, FTSMUL).
+ Maximum measured error: 2.06 ULPs.
+ SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+ want 0x1.fffe76p-6. */
+svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t r = svabs_f32_x (pg, x);
+ svbool_t out_of_bounds
+ = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (r), RangeVal);
+
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svfloat32_t negpio2_and_invpio2
+ = svld1rq_f32 (svptrue_b32 (), &d->neg_pio2_1);
+
+ /* n = rint(|x|/(pi/2)). */
+ svfloat32_t q
+ = svmla_lane_f32 (sv_f32 (d->shift), r, negpio2_and_invpio2, 3);
+ svfloat32_t n = svsub_n_f32_x (pg, q, d->shift);
+
+ /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
+ r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 0);
+ r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 1);
+ r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 2);
+
+ /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
+ svfloat32_t f = svtssel_f32 (r, svreinterpret_u32_f32 (q));
+
+ /* cos(r) poly approx. */
+ svfloat32_t r2 = svtsmul_f32 (r, svreinterpret_u32_f32 (q));
+ svfloat32_t y = sv_f32 (0.0f);
+ y = svtmad_f32 (y, r2, 4);
+ y = svtmad_f32 (y, r2, 3);
+ y = svtmad_f32 (y, r2, 2);
+ y = svtmad_f32 (y, r2, 1);
+ y = svtmad_f32 (y, r2, 0);
+
+ /* Apply factor. */
+ y = svmul_f32_x (pg, f, y);
+
+ if (__glibc_unlikely (svptest_any (pg, out_of_bounds)))
+ return special_case (x, y, out_of_bounds);
+ return y;
}
diff --git a/sysdeps/aarch64/fpu/sv_math.h b/sysdeps/aarch64/fpu/sv_math.h
new file mode 100644
index 0000000000..fc8e690e80
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_math.h
@@ -0,0 +1,141 @@
+/* Utilities for SVE libmvec routines.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ . */
+
+#ifndef SV_MATH_H
+#define SV_MATH_H
+
+#include
+#include
+
+#include "vecmath_config.h"
+
+#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
+#define SV_NAME_D1(fun) _ZGVsMxv_##fun
+#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
+#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
+
+/* Double precision. */
+static inline svint64_t
+sv_s64 (int64_t x)
+{
+ return svdup_n_s64 (x);
+}
+
+static inline svuint64_t
+sv_u64 (uint64_t x)
+{
+ return svdup_n_u64 (x);
+}
+
+static inline svfloat64_t
+sv_f64 (double x)
+{
+ return svdup_n_f64 (x);
+}
+
+static inline svfloat64_t
+sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ double elem = svclastb_n_f64 (p, 0, x);
+ elem = (*f) (elem);
+ svfloat64_t y2 = svdup_n_f64 (elem);
+ y = svsel_f64 (p, y2, y);
+ p = svpnext_b64 (cmp, p);
+ }
+ return y;
+}
+
+static inline svfloat64_t
+sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2,
+ svfloat64_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ double elem1 = svclastb_n_f64 (p, 0, x1);
+ double elem2 = svclastb_n_f64 (p, 0, x2);
+ double ret = (*f) (elem1, elem2);
+ svfloat64_t y2 = svdup_n_f64 (ret);
+ y = svsel_f64 (p, y2, y);
+ p = svpnext_b64 (cmp, p);
+ }
+ return y;
+}
+
+static inline svuint64_t
+sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y)
+{
+ svuint64_t q = svdiv_n_u64_x (pg, x, y);
+ return svmls_n_u64_x (pg, x, q, y);
+}
+
+/* Single precision. */
+static inline svint32_t
+sv_s32 (int32_t x)
+{
+ return svdup_n_s32 (x);
+}
+
+static inline svuint32_t
+sv_u32 (uint32_t x)
+{
+ return svdup_n_u32 (x);
+}
+
+static inline svfloat32_t
+sv_f32 (float x)
+{
+ return svdup_n_f32 (x);
+}
+
+static inline svfloat32_t
+sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ float elem = svclastb_n_f32 (p, 0, x);
+ elem = f (elem);
+ svfloat32_t y2 = svdup_n_f32 (elem);
+ y = svsel_f32 (p, y2, y);
+ p = svpnext_b32 (cmp, p);
+ }
+ return y;
+}
+
+static inline svfloat32_t
+sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2,
+ svfloat32_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ float elem1 = svclastb_n_f32 (p, 0, x1);
+ float elem2 = svclastb_n_f32 (p, 0, x2);
+ float ret = f (elem1, elem2);
+ svfloat32_t y2 = svdup_n_f32 (ret);
+ y = svsel_f32 (p, y2, y);
+ p = svpnext_b32 (cmp, p);
+ }
+ return y;
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/sve_utils.h b/sysdeps/aarch64/fpu/sve_utils.h
deleted file mode 100644
index 5ce3d2e8d6..0000000000
--- a/sysdeps/aarch64/fpu/sve_utils.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Helpers for SVE vector math functions.
-
- Copyright (C) 2023 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- . */
-
-#include
-
-#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
-#define SV_NAME_D1(fun) _ZGVsMxv_##fun
-#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
-#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
-
-static __always_inline svfloat32_t
-sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
-{
- svbool_t p = svpfirst (cmp, svpfalse ());
- while (svptest_any (cmp, p))
- {
- float elem = svclastb_n_f32 (p, 0, x);
- elem = (*f) (elem);
- svfloat32_t y2 = svdup_n_f32 (elem);
- y = svsel_f32 (p, y2, y);
- p = svpnext_b32 (cmp, p);
- }
- return y;
-}
-
-static __always_inline svfloat64_t
-sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp)
-{
- svbool_t p = svpfirst (cmp, svpfalse ());
- while (svptest_any (cmp, p))
- {
- double elem = svclastb_n_f64 (p, 0, x);
- elem = (*f) (elem);
- svfloat64_t y2 = svdup_n_f64 (elem);
- y = svsel_f64 (p, y2, y);
- p = svpnext_b64 (cmp, p);
- }
- return y;
-}
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
new file mode 100644
index 0000000000..43efd8f99d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -0,0 +1,145 @@
+/* Utilities for Advanced SIMD libmvec routines.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ . */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#include
+#include "vecmath_config.h"
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+
+/* Shorthand helpers for declaring constants. */
+#define V2(x) \
+ { \
+ x, x \
+ }
+
+#define V4(x) \
+ { \
+ x, x, x, x \
+ }
+
+static inline float32x4_t
+v_f32 (float x)
+{
+ return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+ return (uint32x4_t) V4 (x);
+}
+static inline int32x4_t
+v_s32 (int32_t x)
+{
+ return (int32x4_t) V4 (x);
+}
+
+/* true if any elements of a vector compare result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+ return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+ return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
+}
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+ float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+ p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2],
+ p[3] ? f (x1[3], x2[3]) : y[3] };
+}
+
+static inline float64x2_t
+v_f64 (double x)
+{
+ return (float64x2_t) V2 (x);
+}
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+ return (uint64x2_t) V2 (x);
+}
+static inline int64x2_t
+v_s64 (int64_t x)
+{
+ return (int64x2_t) V2 (x);
+}
+
+/* true if any elements of a vector compare result is non-zero. */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (x) != 0;
+}
+/* true if all elements of a vector compare result is 1. */
+static inline int
+v_all_u64 (uint64x2_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
+}
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
+{
+ return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
+{
+ return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
+}
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
+{
+ return (float64x2_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1] };
+}
+static inline float64x2_t
+v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
+ float64x2_t y, uint64x2_t p)
+{
+ return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+ p[1] ? f (x1[1], x2[1]) : y[1] };
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
new file mode 100644
index 0000000000..d0bdbb4ae8
--- /dev/null
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -0,0 +1,38 @@
+/* Configuration for libmvec routines.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ . */
+
+#ifndef _VECMATH_CONFIG_H
+#define _VECMATH_CONFIG_H
+
+#include
+
+/* Deprecated config option from Arm Optimized Routines which ensures
+ fp exceptions are correctly triggered. This is not intended to be
+ supported in GLIBC, however we keep it for ease of development. */
+#define WANT_SIMD_EXCEPT 0
+
+/* Return ptr but hide its value from the compiler so accesses through it
+ cannot be optimized based on the contents. */
+#define ptr_barrier(ptr) \
+ ({ \
+ __typeof (ptr) __ptr = (ptr); \
+ __asm("" : "+r"(__ptr)); \
+ __ptr; \
+ })
+
+#endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index da7c64942c..07da4ab843 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -642,7 +642,7 @@ float: 1
ldouble: 2
Function: "cos_advsimd":
-double: 1
+double: 2
float: 1
Function: "cos_downward":
diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
index 98b54875f5..3865dcf905 100644
--- a/sysdeps/generic/math_private.h
+++ b/sysdeps/generic/math_private.h
@@ -193,7 +193,7 @@ do { \
# undef _Mdouble_
#endif
-
+#define NOINLINE __attribute__ ((noinline))
/* Prototypes for functions of the IBM Accurate Mathematical Library. */
extern double __sin (double __x);
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 499ca7591b..19af33fd86 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -148,9 +148,6 @@ make_double (uint64_t x, int64_t ep, uint64_t s)
return asdouble (s + x + (ep << MANTISSA_WIDTH));
}
-
-#define NOINLINE __attribute__ ((noinline))
-
/* Error handling tail calls for special cases, with a sign argument.
The sign of the return value is set if the argument is non-zero. */
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index a7fb332767..d1b06a1a90 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -151,8 +151,6 @@ make_float (uint32_t x, int ep, uint32_t s)
return asfloat (s + x + (ep << MANTISSA_WIDTH));
}
-#define NOINLINE __attribute__ ((noinline))
-
attribute_hidden float __math_oflowf (uint32_t);
attribute_hidden float __math_uflowf (uint32_t);
attribute_hidden float __math_may_uflowf (uint32_t);