aarch64/fpu: Add vector variants of cosh

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
2025-03-06 20:58:33 +01:00 · 2024-02-20 16:59:39 +00:00 · 2024-02-20 16:59:39 +00:00 · bdb5705b7b
commit bdb5705b7b
parent cb5d84f1f8
18 changed files with 648 additions and 1 deletions
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
                          atan \
                          atan2 \
                          cos \
+                          cosh \
                          erf \
                          exp \
                          exp10 \
@ -32,7 +33,8 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
                  erf_data \
                  erff_data \
                  sv_erf_data \
-                  sv_erff_data
+                  sv_erff_data \
+                  v_exp_tail_data
 endif

 sve-cflags = -march=armv8-a+sve
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@ -79,6 +79,11 @@ libmvec {
    _ZGVsMxv_tan;
  }
  GLIBC_2.40 {
+    _ZGVnN2v_cosh;
+    _ZGVnN2v_coshf;
+    _ZGVnN4v_coshf;
+    _ZGVsMxv_cosh;
+    _ZGVsMxv_coshf;
    _ZGVnN2v_erf;
    _ZGVnN2v_erff;
    _ZGVnN4v_erff;
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@ -21,6 +21,7 @@ libmvec_hidden_proto (V_NAME_F1(acos));
 libmvec_hidden_proto (V_NAME_F1(asin));
 libmvec_hidden_proto (V_NAME_F1(atan));
 libmvec_hidden_proto (V_NAME_F1(cos));
+libmvec_hidden_proto (V_NAME_F1(cosh));
 libmvec_hidden_proto (V_NAME_F1(erf));
 libmvec_hidden_proto (V_NAME_F1(exp10));
 libmvec_hidden_proto (V_NAME_F1(exp2));
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@ -49,6 +49,10 @@
 # define __DECL_SIMD_cos __DECL_SIMD_aarch64
 # undef __DECL_SIMD_cosf
 # define __DECL_SIMD_cosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cosh
+# define __DECL_SIMD_cosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_coshf
+# define __DECL_SIMD_coshf __DECL_SIMD_aarch64
 # undef __DECL_SIMD_erf
 # define __DECL_SIMD_erf __DECL_SIMD_aarch64
 # undef __DECL_SIMD_erff
@ -124,6 +128,7 @@ __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
@ -141,6 +146,7 @@ __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
@ -163,6 +169,7 @@ __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
@ -180,6 +187,7 @@ __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
--- a/sysdeps/aarch64/fpu/cosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@ -0,0 +1,108 @@
+/* Double-precision vector (AdvSIMD) cosh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t poly[3];
+  float64x2_t inv_ln2, ln2, shift, thres;
+  uint64x2_t index_mask, special_bound;
+} data = {
+  .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+	    V2 (0x1.5555576a59599p-5), },
+
+  .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2.  */
+  /* -ln2/N.  */
+  .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+  .shift = V2 (0x1.8p+52),
+  .thres = V2 (704.0),
+
+  .index_mask = V2 (0xff),
+  /* 0x1.6p9, above which exp overflows.  */
+  .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+  return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+   special-case handling or tail.  */
+static inline float64x2_t
+exp_inline (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* n = round(x/(ln2/N)).  */
+  float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+  uint64x2_t u = vreinterpretq_u64_f64 (z);
+  float64x2_t n = vsubq_f64 (z, d->shift);
+
+  /* r = x - n*ln2/N.  */
+  float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
+  r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+
+  uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+  uint64x2_t i = vandq_u64 (u, d->index_mask);
+
+  /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
+  float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+  y = vfmaq_f64 (d->poly[0], y, r);
+  y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+
+  /* s = 2^(n/N).  */
+  u = v_lookup_u64 (__v_exp_tail_data, i);
+  float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  return vfmaq_f64 (s, y, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+   cosh(x) = (exp(x) + exp(-x)) / 2.
+   The greatest observed error is in the scalar fall-back region, so is the
+   same as the scalar routine, 1.93 ULP:
+   _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+				       want 0x1.fdf28623ef923p+1021.
+
+   The greatest observed error in the non-special region is 1.54 ULP:
+   _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+				       want 0x1.f711dcb0c77b1p+7.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float64x2_t ax = vabsq_f64 (x);
+  uint64x2_t special
+      = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
+
+  /* Up to the point that exp overflows, we can use it to calculate cosh by
+     exp(|x|) / 2 + 1 / (2 * exp(|x|)).  */
+  float64x2_t t = exp_inline (ax);
+  float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+  float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+  /* Fall back to scalar for any special cases.  */
+  if (__glibc_unlikely (v_any_u64 (special)))
+    return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+  return vaddq_f64 (half_t, half_over_t);
+}
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@ -0,0 +1,105 @@
+/* Double-precision vector (SVE) cosh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  float64_t poly[3];
+  float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+  uint64_t index_mask, special_bound;
+} data = {
+  .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
+	    0x1.5555576a59599p-5, },
+
+  .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2.  */
+  /* -ln2/N.  */
+  .ln2_hi = -0x1.62e42fefa39efp-9,
+  .ln2_lo = -0x1.abc9e3b39803f3p-64,
+  .shift = 0x1.8p+52,
+  .thres = 704.0,
+
+  .index_mask = 0xff,
+  /* 0x1.6p9, above which exp overflows.  */
+  .special_bound = 0x4086000000000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
+   special-case handling or tail.  */
+static inline svfloat64_t
+exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+  /* Calculate exp(x).  */
+  svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svfloat64_t n = svsub_x (pg, z, d->shift);
+
+  svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
+  r = svmla_x (pg, r, n, d->ln2_lo);
+
+  svuint64_t u = svreinterpret_u64 (z);
+  svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
+  svuint64_t i = svand_x (pg, u, d->index_mask);
+
+  svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
+  y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
+  y = svmla_x (pg, sv_f64 (1.0), r, y);
+  y = svmul_x (pg, r, y);
+
+  /* s = 2^(n/N).  */
+  u = svld1_gather_index (pg, __v_exp_tail_data, i);
+  svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+
+  return svmla_x (pg, s, s, y);
+}
+
+/* Approximation for SVE double-precision cosh(x) using exp_inline.
+   cosh(x) = (exp(x) + exp(-x)) / 2.
+   The greatest observed error is in the scalar fall-back region, so is the
+   same as the scalar routine, 1.93 ULP:
+   _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
+				       want 0x1.fd774e958236fp+1021.
+
+   The greatest observed error in the non-special region is 1.54 ULP:
+   _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
+				       want 0x1.f5e2bb8d5c991p+8.  */
+svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat64_t ax = svabs_x (pg, x);
+  svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
+
+  /* Up to the point that exp overflows, we can use it to calculate cosh by
+     exp(|x|) / 2 + 1 / (2 * exp(|x|)).  */
+  svfloat64_t t = exp_inline (ax, pg, d);
+  svfloat64_t half_t = svmul_x (pg, t, 0.5);
+  svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+
+  /* Fall back to scalar for any special cases.  */
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+  return svadd_x (pg, half_t, half_over_t);
+}
--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@ -0,0 +1,84 @@
+/* Single-precision vector (AdvSIMD) cosh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_expf_inline.h"
+#include "v_math.h"
+
+static const struct data
+{
+  struct v_expf_data expf_consts;
+  uint32x4_t tiny_bound, special_bound;
+} data = {
+  .expf_consts = V_EXPF_DATA,
+  .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this.  */
+  /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case.  */
+  .special_bound = V4 (0x42ad496c),
+};
+
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+  return v_call_f32 (coshf, x, y, special);
+}
+#endif
+
+/* Single-precision vector cosh, using vector expf.
+   Maximum error is 2.38 ULP:
+   _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+				 want 0x1.6a4922p+4.  */
+float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered correctly, fall back to the scalar
+     variant for all inputs if any input is a special value or above the bound
+     at which expf overflows.  */
+  if (__glibc_unlikely (v_any_u32 (special)))
+    return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+  uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
+  /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+     input to 0, which will generate no exceptions.  */
+  if (__glibc_unlikely (v_any_u32 (tiny)))
+    ax = v_zerofy_f32 (ax, tiny);
+#endif
+
+  /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.  */
+  float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+  float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+  float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+
+#if WANT_SIMD_EXCEPT
+  if (__glibc_unlikely (v_any_u32 (tiny)))
+    return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
+#else
+  if (__glibc_unlikely (v_any_u32 (special)))
+    return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+#endif
+
+  return vaddq_f32 (half_t, half_over_t);
+}
+libmvec_hidden_def (V_NAME_F1 (cosh))
+HALF_WIDTH_ALIAS_F1 (cosh)
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@ -0,0 +1,59 @@
+/* Single-precision vector (SVE) cosh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+static const struct data
+{
+  struct sv_expf_data expf_consts;
+  uint32_t special_bound;
+} data = {
+  .expf_consts = SV_EXPF_DATA,
+  /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case.  */
+  .special_bound = 0x42ad496c,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+  return sv_call_f32 (coshf, x, y, pg);
+}
+
+/* Single-precision vector cosh, using vector expf.
+   Maximum error is 1.89 ULP:
+   _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
+				  want 0x1.f00adcp+127.  */
+svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat32_t ax = svabs_x (pg, x);
+  svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+
+  /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.  */
+  svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
+  svfloat32_t half_t = svmul_x (pg, t, 0.5);
+  svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+  return svadd_x (pg, half_t, half_over_t);
+}
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@ -0,0 +1,75 @@
+/* SVE helper for single-precision routines which depend on exp
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef AARCH64_FPU_SV_EXPF_INLINE_H
+#define AARCH64_FPU_SV_EXPF_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expf_data
+{
+  float poly[5];
+  float inv_ln2, ln2_hi, ln2_lo, shift;
+};
+
+/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+   compatibility with polynomial helpers. Shift is 1.5*2^17 + 127.  */
+#define SV_EXPF_DATA                                                          \
+  {                                                                           \
+    .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
+	      0x1.0e4020p-7f },                                               \
+                                                                              \
+    .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,                        \
+    .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f,                        \
+  }
+
+#define C(i) sv_f32 (d->poly[i])
+
+static inline svfloat32_t
+expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
+{
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+
+  /* Load some constants in quad-word chunks to minimise memory access.  */
+  svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+
+  /* n = round(x/(ln2/N)).  */
+  svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+  svfloat32_t n = svsub_x (pg, z, d->shift);
+
+  /* r = x - n*ln2/N.  */
+  svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
+  r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+
+  /* scale = 2^(n/N).  */
+  svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
+  svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
+  svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
+  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+  svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+  return svmla_x (pg, scale, scale, poly);
+}
+
+#endif
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
 VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
 VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
 VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
 VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
 VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
 VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
 SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
 SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
 SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
 SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
 SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
 SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
 VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
 VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
 VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
 VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
 VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
 VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
 SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
 SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
 SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
 SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
 SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
 SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
--- a/sysdeps/aarch64/fpu/v_exp_tail_data.c
+++ b/sysdeps/aarch64/fpu/v_exp_tail_data.c
@ -0,0 +1,110 @@
+/* Lookup table for high-precision exp(x, tail) function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "vecmath_config.h"
+
+/* 2^(j/N), j=0..N, N=2^8=256.  */
+const uint64_t __v_exp_tail_data[] = {
+  0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+  0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+  0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+  0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+  0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+  0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+  0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+  0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+  0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+  0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+  0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+  0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+  0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+  0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+  0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+  0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+  0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+  0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+  0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+  0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+  0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+  0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+  0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+  0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+  0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+  0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+  0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+  0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+  0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+  0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+  0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+  0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+  0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+  0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+  0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+  0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+  0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+  0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+  0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+  0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+  0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+  0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+  0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+  0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+  0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+  0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+  0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+  0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+  0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+  0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+  0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+  0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+  0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+  0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+  0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+  0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+  0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+  0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+  0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+  0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+  0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+  0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+  0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+  0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+  0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+  0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+  0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+  0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+  0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+  0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+  0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+  0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+  0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+  0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+  0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+  0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+  0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+  0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+  0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+  0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+  0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+  0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+  0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+  0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+  0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+  0x3feff9d96b2a23d9,
+};
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@ -0,0 +1,71 @@
+/* Helper for single-precision AdvSIMD routines which depend on exp
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef AARCH64_FPU_V_EXPF_INLINE_H
+#define AARCH64_FPU_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+  float32x4_t poly[5];
+  float32x4_t shift, invln2_and_ln2;
+};
+
+/* maxerr: 1.45358 +0.5 ulp.  */
+#define V_EXPF_DATA                                                           \
+  {                                                                           \
+    .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),  \
+	      V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },                     \
+    .shift = V4 (0x1.8p23f),                                                  \
+    .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },   \
+  }
+
+#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f).  */
+#define C(i) d->poly[i]
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+  /* Helper routine for calculating exp(x).
+     Copied from v_expf.c, with all special-case handling removed - the
+     calling routine should handle special values if required.  */
+
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+  float32x4_t n, r, z;
+  z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+  n = vsubq_f32 (z, d->shift);
+  r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
+  r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+
+  /* Custom order-4 Estrin avoids building high order monomial.  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t p, q, poly;
+  p = vfmaq_f32 (C (1), C (0), r);
+  q = vfmaq_f32 (C (3), C (2), r);
+  q = vfmaq_f32 (q, p, r2);
+  p = vmulq_f32 (C (4), r);
+  poly = vfmaq_f32 (p, q, r2);
+  return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@ -59,6 +59,8 @@ extern const struct v_log_data
  } table[1 << V_LOG_TABLE_BITS];
 } __v_log_data attribute_hidden;

+#define V_EXP_TAIL_TABLE_BITS 8
+extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] attribute_hidden;
 #define V_EXP_TABLE_BITS 7
 extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden;

--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@ -701,11 +701,19 @@ double: 2
 float: 2
 ldouble: 2

+Function: "cosh_advsimd":
+double: 2
+float: 2
+
 Function: "cosh_downward":
 double: 3
 float: 1
 ldouble: 3

+Function: "cosh_sve":
+double: 2
+float: 2
+
 Function: "cosh_towardzero":
 double: 3
 float: 1
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@ -73,8 +73,13 @@ GLIBC_2.39 _ZGVsMxv_tan F
 GLIBC_2.39 _ZGVsMxv_tanf F
 GLIBC_2.39 _ZGVsMxvv_atan2 F
 GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_cosh F
+GLIBC_2.40 _ZGVnN2v_coshf F
 GLIBC_2.40 _ZGVnN2v_erf F
 GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_coshf F
 GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_cosh F
+GLIBC_2.40 _ZGVsMxv_coshf F
 GLIBC_2.40 _ZGVsMxv_erf F
 GLIBC_2.40 _ZGVsMxv_erff F