mirror of
git://sourceware.org/git/glibc.git
synced 2025-03-06 20:58:33 +01:00
aarch64/fpu: Add vector variants of cosh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
This commit is contained in:
parent
cb5d84f1f8
commit
bdb5705b7b
18 changed files with 648 additions and 1 deletions
|
@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
|
|||
atan \
|
||||
atan2 \
|
||||
cos \
|
||||
cosh \
|
||||
erf \
|
||||
exp \
|
||||
exp10 \
|
||||
|
@ -32,7 +33,8 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
|
|||
erf_data \
|
||||
erff_data \
|
||||
sv_erf_data \
|
||||
sv_erff_data
|
||||
sv_erff_data \
|
||||
v_exp_tail_data
|
||||
endif
|
||||
|
||||
sve-cflags = -march=armv8-a+sve
|
||||
|
|
|
@ -79,6 +79,11 @@ libmvec {
|
|||
_ZGVsMxv_tan;
|
||||
}
|
||||
GLIBC_2.40 {
|
||||
_ZGVnN2v_cosh;
|
||||
_ZGVnN2v_coshf;
|
||||
_ZGVnN4v_coshf;
|
||||
_ZGVsMxv_cosh;
|
||||
_ZGVsMxv_coshf;
|
||||
_ZGVnN2v_erf;
|
||||
_ZGVnN2v_erff;
|
||||
_ZGVnN4v_erff;
|
||||
|
|
|
@ -21,6 +21,7 @@ libmvec_hidden_proto (V_NAME_F1(acos));
|
|||
libmvec_hidden_proto (V_NAME_F1(asin));
|
||||
libmvec_hidden_proto (V_NAME_F1(atan));
|
||||
libmvec_hidden_proto (V_NAME_F1(cos));
|
||||
libmvec_hidden_proto (V_NAME_F1(cosh));
|
||||
libmvec_hidden_proto (V_NAME_F1(erf));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp10));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp2));
|
||||
|
|
|
@ -49,6 +49,10 @@
|
|||
# define __DECL_SIMD_cos __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_cosf
|
||||
# define __DECL_SIMD_cosf __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_cosh
|
||||
# define __DECL_SIMD_cosh __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_coshf
|
||||
# define __DECL_SIMD_coshf __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_erf
|
||||
# define __DECL_SIMD_erf __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_erff
|
||||
|
@ -124,6 +128,7 @@ __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
|
|||
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
|
||||
|
@ -141,6 +146,7 @@ __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
|
|||
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
|
||||
|
@ -163,6 +169,7 @@ __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
|
|||
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
|
||||
|
@ -180,6 +187,7 @@ __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
|
|||
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
|
||||
|
|
108
sysdeps/aarch64/fpu/cosh_advsimd.c
Normal file
108
sysdeps/aarch64/fpu/cosh_advsimd.c
Normal file
|
@ -0,0 +1,108 @@
|
|||
/* Double-precision vector (AdvSIMD) cosh function
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[3];
|
||||
float64x2_t inv_ln2, ln2, shift, thres;
|
||||
uint64x2_t index_mask, special_bound;
|
||||
} data = {
|
||||
.poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
|
||||
V2 (0x1.5555576a59599p-5), },
|
||||
|
||||
.inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
|
||||
/* -ln2/N. */
|
||||
.ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
|
||||
.shift = V2 (0x1.8p+52),
|
||||
.thres = V2 (704.0),
|
||||
|
||||
.index_mask = V2 (0xff),
|
||||
/* 0x1.6p9, above which exp overflows. */
|
||||
.special_bound = V2 (0x4086000000000000),
|
||||
};
|
||||
|
||||
static float64x2_t NOINLINE VPCS_ATTR
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
|
||||
{
|
||||
return v_call_f64 (cosh, x, y, special);
|
||||
}
|
||||
|
||||
/* Helper for approximating exp(x). Copied from v_exp_tail, with no
|
||||
special-case handling or tail. */
|
||||
static inline float64x2_t
|
||||
exp_inline (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* n = round(x/(ln2/N)). */
|
||||
float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (z);
|
||||
float64x2_t n = vsubq_f64 (z, d->shift);
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
|
||||
r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
|
||||
|
||||
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
|
||||
uint64x2_t i = vandq_u64 (u, d->index_mask);
|
||||
|
||||
/* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
|
||||
float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
|
||||
y = vfmaq_f64 (d->poly[0], y, r);
|
||||
y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
|
||||
|
||||
/* s = 2^(n/N). */
|
||||
u = v_lookup_u64 (__v_exp_tail_data, i);
|
||||
float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
|
||||
|
||||
return vfmaq_f64 (s, y, s);
|
||||
}
|
||||
|
||||
/* Approximation for vector double-precision cosh(x) using exp_inline.
|
||||
cosh(x) = (exp(x) + exp(-x)) / 2.
|
||||
The greatest observed error is in the scalar fall-back region, so is the
|
||||
same as the scalar routine, 1.93 ULP:
|
||||
_ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
|
||||
want 0x1.fdf28623ef923p+1021.
|
||||
|
||||
The greatest observed error in the non-special region is 1.54 ULP:
|
||||
_ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
|
||||
want 0x1.f711dcb0c77b1p+7. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
uint64x2_t special
|
||||
= vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
|
||||
|
||||
/* Up to the point that exp overflows, we can use it to calculate cosh by
|
||||
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
|
||||
float64x2_t t = exp_inline (ax);
|
||||
float64x2_t half_t = vmulq_n_f64 (t, 0.5);
|
||||
float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
|
||||
|
||||
/* Fall back to scalar for any special cases. */
|
||||
if (__glibc_unlikely (v_any_u64 (special)))
|
||||
return special_case (x, vaddq_f64 (half_t, half_over_t), special);
|
||||
|
||||
return vaddq_f64 (half_t, half_over_t);
|
||||
}
|
105
sysdeps/aarch64/fpu/cosh_sve.c
Normal file
105
sysdeps/aarch64/fpu/cosh_sve.c
Normal file
|
@ -0,0 +1,105 @@
|
|||
/* Double-precision vector (SVE) cosh function
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64_t poly[3];
|
||||
float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
|
||||
uint64_t index_mask, special_bound;
|
||||
} data = {
|
||||
.poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
|
||||
0x1.5555576a59599p-5, },
|
||||
|
||||
.inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
|
||||
/* -ln2/N. */
|
||||
.ln2_hi = -0x1.62e42fefa39efp-9,
|
||||
.ln2_lo = -0x1.abc9e3b39803f3p-64,
|
||||
.shift = 0x1.8p+52,
|
||||
.thres = 704.0,
|
||||
|
||||
.index_mask = 0xff,
|
||||
/* 0x1.6p9, above which exp overflows. */
|
||||
.special_bound = 0x4086000000000000,
|
||||
};
|
||||
|
||||
static svfloat64_t NOINLINE
|
||||
special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
|
||||
{
|
||||
return sv_call_f64 (cosh, x, y, special);
|
||||
}
|
||||
|
||||
/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
|
||||
special-case handling or tail. */
|
||||
static inline svfloat64_t
|
||||
exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
|
||||
{
|
||||
/* Calculate exp(x). */
|
||||
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
|
||||
svfloat64_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
|
||||
r = svmla_x (pg, r, n, d->ln2_lo);
|
||||
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
|
||||
svuint64_t i = svand_x (pg, u, d->index_mask);
|
||||
|
||||
svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
|
||||
y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
|
||||
y = svmla_x (pg, sv_f64 (1.0), r, y);
|
||||
y = svmul_x (pg, r, y);
|
||||
|
||||
/* s = 2^(n/N). */
|
||||
u = svld1_gather_index (pg, __v_exp_tail_data, i);
|
||||
svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
|
||||
|
||||
return svmla_x (pg, s, s, y);
|
||||
}
|
||||
|
||||
/* Approximation for SVE double-precision cosh(x) using exp_inline.
|
||||
cosh(x) = (exp(x) + exp(-x)) / 2.
|
||||
The greatest observed error is in the scalar fall-back region, so is the
|
||||
same as the scalar routine, 1.93 ULP:
|
||||
_ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
|
||||
want 0x1.fd774e958236fp+1021.
|
||||
|
||||
The greatest observed error in the non-special region is 1.54 ULP:
|
||||
_ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
|
||||
want 0x1.f5e2bb8d5c991p+8. */
|
||||
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svfloat64_t ax = svabs_x (pg, x);
|
||||
svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
|
||||
|
||||
/* Up to the point that exp overflows, we can use it to calculate cosh by
|
||||
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
|
||||
svfloat64_t t = exp_inline (ax, pg, d);
|
||||
svfloat64_t half_t = svmul_x (pg, t, 0.5);
|
||||
svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
|
||||
|
||||
/* Fall back to scalar for any special cases. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return special_case (x, svadd_x (pg, half_t, half_over_t), special);
|
||||
|
||||
return svadd_x (pg, half_t, half_over_t);
|
||||
}
|
84
sysdeps/aarch64/fpu/coshf_advsimd.c
Normal file
84
sysdeps/aarch64/fpu/coshf_advsimd.c
Normal file
|
@ -0,0 +1,84 @@
|
|||
/* Single-precision vector (AdvSIMD) cosh function
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_expf_inline.h"
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
struct v_expf_data expf_consts;
|
||||
uint32x4_t tiny_bound, special_bound;
|
||||
} data = {
|
||||
.expf_consts = V_EXPF_DATA,
|
||||
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
|
||||
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
|
||||
.special_bound = V4 (0x42ad496c),
|
||||
};
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
{
|
||||
return v_call_f32 (coshf, x, y, special);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Single-precision vector cosh, using vector expf.
|
||||
Maximum error is 2.38 ULP:
|
||||
_ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
|
||||
want 0x1.6a4922p+4. */
|
||||
float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float32x4_t ax = vabsq_f32 (x);
|
||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If fp exceptions are to be triggered correctly, fall back to the scalar
|
||||
variant for all inputs if any input is a special value or above the bound
|
||||
at which expf overflows. */
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
return v_call_f32 (coshf, x, x, v_u32 (-1));
|
||||
|
||||
uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
|
||||
/* If any input is tiny, avoid underflow exception by fixing tiny lanes of
|
||||
input to 0, which will generate no exceptions. */
|
||||
if (__glibc_unlikely (v_any_u32 (tiny)))
|
||||
ax = v_zerofy_f32 (ax, tiny);
|
||||
#endif
|
||||
|
||||
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
|
||||
float32x4_t t = v_expf_inline (ax, &d->expf_consts);
|
||||
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
|
||||
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
if (__glibc_unlikely (v_any_u32 (tiny)))
|
||||
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
|
||||
#else
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
return special_case (x, vaddq_f32 (half_t, half_over_t), special);
|
||||
#endif
|
||||
|
||||
return vaddq_f32 (half_t, half_over_t);
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (cosh))
|
||||
HALF_WIDTH_ALIAS_F1 (cosh)
|
59
sysdeps/aarch64/fpu/coshf_sve.c
Normal file
59
sysdeps/aarch64/fpu/coshf_sve.c
Normal file
|
@ -0,0 +1,59 @@
|
|||
/* Single-precision vector (SVE) cosh function
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
#include "sv_expf_inline.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
struct sv_expf_data expf_consts;
|
||||
uint32_t special_bound;
|
||||
} data = {
|
||||
.expf_consts = SV_EXPF_DATA,
|
||||
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
|
||||
.special_bound = 0x42ad496c,
|
||||
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
|
||||
{
|
||||
return sv_call_f32 (coshf, x, y, pg);
|
||||
}
|
||||
|
||||
/* Single-precision vector cosh, using vector expf.
|
||||
Maximum error is 1.89 ULP:
|
||||
_ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
|
||||
want 0x1.f00adcp+127. */
|
||||
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svfloat32_t ax = svabs_x (pg, x);
|
||||
svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
|
||||
|
||||
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
|
||||
svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
|
||||
svfloat32_t half_t = svmul_x (pg, t, 0.5);
|
||||
svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
|
||||
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return special_case (x, svadd_x (pg, half_t, half_over_t), special);
|
||||
|
||||
return svadd_x (pg, half_t, half_over_t);
|
||||
}
|
75
sysdeps/aarch64/fpu/sv_expf_inline.h
Normal file
75
sysdeps/aarch64/fpu/sv_expf_inline.h
Normal file
|
@ -0,0 +1,75 @@
|
|||
/* SVE helper for single-precision routines which depend on exp
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef AARCH64_FPU_SV_EXPF_INLINE_H
|
||||
#define AARCH64_FPU_SV_EXPF_INLINE_H
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
struct sv_expf_data
|
||||
{
|
||||
float poly[5];
|
||||
float inv_ln2, ln2_hi, ln2_lo, shift;
|
||||
};
|
||||
|
||||
/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
|
||||
compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
|
||||
#define SV_EXPF_DATA \
|
||||
{ \
|
||||
.poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
|
||||
0x1.0e4020p-7f }, \
|
||||
\
|
||||
.inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
|
||||
.ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
|
||||
}
|
||||
|
||||
#define C(i) sv_f32 (d->poly[i])
|
||||
|
||||
static inline svfloat32_t
|
||||
expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
|
||||
{
|
||||
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
|
||||
/* Load some constants in quad-word chunks to minimise memory access. */
|
||||
svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
|
||||
|
||||
/* n = round(x/(ln2/N)). */
|
||||
svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
|
||||
svfloat32_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
|
||||
r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
|
||||
|
||||
/* scale = 2^(n/N). */
|
||||
svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
|
||||
|
||||
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
|
||||
svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
|
||||
svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
|
||||
svfloat32_t r2 = svmul_f32_x (pg, r, r);
|
||||
svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
|
||||
svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
|
||||
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
|
||||
|
||||
return svmla_x (pg, scale, scale, poly);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
|
|||
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
|
||||
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
|
||||
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
|
||||
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
|
||||
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
|
||||
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
|
||||
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
|
||||
|
|
|
@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
|
|||
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
|
||||
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
|
||||
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
|
||||
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
|
||||
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
|
||||
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
|
||||
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
|
||||
|
|
|
@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
|
|||
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
|
||||
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
|
||||
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
|
||||
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
|
||||
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
|
||||
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
|
||||
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
|
||||
|
|
|
@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
|
|||
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
|
||||
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
|
||||
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
|
||||
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
|
||||
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
|
||||
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
|
||||
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
|
||||
|
|
110
sysdeps/aarch64/fpu/v_exp_tail_data.c
Normal file
110
sysdeps/aarch64/fpu/v_exp_tail_data.c
Normal file
|
@ -0,0 +1,110 @@
|
|||
/* Lookup table for high-precision exp(x, tail) function
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "vecmath_config.h"
|
||||
|
||||
/* 2^(j/N), j=0..N, N=2^8=256. */
|
||||
const uint64_t __v_exp_tail_data[] = {
|
||||
0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
|
||||
0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
|
||||
0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
|
||||
0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
|
||||
0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
|
||||
0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
|
||||
0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
|
||||
0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
|
||||
0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
|
||||
0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
|
||||
0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
|
||||
0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
|
||||
0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
|
||||
0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
|
||||
0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
|
||||
0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
|
||||
0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
|
||||
0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
|
||||
0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
|
||||
0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
|
||||
0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
|
||||
0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
|
||||
0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
|
||||
0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
|
||||
0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
|
||||
0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
|
||||
0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
|
||||
0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
|
||||
0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
|
||||
0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
|
||||
0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
|
||||
0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
|
||||
0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
|
||||
0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
|
||||
0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
|
||||
0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
|
||||
0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
|
||||
0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
|
||||
0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
|
||||
0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
|
||||
0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
|
||||
0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
|
||||
0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
|
||||
0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
|
||||
0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
|
||||
0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
|
||||
0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
|
||||
0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
|
||||
0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
|
||||
0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
|
||||
0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
|
||||
0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
|
||||
0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
|
||||
0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
|
||||
0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
|
||||
0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
|
||||
0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
|
||||
0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
|
||||
0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
|
||||
0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
|
||||
0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
|
||||
0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
|
||||
0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
|
||||
0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
|
||||
0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
|
||||
0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
|
||||
0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
|
||||
0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
|
||||
0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
|
||||
0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
|
||||
0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
|
||||
0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
|
||||
0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
|
||||
0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
|
||||
0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
|
||||
0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
|
||||
0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
|
||||
0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
|
||||
0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
|
||||
0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
|
||||
0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
|
||||
0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
|
||||
0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
|
||||
0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
|
||||
0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
|
||||
0x3feff9d96b2a23d9,
|
||||
};
|
71
sysdeps/aarch64/fpu/v_expf_inline.h
Normal file
71
sysdeps/aarch64/fpu/v_expf_inline.h
Normal file
|
@ -0,0 +1,71 @@
|
|||
/* Helper for single-precision AdvSIMD routines which depend on exp
|
||||
|
||||
Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef AARCH64_FPU_V_EXPF_INLINE_H
|
||||
#define AARCH64_FPU_V_EXPF_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
struct v_expf_data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
float32x4_t shift, invln2_and_ln2;
|
||||
};
|
||||
|
||||
/* maxerr: 1.45358 +0.5 ulp. */
|
||||
#define V_EXPF_DATA \
|
||||
{ \
|
||||
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
|
||||
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
|
||||
.shift = V4 (0x1.8p23f), \
|
||||
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
|
||||
}
|
||||
|
||||
#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
|
||||
#define C(i) d->poly[i]
|
||||
|
||||
static inline float32x4_t
|
||||
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
|
||||
{
|
||||
/* Helper routine for calculating exp(x).
|
||||
Copied from v_expf.c, with all special-case handling removed - the
|
||||
calling routine should handle special values if required. */
|
||||
|
||||
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
float32x4_t n, r, z;
|
||||
z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
|
||||
n = vsubq_f32 (z, d->shift);
|
||||
r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
|
||||
r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
|
||||
|
||||
/* Custom order-4 Estrin avoids building high order monomial. */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t p, q, poly;
|
||||
p = vfmaq_f32 (C (1), C (0), r);
|
||||
q = vfmaq_f32 (C (3), C (2), r);
|
||||
q = vfmaq_f32 (q, p, r2);
|
||||
p = vmulq_f32 (C (4), r);
|
||||
poly = vfmaq_f32 (p, q, r2);
|
||||
return vfmaq_f32 (scale, poly, scale);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -59,6 +59,8 @@ extern const struct v_log_data
|
|||
} table[1 << V_LOG_TABLE_BITS];
|
||||
} __v_log_data attribute_hidden;
|
||||
|
||||
#define V_EXP_TAIL_TABLE_BITS 8
|
||||
extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] attribute_hidden;
|
||||
#define V_EXP_TABLE_BITS 7
|
||||
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden;
|
||||
|
||||
|
|
|
@ -701,11 +701,19 @@ double: 2
|
|||
float: 2
|
||||
ldouble: 2
|
||||
|
||||
Function: "cosh_advsimd":
|
||||
double: 2
|
||||
float: 2
|
||||
|
||||
Function: "cosh_downward":
|
||||
double: 3
|
||||
float: 1
|
||||
ldouble: 3
|
||||
|
||||
Function: "cosh_sve":
|
||||
double: 2
|
||||
float: 2
|
||||
|
||||
Function: "cosh_towardzero":
|
||||
double: 3
|
||||
float: 1
|
||||
|
|
|
@ -73,8 +73,13 @@ GLIBC_2.39 _ZGVsMxv_tan F
|
|||
GLIBC_2.39 _ZGVsMxv_tanf F
|
||||
GLIBC_2.39 _ZGVsMxvv_atan2 F
|
||||
GLIBC_2.39 _ZGVsMxvv_atan2f F
|
||||
GLIBC_2.40 _ZGVnN2v_cosh F
|
||||
GLIBC_2.40 _ZGVnN2v_coshf F
|
||||
GLIBC_2.40 _ZGVnN2v_erf F
|
||||
GLIBC_2.40 _ZGVnN2v_erff F
|
||||
GLIBC_2.40 _ZGVnN4v_coshf F
|
||||
GLIBC_2.40 _ZGVnN4v_erff F
|
||||
GLIBC_2.40 _ZGVsMxv_cosh F
|
||||
GLIBC_2.40 _ZGVsMxv_coshf F
|
||||
GLIBC_2.40 _ZGVsMxv_erf F
|
||||
GLIBC_2.40 _ZGVsMxv_erff F
|
||||
|
|
Loading…
Add table
Reference in a new issue