aarch64/fpu: Sync libmvec routines from 2.39 and before with AOR

This includes a fix for big-endian in AdvSIMD log, some cosmetic
changes, and numerous small optimisations mainly around inlining and
using indexed variants of MLA intrinsics.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Author: Joe Ramsay, 2024-02-20 16:44:13 +00:00 (committed by Adhemerval Zanella)
parent 02782fd128
commit e302e10213
18 changed files with 111 additions and 105 deletions


@@ -40,8 +40,8 @@ static const struct data
 };
 
 #define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-53. */
+#define Oneu 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-53. */
 
 #if WANT_SIMD_EXCEPT
 static float64x2_t VPCS_ATTR NOINLINE


@@ -39,8 +39,8 @@ static const struct data
 };
 
 #define AllMask v_u64 (0xffffffffffffffff)
-#define One (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-12. */
+#define One 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-12. */
 
 #if WANT_SIMD_EXCEPT
 static float64x2_t VPCS_ATTR NOINLINE


@@ -37,9 +37,6 @@ static const struct data
   .pi_over_2 = 0x1.921fb54442d18p+0,
 };
 
-/* Useful constants. */
-#define SignMask sv_u64 (0x8000000000000000)
-
 /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
 static svfloat64_t NOINLINE
 special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
@@ -72,14 +69,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
   svbool_t cmp_y = zeroinfnan (iy, pg);
   svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
 
-  svuint64_t sign_x = svand_x (pg, ix, SignMask);
-  svuint64_t sign_y = svand_x (pg, iy, SignMask);
-  svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
   svfloat64_t ax = svabs_x (pg, x);
   svfloat64_t ay = svabs_x (pg, y);
+  svuint64_t iax = svreinterpret_u64 (ax);
+  svuint64_t iay = svreinterpret_u64 (ay);
+
+  svuint64_t sign_x = sveor_x (pg, ix, iax);
+  svuint64_t sign_y = sveor_x (pg, iy, iay);
+  svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
 
-  svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
   svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
 
   /* Set up z for call to atan. */
@@ -88,8 +86,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
   svfloat64_t z = svdiv_x (pg, n, d);
 
   /* Work out the correct shift. */
-  svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
-  shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+  svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
+  shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
+  shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
   shift = svmul_x (pg, shift, data_ptr->pi_over_2);
 
   /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
@@ -109,10 +108,10 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
   ret = svadd_m (pg, ret, shift);
 
   /* Account for the sign of x and y. */
-  ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
-
   if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
-    return special_case (y, x, ret, cmp_xy);
+    return special_case (
+        y, x,
+        svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)),
+        cmp_xy);
 
-  return ret;
+  return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
 }


@@ -32,10 +32,8 @@ static const struct data
   .pi_over_2 = 0x1.921fb6p+0f,
 };
 
-#define SignMask sv_u32 (0x80000000)
-
 /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
-static inline svfloat32_t
+static svfloat32_t NOINLINE
 special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
               const svbool_t cmp)
 {
@@ -67,14 +65,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
   svbool_t cmp_y = zeroinfnan (iy, pg);
   svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
 
-  svuint32_t sign_x = svand_x (pg, ix, SignMask);
-  svuint32_t sign_y = svand_x (pg, iy, SignMask);
-  svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
   svfloat32_t ax = svabs_x (pg, x);
   svfloat32_t ay = svabs_x (pg, y);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t iay = svreinterpret_u32 (ay);
+
+  svuint32_t sign_x = sveor_x (pg, ix, iax);
+  svuint32_t sign_y = sveor_x (pg, iy, iay);
+  svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
 
-  svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
   svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
 
   /* Set up z for call to atan. */
@@ -83,11 +82,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
   svfloat32_t z = svdiv_x (pg, n, d);
 
   /* Work out the correct shift. */
-  svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
-  shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+  svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
+  shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
+  shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
   shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
 
-  /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+  /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */
   svfloat32_t z2 = svmul_x (pg, z, z);
   svfloat32_t z4 = svmul_x (pg, z2, z2);
   svfloat32_t z8 = svmul_x (pg, z4, z4);
@@ -101,10 +101,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
   ret = svadd_m (pg, ret, shift);
 
   /* Account for the sign of x and y. */
-  ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
-
   if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
-    return special_case (y, x, ret, cmp_xy);
+    return special_case (
+        y, x,
+        svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
+        cmp_xy);
 
-  return ret;
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
 }
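
Both atan2 hunks above replace the explicit SignMask constant with sign_x = ix ^ |ix| and then build the shift multiplier with shift/OR bit tricks instead of selects. A standalone scalar sketch of the double-precision trick (illustrative names, not glibc code):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t asuint64 (double d) { uint64_t u; memcpy (&u, &d, sizeof u); return u; }
static double asdouble (uint64_t u) { double d; memcpy (&d, &u, sizeof d); return d; }

int main (void)
{
  double xs[] = { 3.0, -3.0 };
  for (int i = 0; i < 2; i++)
    {
      /* sign_x = ix ^ |ix| isolates the sign bit without a mask constant.  */
      uint64_t sign_x = asuint64 (xs[i]) ^ asuint64 (fabs (xs[i]));
      /* sign_x >> 1 is asuint64 (2.0) when x < 0 (else 0); OR-ing the sign
         bit back in turns it into -2.0, replacing the old select on x < 0.  */
      double shift = asdouble ((sign_x >> 1) | sign_x);
      printf ("x = %g -> shift = %g\n", xs[i], shift); /* 0 and -2.  */
    }
  return 0;
}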


@@ -63,8 +63,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
      special-case handler later. */
   r = vbslq_f64 (cmp, v_f64 (1.0), r);
 #else
-  cmp = vcageq_f64 (d->range_val, x);
-  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
+  cmp = vcageq_f64 (x, d->range_val);
   r = x;
 #endif
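
This pattern repeats in the sin and tan hunks below: vcageq_f64 (a, b) computes |a| >= |b| per lane, so swapping the operands yields the wanted out-of-range mask directly and the vceqzq_u64 negation disappears. A minimal sketch (illustrative, not the glibc source; note the boundary lane |x| == range_val now also takes the special-case path):

#include <arm_neon.h>

/* Old: |x| <= range_val, then negate -- two instructions.  */
static inline uint64x2_t
special_mask_old (float64x2_t x, float64x2_t range_val)
{
  uint64x2_t in_range = vcageq_f64 (range_val, x); /* |range_val| >= |x|.  */
  return vceqzq_u64 (in_range);                    /* cmp = ~cmp.  */
}

/* New: |x| >= range_val directly -- one instruction.  */
static inline uint64x2_t
special_mask_new (float64x2_t x, float64x2_t range_val)
{
  return vcageq_f64 (x, range_val);
}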


@@ -64,8 +64,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
      special-case handler later. */
   r = vbslq_f32 (cmp, v_f32 (1.0f), r);
 #else
-  cmp = vcageq_f32 (d->range_val, x);
-  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
+  cmp = vcageq_f32 (x, d->range_val);
   r = x;
 #endif


@@ -57,7 +57,7 @@ const static struct data
 # define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
 # define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
 
-static inline float64x2_t VPCS_ATTR
+static float64x2_t VPCS_ATTR NOINLINE
 special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 {
   /* If fenv exceptions are to be triggered correctly, fall back to the scalar
@@ -72,7 +72,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
 # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
 
-static float64x2_t VPCS_ATTR NOINLINE
+static inline float64x2_t VPCS_ATTR
 special_case (float64x2_t s, float64x2_t y, float64x2_t n,
               const struct data *d)
 {


@@ -25,7 +25,8 @@
 static const struct data
 {
   float32x4_t poly[5];
-  float32x4_t shift, log10_2, log2_10_hi, log2_10_lo;
+  float32x4_t log10_2_and_inv, shift;
 #if !WANT_SIMD_EXCEPT
   float32x4_t scale_thresh;
 #endif
@@ -38,9 +39,9 @@ static const struct data
   .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
             V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
   .shift = V4 (0x1.8p23f),
-  .log10_2 = V4 (0x1.a934fp+1),
-  .log2_10_hi = V4 (0x1.344136p-2),
-  .log2_10_lo = V4 (-0x1.ec10cp-27),
+
+  /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
+  .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
 #if !WANT_SIMD_EXCEPT
   .scale_thresh = V4 (ScaleBound)
 #endif
@@ -98,24 +99,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
 #if WANT_SIMD_EXCEPT
   /* asuint(x) - TinyBound >= BigBound - TinyBound. */
   uint32x4_t cmp = vcgeq_u32 (
-      vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
-                 TinyBound),
-      Thres);
+      vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
   float32x4_t xm = x;
   /* If any lanes are special, mask them with 1 and retain a copy of x to allow
      special case handler to fix special lanes later. This is only necessary if
      fenv exceptions are to be triggered correctly. */
   if (__glibc_unlikely (v_any_u32 (cmp)))
-    x = vbslq_f32 (cmp, v_f32 (1), x);
+    x = v_zerofy_f32 (x, cmp);
 #endif
 
   /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
      with poly(r) in [1/sqrt(2), sqrt(2)] and
      x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
-  float32x4_t z = vfmaq_f32 (d->shift, x, d->log10_2);
+  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
   float32x4_t n = vsubq_f32 (z, d->shift);
-  float32x4_t r = vfmsq_f32 (x, n, d->log2_10_hi);
-  r = vfmsq_f32 (r, n, d->log2_10_lo);
+  float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
+  r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
   uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
 
   float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
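
This hunk is the "indexed variants of MLA intrinsics" optimisation from the commit message: the three constants move into lanes of a single vector, and vfmaq_laneq_f32/vfmsq_laneq_f32 select a lane at the FMLA itself, so each constant no longer ties up its own register. A compilable sketch of the pattern (names illustrative, constants copied from the diff):

#include <arm_neon.h>

/* 1/log10(2), log10(2)_high, log10(2)_low; lane 3 is padding.  */
static const float32x4_t log10_2_and_inv
    = { 0x1.a934fp+1f, 0x1.344136p-2f, -0x1.ec10cp-27f, 0 };

/* n = round (x/log10(2)) via the shift trick; r = x - n*log10(2), with the
   constant split hi/lo so the fused steps lose no precision.  */
static inline float32x4_t
reduce_log10_2 (float32x4_t x, float32x4_t shift, float32x4_t *n_out)
{
  float32x4_t z = vfmaq_laneq_f32 (shift, x, log10_2_and_inv, 0);
  float32x4_t n = vsubq_f32 (z, shift);
  float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
  r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
  *n_out = n;
  return r;
}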


@@ -24,6 +24,7 @@
 #define IndexMask (N - 1)
 #define BigBound 1022.0
 #define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
 
 static const struct data
 {
@@ -48,14 +49,13 @@ lookup_sbits (uint64x2_t i)
 
 #if WANT_SIMD_EXCEPT
 
-# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
 # define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
 
 /* Call scalar exp2 as a fallback. */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
 {
-  return v_call_f64 (exp2, x, x, v_u64 (0xffffffffffffffff));
+  return v_call_f64 (exp2, x, y, is_special);
 }
 
 #else
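
The fallback now receives the computed vector result and the lane mask, so v_call_f64 reruns the scalar exp2 only on the flagged lanes rather than on the whole vector. v_call_f64 is a glibc-internal helper; a sketch of its presumed per-lane behaviour (an assumption, using GCC/Clang vector-subscript syntax):

#include <arm_neon.h>

static inline float64x2_t
call_scalar_lanes (double (*f) (double), float64x2_t x, float64x2_t y,
                   uint64x2_t cmp)
{
  /* Lane i keeps the vector result y[i] unless cmp[i] is set, in which case
     the scalar routine recomputes it from the saved input x[i].  */
  float64x2_t r = y;
  if (cmp[0])
    r[0] = f (x[0]);
  if (cmp[1])
    r[1] = f (x[1]);
  return r;
}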
@@ -65,7 +65,7 @@ special_case (float64x2_t x)
 # define SpecialBias1 0x7000000000000000 /* 0x1p769. */
 # define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
 
-static float64x2_t VPCS_ATTR
+static inline float64x2_t VPCS_ATTR
 special_case (float64x2_t s, float64x2_t y, float64x2_t n,
               const struct data *d)
 {
@@ -94,10 +94,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
 #if WANT_SIMD_EXCEPT
   uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
   cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
-  /* If any special case (inf, nan, small and large x) is detected,
-     fall back to scalar for all lanes. */
-  if (__glibc_unlikely (v_any_u64 (cmp)))
-    return special_case (x);
+  /* Mask special lanes and retain a copy of x for passing to special-case
+     handler. */
+  float64x2_t xc = x;
+  x = v_zerofy_f64 (x, cmp);
 #else
   cmp = vcagtq_f64 (x, d->scale_big_bound);
 #endif
@@ -120,9 +120,11 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
   float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
   y = vmulq_f64 (r, y);
 
-#if !WANT_SIMD_EXCEPT
   if (__glibc_unlikely (v_any_u64 (cmp)))
+#if !WANT_SIMD_EXCEPT
     return special_case (s, y, n, d);
+#else
+    return special_case (xc, vfmaq_f64 (s, s, y), cmp);
 #endif
   return vfmaq_f64 (s, s, y);
 }
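
Under WANT_SIMD_EXCEPT the routine no longer bails out to scalar for every lane: special lanes are zeroed so the vector path cannot raise spurious exceptions, a copy is kept in xc, and the flagged lanes are patched at the end. v_zerofy_f64 is glibc-internal; its presumed definition (an assumption consistent with its use here):

#include <arm_neon.h>

/* Clear the lanes flagged in CMP: BIC computes x & ~cmp bit-wise, forcing
   special lanes to +0.0 and leaving the rest untouched.  */
static inline float64x2_t
zerofy_f64 (float64x2_t x, uint64x2_t cmp)
{
  return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
}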


@@ -20,6 +20,8 @@
 #include "sv_math.h"
 #include "poly_sve_f32.h"
 
+#define Thres 0x1.5d5e2ap+6f
+
 static const struct data
 {
   float poly[5];
@@ -33,7 +35,7 @@ static const struct data
   .shift = 0x1.903f8p17f,
   /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
      correctly by FEXPA. */
-  .thres = 0x1.5d5e2ap+6f,
+  .thres = Thres,
 };
 
 static svfloat32_t NOINLINE


@@ -54,7 +54,7 @@ const static volatile struct
 # define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
 # define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
 
-static inline float64x2_t VPCS_ATTR
+static float64x2_t VPCS_ATTR NOINLINE
 special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 {
   /* If fenv exceptions are to be triggered correctly, fall back to the scalar
@@ -69,7 +69,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
 # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
 
-static float64x2_t VPCS_ATTR NOINLINE
+static inline float64x2_t VPCS_ATTR
 special_case (float64x2_t s, float64x2_t y, float64x2_t n)
 {
   /* 2^(n/N) may overflow, break it up into s1*s2. */


@@ -23,7 +23,7 @@
 static const struct data
 {
   float64x2_t poly[11];
-  float64x2_t invln2, ln2_lo, ln2_hi, shift;
+  float64x2_t invln2, ln2, shift;
   int64x2_t exponent_bias;
 #if WANT_SIMD_EXCEPT
   uint64x2_t thresh, tiny_bound;
@@ -38,8 +38,7 @@ static const struct data
             V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
             V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
   .invln2 = V2 (0x1.71547652b82fep0),
-  .ln2_hi = V2 (0x1.62e42fefa39efp-1),
-  .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
+  .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
   .shift = V2 (0x1.8p52),
   .exponent_bias = V2 (0x3ff0000000000000),
 #if WANT_SIMD_EXCEPT
@@ -83,7 +82,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
   x = v_zerofy_f64 (x, special);
 #else
   /* Large input, NaNs and Infs. */
-  uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
+  uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
 #endif
 
   /* Reduce argument to smaller range:
@@ -93,8 +92,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
      where 2^i is exact because i is an integer. */
   float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
   int64x2_t i = vcvtq_s64_f64 (n);
-  float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
-  f = vfmsq_f64 (f, n, d->ln2_lo);
+  float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
+  f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
 
   /* Approximate expm1(f) using polynomial.
      Taylor expansion for expm1(x) has the form:

@@ -23,7 +23,8 @@
 static const struct data
 {
   float32x4_t poly[5];
-  float32x4_t invln2, ln2_lo, ln2_hi, shift;
+  float32x4_t invln2_and_ln2;
+  float32x4_t shift;
   int32x4_t exponent_bias;
 #if WANT_SIMD_EXCEPT
   uint32x4_t thresh;
@@ -34,9 +35,8 @@ static const struct data
   /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
   .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
             V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
-  .invln2 = V4 (0x1.715476p+0f),
-  .ln2_hi = V4 (0x1.62e4p-1f),
-  .ln2_lo = V4 (0x1.7f7d1cp-20f),
+  /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
+  .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
   .shift = V4 (0x1.8p23f),
   .exponent_bias = V4 (0x3f800000),
 #if !WANT_SIMD_EXCEPT
@@ -80,7 +80,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
   x = v_zerofy_f32 (x, special);
 #else
   /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
-  uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
+  uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
 #endif
 
   /* Reduce argument to smaller range:
@@ -88,10 +88,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
      and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
      exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
      where 2^i is exact because i is an integer. */
-  float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
+  float32x4_t j = vsubq_f32 (
+      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
   int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
-  f = vfmsq_f32 (f, j, d->ln2_lo);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
 
   /* Approximate expm1(f) using polynomial.
      Taylor expansion for expm1(x) has the form:


@@ -58,8 +58,13 @@ lookup (uint64x2_t i)
   uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
   float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
   float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+#if __BYTE_ORDER == __LITTLE_ENDIAN
   e.invc = vuzp1q_f64 (e0, e1);
   e.logc = vuzp2q_f64 (e0, e1);
+#else
+  e.invc = vuzp1q_f64 (e1, e0);
+  e.logc = vuzp2q_f64 (e1, e0);
+#endif
   return e;
 }
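
This is the big-endian fix named in the commit message. Each vld1q_f64 loads one table entry as the pair { invc, logc }, and the unzips de-interleave two entries into an invc vector and a logc vector. On big-endian the loaded pairs end up in the opposite lane order (the net effect the diff encodes; the exact mapping is a property of the compiler's lane numbering on big-endian targets), so the operands must be swapped to keep invc and logc paired correctly. A little-endian reference of the selection (illustrative, not glibc code):

#include <arm_neon.h>

/* vuzp1q_f64 (a, b) = { a[0], b[0] } (even lanes); vuzp2q_f64 (a, b)
   = { a[1], b[1] } (odd lanes).  With e0 = { invc0, logc0 } and
   e1 = { invc1, logc1 }, this yields { invc0, invc1 } and { logc0, logc1 }.  */
static inline void
unzip_entries (float64x2_t e0, float64x2_t e1,
               float64x2_t *invc, float64x2_t *logc)
{
  *invc = vuzp1q_f64 (e0, e1);
  *logc = vuzp2q_f64 (e0, e1);
}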


@@ -75,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
   r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
 #else
   r = x;
-  cmp = vcageq_f64 (d->range_val, x);
-  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
+  cmp = vcageq_f64 (x, d->range_val);
 #endif
 
   /* n = rint(|x|/pi). */


@@ -67,8 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
   r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
 #else
   r = x;
-  cmp = vcageq_f32 (d->range_val, x);
-  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
+  cmp = vcageq_f32 (x, d->range_val);
 #endif
 
   /* n = rint(|x|/pi) */


@@ -23,7 +23,7 @@
 static const struct data
 {
   float64x2_t poly[9];
-  float64x2_t half_pi_hi, half_pi_lo, two_over_pi, shift;
+  float64x2_t half_pi, two_over_pi, shift;
 #if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
 #endif
@@ -34,8 +34,7 @@ static const struct data
             V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
             V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
             V2 (0x1.4e4fd14147622p-12) },
-  .half_pi_hi = V2 (0x1.921fb54442d18p0),
-  .half_pi_lo = V2 (0x1.1a62633145c07p-54),
+  .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
 #if !WANT_SIMD_EXCEPT
@@ -56,15 +55,15 @@ special_case (float64x2_t x)
 /* Vector approximation for double-precision tan.
    Maximum measured error is 3.48 ULP:
-   __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+   _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
                                       want -0x1.f6ccd8ecf7deap+37. */
 float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 {
   const struct data *dat = ptr_barrier (&data);
-  /* Our argument reduction cannot calculate q with sufficient accuracy for very
-     large inputs. Fall back to scalar routine for all lanes if any are too
-     large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
-     input to avoid underflow. */
+  /* Our argument reduction cannot calculate q with sufficient accuracy for
+     very large inputs. Fall back to scalar routine for all lanes if any are
+     too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
+     tiny input to avoid underflow. */
 #if WANT_SIMD_EXCEPT
   uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
   /* iax - tiny_bound > range_val - tiny_bound. */
@@ -82,8 +81,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision. */
   float64x2_t r = x;
-  r = vfmsq_f64 (r, q, dat->half_pi_hi);
-  r = vfmsq_f64 (r, q, dat->half_pi_lo);
+  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
+  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula. */
   r = vmulq_n_f64 (r, 0.5);
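
The pi/2 constant is kept as a hi/lo pair (now two lanes of one vector) and folded in with two fused multiply-subtracts, a classic Cody-Waite reduction. The underlying idea as a scalar sketch (illustrative, not the glibc code):

#include <math.h>

/* r = x - q*pi/2 with extra precision: half_pi_hi is pi/2 rounded to double,
   half_pi_lo the residual rounding error; fma keeps each product unrounded,
   so the error terms cancel instead of accumulating.  */
static double
reduce_half_pi (double x, double q)
{
  const double half_pi_hi = 0x1.921fb54442d18p0;
  const double half_pi_lo = 0x1.1a62633145c07p-54;
  double r = fma (-q, half_pi_hi, x);
  return fma (-q, half_pi_lo, r);
}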
@@ -106,14 +105,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      and reciprocity around pi/2:
          tan(x) = 1 / (tan(pi/2 - x))
      to assemble result using change-of-sign and conditional selection of
-     numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */
+     numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+  */
   float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
   float64x2_t d = vaddq_f64 (p, p);
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
 #if !WANT_SIMD_EXCEPT
-  uint64x2_t special = vceqzq_u64 (vcaleq_f64 (x, dat->range_val));
+  uint64x2_t special = vcageq_f64 (x, dat->range_val);
   if (__glibc_unlikely (v_any_u64 (special)))
     return special_case (x);
 #endif


@@ -23,7 +23,8 @@
 static const struct data
 {
   float32x4_t poly[6];
-  float32x4_t neg_half_pi_1, neg_half_pi_2, neg_half_pi_3, two_over_pi, shift;
+  float32x4_t pi_consts;
+  float32x4_t shift;
 #if !WANT_SIMD_EXCEPT
   float32x4_t range_val;
 #endif
@@ -31,10 +32,9 @@ static const struct data
   /* Coefficients generated using FPMinimax. */
   .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
             V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
-  .neg_half_pi_1 = V4 (-0x1.921fb6p+0f),
-  .neg_half_pi_2 = V4 (0x1.777a5cp-25f),
-  .neg_half_pi_3 = V4 (0x1.ee59dap-50f),
-  .two_over_pi = V4 (0x1.45f306p-1f),
+  /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
+  .pi_consts
+  = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
   .shift = V4 (0x1.8p+23f),
 #if !WANT_SIMD_EXCEPT
   .range_val = V4 (0x1p15f),
@@ -58,10 +58,11 @@ eval_poly (float32x4_t z, const struct data *d)
 {
   float32x4_t z2 = vmulq_f32 (z, z);
 #if WANT_SIMD_EXCEPT
-  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
-     are to be triggered correctly, sidestep this by fixing such lanes to 0. */
+  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+     If fp exceptions are to be triggered correctly,
+     sidestep this by fixing such lanes to 0. */
   uint32x4_t will_uflow
       = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
   if (__glibc_unlikely (v_any_u32 (will_uflow)))
     z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
 #endif
@@ -94,16 +95,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
 #endif
 
   /* n = rint(x/(pi/2)). */
-  float32x4_t q = vfmaq_f32 (d->shift, d->two_over_pi, x);
+  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
   float32x4_t n = vsubq_f32 (q, d->shift);
   /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
   uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
 
   /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
   float32x4_t r;
-  r = vfmaq_f32 (x, d->neg_half_pi_1, n);
-  r = vfmaq_f32 (r, d->neg_half_pi_2, n);
-  r = vfmaq_f32 (r, d->neg_half_pi_3, n);
+  r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
+  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
+  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
 
   /* If x lives in an interval, where |tan(x)|
      - is finite, then use a polynomial approximation of the form