AArch64: Improve codegen of AdvSIMD atan(2)(f)

Load the polynomial evaluation coefficients into two vectors and use lanewise MLAs.
This gives an 8% throughput improvement in the microbenchmark on Neoverse V1.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Joana Cruz 2024-12-17 14:49:30 +00:00 committed by Wilco Dijkstra
parent d6e034f5b2
commit 6914774b9d
3 changed files with 160 additions and 68 deletions
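
The codegen change is easiest to see in isolation. In the sketch below (standalone C with illustrative values, not the glibc sources), even-index coefficients stay broadcast in vector registers while odd-index ones are stored as adjacent scalars, so a single vld1q_f64 packs each odd pair into one register and vfmaq_laneq_f64 (FMLA by element) multiplies by a chosen lane with no extra dup:

#include <arm_neon.h>

/* Illustrative coefficient layout: even coefficients broadcast as
   vectors, odd coefficients as adjacent doubles so that
   vld1q_f64 (&toy.c1) yields { c1, c3 } in one load.  */
static const struct
{
  float64x2_t c0, c2;
  double c1, c3;
} toy = {
  .c0 = { 1.0, 1.0 }, .c2 = { 3.0, 3.0 },
  .c1 = 2.0, .c3 = 4.0,
};

/* Evaluate c0 + c1*z + c2*z^2 + c3*z^3 lanewise (2-level Estrin).  */
static inline float64x2_t
toy_poly (float64x2_t z)
{
  float64x2_t c13 = vld1q_f64 (&toy.c1);                 /* { c1, c3 }.  */
  float64x2_t p01 = vfmaq_laneq_f64 (toy.c0, z, c13, 0); /* c0 + z*c1.  */
  float64x2_t p23 = vfmaq_laneq_f64 (toy.c2, z, c13, 1); /* c2 + z*c3.  */
  return vfmaq_f64 (p01, vmulq_f64 (z, z), p23);         /* p01 + z^2*p23.  */
}

The diffs below apply the same layout at degree 19 (atan, atan2) and degree 7 (atan2f), keeping the ten odd double coefficients in five registers instead of ten broadcast vectors.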

File 1 of 3: AdvSIMD atan2 (binary64)

@@ -23,40 +23,57 @@
 static const struct data
 {
+  float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
   float64x2_t pi_over_2;
-  float64x2_t poly[20];
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+  uint64x2_t zeroinfnan, minustwo;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
-     the interval [2**-1022, 1.0]. */
-  .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
-	    V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
-	    V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
-	    V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
-	    V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
-	    V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
-	    V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
-	    V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
-	    V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
-	    V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+     [2**-1022, 1.0]. */
+  .c0 = V2 (-0x1.5555555555555p-2),
+  .c1 = 0x1.99999999996c1p-3,
+  .c2 = V2 (-0x1.2492492478f88p-3),
+  .c3 = 0x1.c71c71bc3951cp-4,
+  .c4 = V2 (-0x1.745d160a7e368p-4),
+  .c5 = 0x1.3b139b6a88ba1p-4,
+  .c6 = V2 (-0x1.11100ee084227p-4),
+  .c7 = 0x1.e1d0f9696f63bp-5,
+  .c8 = V2 (-0x1.aebfe7b418581p-5),
+  .c9 = 0x1.842dbe9b0d916p-5,
+  .c10 = V2 (-0x1.5d30140ae5e99p-5),
+  .c11 = 0x1.338e31eb2fbbcp-5,
+  .c12 = V2 (-0x1.00e6eece7de8p-5),
+  .c13 = 0x1.860897b29e5efp-6,
+  .c14 = V2 (-0x1.0051381722a59p-6),
+  .c15 = 0x1.14e9dc19a4a4ep-7,
+  .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+  .c17 = 0x1.17739e210171ap-10,
+  .c18 = V2 (-0x1.ab24da7be7402p-13),
+  .c19 = 0x1.358851160a528p-16,
   .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+  .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+  .minustwo = V2 (0xc000000000000000),
 };

 #define SignMask v_u64 (0x8000000000000000)

 /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+	      uint64x2_t sign_xy, uint64x2_t cmp)
 {
+  /* Account for the sign of x and y. */
+  ret = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
   return v_call2_f64 (atan2, y, x, ret, cmp);
 }

 /* Returns 1 if input is the bit representation of 0, infinity or nan. */
 static inline uint64x2_t
-zeroinfnan (uint64x2_t i)
+zeroinfnan (uint64x2_t i, const struct data *d)
 {
   /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
-  return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
-		    v_u64 (2 * asuint64 (INFINITY) - 1));
+  return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
 }

 /* Fast implementation of vector atan2.
@@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i)
    want 0x1.92d628ab678cfp-1. */
 float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
 {
-  const struct data *data_ptr = ptr_barrier (&data);
+  const struct data *d = ptr_barrier (&data);

   uint64x2_t ix = vreinterpretq_u64_f64 (x);
   uint64x2_t iy = vreinterpretq_u64_f64 (y);
-  uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
+  uint64x2_t special_cases
+      = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));

   uint64x2_t sign_x = vandq_u64 (ix, SignMask);
   uint64x2_t sign_y = vandq_u64 (iy, SignMask);
@@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
   float64x2_t ay = vabsq_f64 (y);

   uint64x2_t pred_xlt0 = vcltzq_f64 (x);
-  uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
+  uint64x2_t pred_aygtax = vcagtq_f64 (y, x);

   /* Set up z for call to atan. */
   float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
-  float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
-  float64x2_t z = vdivq_f64 (n, d);
+  float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+  float64x2_t z = vdivq_f64 (n, q);

   /* Work out the correct shift. */
-  float64x2_t shift = vreinterpretq_f64_u64 (
-      vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
+  float64x2_t shift
+      = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
   shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
-  shift = vmulq_f64 (shift, data_ptr->pi_over_2);
+  shift = vmulq_f64 (shift, d->pi_over_2);

   /* Calculate the polynomial approximation.
      Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
@@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
   float64x2_t x2 = vmulq_f64 (z2, z2);
   float64x2_t x4 = vmulq_f64 (x2, x2);
   float64x2_t x8 = vmulq_f64 (x4, x4);
-  float64x2_t ret
-      = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
-		   v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+  float64x2_t c1315 = vld1q_f64 (&d->c13);
+  float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+  /* estrin_7. */
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+  float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+  /* estrin_11. */
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+  float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+  float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+  float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+  float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+  float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+  float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+  float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+  float64x2_t ret = vfmaq_f64 (p07, p819, x8);

   /* Finalize. y = shift + z + z^3 * P(z^2). */
   ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
   ret = vaddq_f64 (ret, shift);

+  if (__glibc_unlikely (v_any_u64 (special_cases)))
+    return special_case (y, x, ret, sign_xy, special_cases);
+
   /* Account for the sign of x and y. */
   ret = vreinterpretq_f64_u64 (
       veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));

-  if (__glibc_unlikely (v_any_u64 (special_cases)))
-    return special_case (y, x, ret, special_cases);
-
   return ret;
 }
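
Note on the reduction (unchanged by this patch): the selects above implement the usual atan2 quadrant fix-up. A scalar sketch of the same logic, for reference only (zeros, infinities and NaNs take the scalar fallback in the real code):

#include <math.h>

/* Pick n/q so that |z| <= 1, add a multiple of pi/2, then restore
   the sign of x^y.  Mirrors pred_aygtax / pred_xlt0 / sign_xy above.  */
static double
atan2_sketch (double y, double x)
{
  double ax = fabs (x), ay = fabs (y);
  int swap = ay > ax;                    /* pred_aygtax.  */
  double z = swap ? -ax / ay : ay / ax;  /* n / q.  */
  double shift = (x < 0.0 ? -2.0 : 0.0) + (swap ? 1.0 : 0.0);
  double ret = shift * 0x1.921fb54442d18p+0 + atan (z);
  return (!!signbit (x) ^ !!signbit (y)) ? -ret : ret;
}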

File 2 of 3: AdvSIMD atan2f (binary32)

@@ -22,34 +22,39 @@
 static const struct data
 {
-  float32x4_t poly[8];
-  float32x4_t pi_over_2;
+  float32x4_t c0, pi_over_2, c4, c6, c2;
+  float c1, c3, c5, c7;
+  uint32x4_t comp_const;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-128, 1.0].
      Generated using fpminimax between FLT_MIN and 1. */
-  .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
-	    V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
-	    V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
-  .pi_over_2 = V4 (0x1.921fb6p+0f),
+  .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
+  .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
+  .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
+  .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
+  .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
 };

 #define SignMask v_u32 (0x80000000)

 /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+	      uint32x4_t sign_xy, uint32x4_t cmp)
 {
+  /* Account for the sign of y. */
+  ret = vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
   return v_call2_f32 (atan2f, y, x, ret, cmp);
 }

 /* Returns 1 if input is the bit representation of 0, infinity or nan. */
 static inline uint32x4_t
-zeroinfnan (uint32x4_t i)
+zeroinfnan (uint32x4_t i, const struct data *d)
 {
   /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
-  return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
-		    v_u32 (2 * 0x7f800000lu - 1));
+  return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
 }

 /* Fast implementation of vector atan2f. Maximum observed error is
@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i)
    want 0x1.967f00p-1. */
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
 {
-  const struct data *data_ptr = ptr_barrier (&data);
+  const struct data *d = ptr_barrier (&data);

   uint32x4_t ix = vreinterpretq_u32_f32 (x);
   uint32x4_t iy = vreinterpretq_u32_f32 (y);
-  uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
+  uint32x4_t special_cases
+      = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));

   uint32x4_t sign_x = vandq_u32 (ix, SignMask);
   uint32x4_t sign_y = vandq_u32 (iy, SignMask);
@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
   /* Set up z for call to atanf. */
   float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
-  float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
-  float32x4_t z = vdivq_f32 (n, d);
+  float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+  float32x4_t z = vdivq_f32 (n, q);

   /* Work out the correct shift. */
   float32x4_t shift = vreinterpretq_f32_u32 (
       vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
   shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
-  shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+  shift = vmulq_f32 (shift, d->pi_over_2);

   /* Calculate the polynomial approximation.
      Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
   float32x4_t z2 = vmulq_f32 (z, z);
   float32x4_t z4 = vmulq_f32 (z2, z2);

-  float32x4_t ret = vfmaq_f32 (
-      v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
-      vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+  float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+  float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+  float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));

   /* y = shift + z * P(z^2). */
   ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);

-  /* Account for the sign of y. */
-  ret = vreinterpretq_f32_u32 (
-      veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
-
   if (__glibc_unlikely (v_any_u32 (special_cases)))
     {
-      return special_case (y, x, ret, special_cases);
+      return special_case (y, x, ret, sign_xy, special_cases);
     }

-  return ret;
+  /* Account for the sign of y. */
+  return vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
 }

 libmvec_hidden_def (V_NAME_F2 (atan2))
 HALF_WIDTH_ALIAS_F2(atan2)
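
Both kernels now keep the zeroinfnan threshold in the data struct (.zeroinfnan / .comp_const) so it is loaded alongside the coefficients rather than rematerialized in the helper. The bit trick itself is unchanged; a scalar sketch of the binary32 version, under the same IEEE-encoding argument as the comments above:

#include <stdint.h>
#include <string.h>

/* 2*i shifts the sign bit out; the unsigned wrap of 2*i - 1 sends
   +/-0 to UINT32_MAX, so zeros, infinities and NaNs all compare
   >= 2*0x7f800000 - 1 while every finite nonzero value stays below.  */
static int
is_zero_inf_nan (float x)
{
  uint32_t i;
  memcpy (&i, &x, sizeof i);
  return 2 * i - 1 >= 2 * 0x7f800000u - 1;
}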

File 3 of 3: AdvSIMD atan (binary64)

@@ -22,21 +22,22 @@
 static const struct data
 {
+  float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
   float64x2_t pi_over_2;
-  float64x2_t poly[20];
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-1022, 1.0]. */
-  .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
-	    V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
-	    V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
-	    V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
-	    V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
-	    V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
-	    V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
-	    V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
-	    V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
-	    V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+  .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
+  .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
+  .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
+  .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
+  .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
+  .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
+  .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
+  .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
+  .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
+  .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
   .pi_over_2 = V2 (0x1.921fb54442d18p+0),
 };

@@ -52,6 +53,11 @@ static const struct data
 float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
+
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+  float64x2_t c1315 = vld1q_f64 (&d->c13);
+  float64x2_t c1719 = vld1q_f64 (&d->c17);

   /* Small cases, infs and nans are supported by our approximation technique,
      but do not set fenv flags correctly. Only trigger special case if we need
@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
   float64x2_t x2 = vmulq_f64 (z2, z2);
   float64x2_t x4 = vmulq_f64 (x2, x2);
   float64x2_t x8 = vmulq_f64 (x4, x4);
-  float64x2_t y
-      = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
-		   v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+  /* estrin_7. */
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+  float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+  /* estrin_11. */
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+  float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+  float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+  float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+  float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+  float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+  float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+  float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+  float64x2_t y = vfmaq_f64 (p07, p819, x8);

   /* Finalize. y = shift + z + z^3 * P(z^2). */
   y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
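
The estrin_7 / estrin_11 blocks unroll what the v_estrin_* helpers previously did generically. The shape of the scheme, as a hypothetical scalar helper for degree 7 (shown for the dependency tree, not as glibc API):

/* Degree-7 Estrin: pair terms, then combine with t^2 and t^4.
   The critical path is three FMAs deep, versus seven for Horner.  */
static inline double
estrin_7 (double t, const double c[8])
{
  double t2 = t * t, t4 = t2 * t2;
  double p01 = c[0] + t * c[1];
  double p23 = c[2] + t * c[3];
  double p45 = c[4] + t * c[5];
  double p67 = c[6] + t * c[7];
  double p03 = p01 + t2 * p23;
  double p47 = p45 + t2 * p67;
  return p03 + t4 * p47;
}

The deg-19 polynomial in t = z^2 is then P(t) = estrin_7(t) + t^8 * estrin_11(t), which is the final vfmaq_f64 (p07, p819, x8) above (x8 = t^8).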