aarch64: Cosmetic change in SVE exp routines

Use overloaded intrinsics for readability. Codegen does not
change; however, while we're bringing the routines up to date with
recent improvements to other routines in AOR, it is worth copying
this change over as well.
Joe Ramsay 2023-10-04 10:40:04 +01:00 committed by Szabolcs Nagy
parent 9180160e08
commit 480a0dfe1a
2 changed files with 44 additions and 47 deletions
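
For readers unfamiliar with the two spellings, here is a minimal sketch (not part of this commit) contrasting the explicitly-typed ACLE intrinsics with their overloaded forms. The function names below are made up for illustration, and it assumes a compiler with SVE support (e.g. GCC or Clang with -march=armv8-a+sve). Overload resolution picks the same underlying intrinsic, which is why codegen is unchanged.

#include <arm_sve.h>

/* Explicitly-typed names encode the element type (and _n for scalar operands).  */
svfloat64_t
poly_explicit (svbool_t pg, svfloat64_t x, svfloat64_t y)
{
  svfloat64_t r2 = svmul_f64_x (pg, x, x);
  return svmla_n_f64_x (pg, y, r2, 0.5);
}

/* Overloaded names leave the suffix to be inferred from the argument types;
   both functions should compile to identical code.  */
svfloat64_t
poly_overloaded (svbool_t pg, svfloat64_t x, svfloat64_t y)
{
  svfloat64_t r2 = svmul_x (pg, x, x);
  return svmla_x (pg, y, r2, 0.5);
}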


@@ -52,26 +52,26 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
      and s1*s1 overflows only if n>0. */

   /* If n<=0 then set b to 0x6, 0 otherwise. */
-  svbool_t p_sign = svcmple_n_f64 (pg, n, 0.0); /* n <= 0. */
+  svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
   svuint64_t b
-      = svdup_n_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+      = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */

   /* Set s1 to generate overflow depending on sign of exponent n. */
-  svfloat64_t s1 = svreinterpret_f64_u64 (
-      svsubr_n_u64_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
+  svfloat64_t s1 = svreinterpret_f64 (
+      svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */

   /* Offset s to avoid overflow in final result if n is below threshold. */
-  svfloat64_t s2 = svreinterpret_f64_u64 (svadd_u64_x (
-      pg, svsub_n_u64_x (pg, svreinterpret_u64_f64 (s), SpecialBias2),
-      b)); /* as_u64 (s) - 0x3010...0 + b. */
+  svfloat64_t s2 = svreinterpret_f64 (
+      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
+               b)); /* as_u64 (s) - 0x3010...0 + b. */

   /* |n| > 1280 => 2^(n) overflows. */
-  svbool_t p_cmp = svacgt_n_f64 (pg, n, 1280.0);
+  svbool_t p_cmp = svacgt (pg, n, 1280.0);

-  svfloat64_t r1 = svmul_f64_x (pg, s1, s1);
-  svfloat64_t r2 = svmla_f64_x (pg, s2, s2, y);
-  svfloat64_t r0 = svmul_f64_x (pg, r2, s1);
+  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+  svfloat64_t r0 = svmul_x (pg, r2, s1);

-  return svsel_f64 (p_cmp, r1, r0);
+  return svsel (p_cmp, r1, r0);
 }

 /* SVE exp algorithm. Maximum measured error is 1.01ulps:
@@ -81,7 +81,7 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);

-  svbool_t special = svacgt_n_f64 (pg, x, d->thres);
+  svbool_t special = svacgt (pg, x, d->thres);

   /* Use a modifed version of the shift used for flooring, such that x/ln2 is
      rounded to a multiple of 2^-6=1/64, shift = 1.5 * 2^52 * 2^-6 = 1.5 *
@@ -100,26 +100,26 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
      We add 1023 to the modified shift value in order to set bits 16:6 of u to
      1, such that once these bits are moved to the exponent of the output of
      FEXPA, we get the exponent of 2^n right, i.e. we get 2^m. */
-  svfloat64_t z = svmla_n_f64_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
-  svuint64_t u = svreinterpret_u64_f64 (z);
-  svfloat64_t n = svsub_n_f64_x (pg, z, d->shift);
+  svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t n = svsub_x (pg, z, d->shift);

   /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
-  svfloat64_t ln2 = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
-  svfloat64_t r = svmls_lane_f64 (x, n, ln2, 0);
-  r = svmls_lane_f64 (r, n, ln2, 1);
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t r = svmls_lane (x, n, ln2, 0);
+  r = svmls_lane (r, n, ln2, 1);

   /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
-  svfloat64_t r2 = svmul_f64_x (pg, r, r);
-  svfloat64_t p01 = svmla_f64_x (pg, C (0), C (1), r);
-  svfloat64_t p23 = svmla_f64_x (pg, C (2), C (3), r);
-  svfloat64_t p04 = svmla_f64_x (pg, p01, p23, r2);
-  svfloat64_t y = svmla_f64_x (pg, r, p04, r2);
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
+  svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+  svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+  svfloat64_t y = svmla_x (pg, r, p04, r2);

   /* s = 2^n, computed using FEXPA. FEXPA does not propagate NaNs, so for
      consistent NaN handling we have to manually propagate them. This comes at
      significant performance cost. */
-  svfloat64_t s = svexpa_f64 (u);
+  svfloat64_t s = svexpa (u);

   /* Assemble result as exp(x) = 2^n * exp(r). If |x| > Thresh the
      multiplication may overflow, so use special case routine. */
@@ -129,14 +129,12 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
       /* FEXPA zeroes the sign bit, however the sign is meaningful to the
         special case function so needs to be copied.
         e = sign bit of u << 46. */
-      svuint64_t e
-          = svand_n_u64_x (pg, svlsl_n_u64_x (pg, u, 46), 0x8000000000000000);
+      svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
       /* Copy sign to s. */
-      s = svreinterpret_f64_u64 (
-          svadd_u64_x (pg, e, svreinterpret_u64_f64 (s)));
+      s = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (s)));
       return special_case (pg, s, y, n);
     }

   /* No special case. */
-  return svmla_f64_x (pg, s, s, y);
+  return svmla_x (pg, s, s, y);
 }
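
The comments in the hunk above spell out the algorithm: n is x/ln2 rounded to a multiple of 1/64 via a large-constant shift, r = x - n*ln2 is the reduced argument, FEXPA supplies 2^n, and a short polynomial approximates exp(r) - 1. A hypothetical scalar sketch of that decomposition (not glibc code; exp2 stands in for FEXPA, a single ln2 constant stands in for the hi/lo split, and Taylor terms stand in for the tuned coefficients) might look like this:

#include <math.h>

static double
exp_decomposition_sketch (double x)
{
  const double inv_ln2 = 0x1.71547652b82fep0; /* 1/ln(2). */
  const double ln2 = 0x1.62e42fefa39efp-1;    /* ln(2); the SVE routine splits
                                                 this into hi/lo parts. */
  const double shift = 0x1.8p46;              /* 1.5 * 2^52 * 2^-6. */

  /* Adding the large shift rounds x/ln2 to a multiple of 2^-6 = 1/64; the SVE
     routine additionally folds the FEXPA exponent bias (1023) into the shift. */
  double z = shift + x * inv_ln2;
  double n = z - shift;

  /* Reduced argument r = x - n*ln2, roughly in [-ln2/128, ln2/128]. */
  double r = x - n * ln2;

  /* exp(r) - 1 ~= r + r^2/2 + r^3/6 + r^4/24 (Taylor terms in place of the
     tuned coefficients C0..C3). */
  double y
      = r + r * r * (0.5 + r * (0x1.5555555555555p-3 + r * 0x1.5555555555555p-5));

  /* s = 2^n; computed by the FEXPA instruction in the SVE routine. */
  double s = exp2 (n);

  /* exp(x) = 2^n * exp(r) = s + s*y, barring overflow (the special case). */
  return s + s * y;
}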


@@ -60,31 +60,30 @@ svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
   /* Load some constants in quad-word chunks to minimise memory access (last
      lane is wasted). */
-  svfloat32_t invln2_and_ln2 = svld1rq_f32 (svptrue_b32 (), &d->inv_ln2);
+  svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);

   /* n = round(x/(ln2/N)). */
-  svfloat32_t z = svmla_lane_f32 (sv_f32 (d->shift), x, invln2_and_ln2, 0);
-  svfloat32_t n = svsub_n_f32_x (pg, z, d->shift);
+  svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
+  svfloat32_t n = svsub_x (pg, z, d->shift);

   /* r = x - n*ln2/N. */
-  svfloat32_t r = svmls_lane_f32 (x, n, invln2_and_ln2, 1);
-  r = svmls_lane_f32 (r, n, invln2_and_ln2, 2);
+  svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
+  r = svmls_lane (r, n, invln2_and_ln2, 2);

   /* scale = 2^(n/N). */
-  svbool_t is_special_case = svacgt_n_f32 (pg, x, d->thres);
-  svfloat32_t scale = svexpa_f32 (svreinterpret_u32_f32 (z));
+  svbool_t is_special_case = svacgt (pg, x, d->thres);
+  svfloat32_t scale = svexpa (svreinterpret_u32 (z));

   /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
-  svfloat32_t p12 = svmla_f32_x (pg, C (1), C (2), r);
-  svfloat32_t p34 = svmla_f32_x (pg, C (3), C (4), r);
-  svfloat32_t r2 = svmul_f32_x (pg, r, r);
-  svfloat32_t p14 = svmla_f32_x (pg, p12, p34, r2);
-  svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
-  svfloat32_t poly = svmla_f32_x (pg, p0, r2, p14);
+  svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
+  svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+  svfloat32_t p0 = svmul_x (pg, r, C (0));
+  svfloat32_t poly = svmla_x (pg, p0, r2, p14);

   if (__glibc_unlikely (svptest_any (pg, is_special_case)))
-    return special_case (x, svmla_f32_x (pg, scale, scale, poly),
-                         is_special_case);
+    return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);

-  return svmla_f32_x (pg, scale, scale, poly);
+  return svmla_x (pg, scale, scale, poly);
 }