AArch64: Improve codegen in SVE F32 logs

Reduce MOVPRFXs by using unpredicated (non-destructive) instructions
where possible.  Similar to the recent change to AdvSIMD F32 logs,
adjust special-case arguments and bounds to allow for more optimal
register usage.  For all 3 routines one MOVPRFX remains in the
reduction, which cannot be avoided as immediate AND and ASR are both
destructive.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Joe Ramsay authored 2024-09-23 15:30:20 +01:00; committed by Wilco Dijkstra
parent 7b8c134b54, commit a15b1394b5
3 changed files with 69 additions and 47 deletions
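For context on the trade-off the commit message describes: most predicated SVE instructions are destructive (they overwrite one source operand), so when the compiler must keep that source live it first emits a MOVPRFX to copy it into the destination. Calling the _x (don't-care predication) intrinsics with an all-true predicate instead of the governing predicate lets the compiler select the unpredicated, non-destructive three-operand encoding. A minimal sketch of the two forms (illustrative only, not part of this patch):

#include <arm_sve.h>

/* With a general governing predicate, compilers in practice emit the
   predicated, destructive FMUL encoding; if 'a' is used again later,
   a MOVPRFX is needed to preserve it.  */
svfloat32_t
mul_pred (svbool_t pg, svfloat32_t a, svfloat32_t b)
{
  return svmul_x (pg, a, b);
}

/* With svptrue_b32 the compiler can select the unpredicated,
   non-destructive FMUL (vectors) encoding: no MOVPRFX required.  */
svfloat32_t
mul_unpred (svfloat32_t a, svfloat32_t b)
{
  return svmul_x (svptrue_b32 (), a, b);
}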

sysdeps/aarch64/fpu/log10f_sve.c

@@ -24,6 +24,7 @@ static const struct data
   float poly_0246[4];
   float poly_1357[4];
   float ln2, inv_ln10;
+  uint32_t off, lower;
 } data = {
   .poly_1357 = {
     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
                   -0x1.0fc92cp-4f },
   .ln2 = 0x1.62e43p-1f,
   .inv_ln10 = 0x1.bcb7b2p-2f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000.  For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
 };

-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min.  */
-#define Offset 0x3f2aaaab /* 0.666667.  */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000.  */
 #define MantissaMask 0x007fffff

 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
 {
-  return sv_call_f32 (log10f, x, y, special);
+  return sv_call_f32 (
+      log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }

 /* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
 svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
-  svuint32_t ix = svreinterpret_u32 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);

   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  ix = svsub_x (pg, ix, Offset);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend.  */
-  ix = svand_x (pg, ix, MantissaMask);
-  ix = svadd_x (pg, ix, Offset);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend.  */
+  svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+  ix = svadd_x (pg, ix, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);

   /* y = log10(1+r) + n*log10(2)
      log10(1+r) ~ r * InvLn(10) + P(r)
      where P(r) is a polynomial.  Use order 9 for log10(1+x), i.e. order 8 for
      log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3).  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
-  svfloat32_t r4 = svmul_x (pg, r2, r2);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
   svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
   svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
   hi = svmul_x (pg, hi, d->inv_ln10);

   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
-                         special);
-  return svmla_x (pg, hi, r2, y);
+    return special_case (u_off, hi, r2, y, special);
+  return svmla_x (svptrue_b32 (), hi, r2, y);
 }

sysdeps/aarch64/fpu/log2f_sve.c

@@ -23,6 +23,7 @@ static const struct data
 {
   float poly_02468[5];
   float poly_1357[4];
+  uint32_t off, lower;
 } data = {
   .poly_1357 = {
     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
   },
   .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
                   0x1.9d8ecap-3f, 0x1.9e495p-3f },
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000.  For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
 };

-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min.  */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.  */
 #define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667.  */

 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
 {
-  return sv_call_f32 (log2f, x, y, cmp);
+  return sv_call_f32 (
+      log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }

 /* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);

-  svuint32_t u = svreinterpret_u32 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);

   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = svsub_x (pg, u, Off);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend.  */
-  u = svand_x (pg, u, MantissaMask);
-  u = svadd_x (pg, u, Off);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend.  */
+  svuint32_t u = svand_x (pg, u_off, MantissaMask);
+  u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);

   /* y = log2(1+r) + n.  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);

   /* Evaluate polynomial using pairwise Horner scheme.  */
   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
   y = svmla_x (pg, q_01, r2, y);

   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
-  return svmla_x (pg, n, r, y);
+    return special_case (u_off, n, r, y, special);
+  return svmla_x (svptrue_b32 (), n, r, y);
 }

sysdeps/aarch64/fpu/logf_sve.c

@@ -24,6 +24,7 @@ static const struct data
   float poly_0135[4];
   float poly_246[3];
   float ln2;
+  uint32_t off, lower;
 } data = {
   .poly_0135 = {
     /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
     -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
   },
   .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
-  .ln2 = 0x1.62e43p-1f
+  .ln2 = 0x1.62e43p-1f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000.  For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
 };

-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min.  */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.  */
 #define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667.  */

 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
 {
-  return sv_call_f32 (logf, x, y, cmp);
+  return sv_call_f32 (
+      logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }

 /* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);

-  svuint32_t u = svreinterpret_u32 (x);
-  svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);

   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u = svsub_x (pg, u, Off);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend.  */
-  u = svand_x (pg, u, Mask);
-  u = svadd_x (pg, u, Off);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend.  */
+
+  svuint32_t u = svand_x (pg, u_off, Mask);
+  u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);

   /* y = log(1+r) + n*ln2.  */
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);

   /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).  */
   svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
   svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
   p = svmla_x (pg, r, n, d->ln2);

   if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+    return special_case (u_off, p, r2, y, cmp);
   return svmla_x (pg, p, r2, y);
 }
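A final note on the poly_0246/poly_1357 split seen in all three files: it exists to feed svmla_lane. The odd coefficients are loaded once as a quadword and selected per multiply by lane index, while each even coefficient becomes an FMLA accumulator, so the polynomial evaluation itself needs no predicated moves. A reduced sketch of the pairwise Horner scheme for a generic order-7 polynomial (coefficient values are placeholders, not the routines' constants):

#include <arm_sve.h>

/* c0 + c1 r + ... + c7 r^7 evaluated as
   q01 + r2*(q23 + r2*(q45 + r2*q67)), where qk = c_even + r*c_odd.  */
static const float poly_0246[4] = { 1.0f, 0.5f, 0.25f, 0.125f };  /* c0 c2 c4 c6.  */
static const float poly_1357[4] = { 1.0f, 0.5f, 0.25f, 0.125f };  /* c1 c3 c5 c7.  */

svfloat32_t
pairwise_horner (svfloat32_t r)
{
  svbool_t pt = svptrue_b32 ();
  svfloat32_t r2 = svmul_x (pt, r, r);
  /* One load replicates the four odd coefficients into every quadword.  */
  svfloat32_t p_1357 = svld1rq (pt, &poly_1357[0]);
  svfloat32_t q_01 = svmla_lane (svdup_f32 (poly_0246[0]), r, p_1357, 0);
  svfloat32_t q_23 = svmla_lane (svdup_f32 (poly_0246[1]), r, p_1357, 1);
  svfloat32_t q_45 = svmla_lane (svdup_f32 (poly_0246[2]), r, p_1357, 2);
  svfloat32_t q_67 = svmla_lane (svdup_f32 (poly_0246[3]), r, p_1357, 3);
  svfloat32_t y = svmla_x (pt, q_45, r2, q_67);
  y = svmla_x (pt, q_23, r2, y);
  return svmla_x (pt, q_01, r2, y);
}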