From 3b1c5a539b7b8cb833f22012d1a95a4847594747 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Mon, 11 Nov 2024 17:38:44 -0300
Subject: [PATCH] math: Add internal roundeven_finite

Some CORE-MATH routines uses roundeven and most of ISA do not have
an specific instruction for the operation.  In this case, the call
will be routed to generic implementation.

However, if the ISA does support round() and ctz() there is a better
alternative (as used by CORE-MATH).

This patch adds such optimization and also enables it on powerpc.
On a power10 it shows the following improvement:

expm1f                      master      patched       improvement
latency                     9.8574       7.0139            28.85%
reciprocal-throughput       4.3742       2.6592            39.21%

Checked on powerpc64le-linux-gnu and aarch64-linux-gnu.

Reviewed-by: DJ Delorie <dj@redhat.com>
---
 sysdeps/ieee754/flt-32/e_gammaf_r.c  |  2 +-
 sysdeps/ieee754/flt-32/math_config.h | 27 +++++++++++++++++++++++++++
 sysdeps/ieee754/flt-32/s_expm1f.c    |  2 +-
 sysdeps/ieee754/flt-32/s_tanf.c      |  2 +-
 sysdeps/powerpc/fpu/math_private.h   |  5 +++++
 5 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/sysdeps/ieee754/flt-32/e_gammaf_r.c b/sysdeps/ieee754/flt-32/e_gammaf_r.c
index 6b1f95d50f..66e8caee0b 100644
--- a/sysdeps/ieee754/flt-32/e_gammaf_r.c
+++ b/sysdeps/ieee754/flt-32/e_gammaf_r.c
@@ -140,7 +140,7 @@ __ieee754_gammaf_r (float x, int *signgamp)
    };
 
   double m = z - 0x1.7p+1;
-  double i = roundeven (m);
+  double i = roundeven_finite (m);
   double step = copysign (1.0, i);
   double d = m - i, d2 = d * d, d4 = d2 * d2, d8 = d4 * d4;
   double f = (c[0] + d * c[1]) + d2 * (c[2] + d * c[3])
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index dc07ebd459..b30a03eeb4 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -57,6 +57,33 @@ static inline int32_t
 converttoint (double_t x);
 #endif
 
+#ifndef ROUNDEVEN_INTRINSICS
+/* When set, roundeven_finite will route to the internal roundeven function.  */
+# define ROUNDEVEN_INTRINSICS 1
+#endif
+
+/* Round x to nearest integer value in floating-point format, rounding halfway
+  cases to even.  If the input is non finite the result is unspecified.  */
+static inline double
+roundeven_finite (double x)
+{
+  if (!isfinite (x))
+    __builtin_unreachable ();
+#if ROUNDEVEN_INTRINSICS
+  return roundeven (x);
+#else
+  double y = round (x);
+  if (fabs (x - y) == 0.5)
+    {
+      union { double f; uint64_t i; } u = {y};
+      union { double f; uint64_t i; } v = {y - copysign (1.0, x)};
+      if (__builtin_ctzll (v.i) > __builtin_ctzll (u.i))
+        y = v.f;
+    }
+  return y;
+#endif
+}
+
 static inline uint32_t
 asuint (float f)
 {
diff --git a/sysdeps/ieee754/flt-32/s_expm1f.c b/sysdeps/ieee754/flt-32/s_expm1f.c
index edd7c9acf8..a36e5781f5 100644
--- a/sysdeps/ieee754/flt-32/s_expm1f.c
+++ b/sysdeps/ieee754/flt-32/s_expm1f.c
@@ -95,7 +95,7 @@ __expm1f (float x)
       return __math_oflowf (0);
     }
   double a = iln2 * z;
-  double ia = roundeven (a);
+  double ia = roundeven_finite (a);
   double h = a - ia;
   double h2 = h * h;
   uint64_t u = asuint64 (ia + big);
diff --git a/sysdeps/ieee754/flt-32/s_tanf.c b/sysdeps/ieee754/flt-32/s_tanf.c
index ff63e72d6b..dfe56fc2a0 100644
--- a/sysdeps/ieee754/flt-32/s_tanf.c
+++ b/sysdeps/ieee754/flt-32/s_tanf.c
@@ -38,7 +38,7 @@ rltl (float z, int *q)
   double x = z;
   double idl = -0x1.b1bbead603d8bp-32 * x;
   double idh = 0x1.45f306ep-1 * x;
-  double id = roundeven (idh);
+  double id = roundeven_finite (idh);
   *q = (int64_t) id;
   return (idh - id) + idl;
 }
diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
index 9ef35b20cd..b22f53d366 100644
--- a/sysdeps/powerpc/fpu/math_private.h
+++ b/sysdeps/powerpc/fpu/math_private.h
@@ -59,4 +59,9 @@ __ieee754_sqrtf128 (_Float128 __x)
 #define _GL_HAS_BUILTIN_ILOGB 0
 #endif
 
+#ifdef _ARCH_PWR6
+/* ISA 2.03 provides frin/round() and cntlzw/ctznll().  */
+# define ROUNDEVEN_INTRINSICS 0
+#endif
+
 #endif /* _PPC_MATH_PRIVATE_H_ */