glibc/sysdeps/ieee754/flt-32/math_config.h
Adhemerval Zanella Netto cf9cf33199 math: Improve fmodf
This uses a new algorithm similar to already proposed earlier [1].
With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers),
the simplest implementation is:

   mx * 2^ex == 2 * mx * 2^(ex - 1)

   while (ex > ey)
     {
       mx *= 2;
       --ex;
       mx %= my;
     }

With mx/my being the mantissas of single-precision floating-point numbers,
each step of the argument reduction can be improved to handle 8 bits at a
time (the bit width of uint32_t minus MANTISSA_WIDTH plus the sign bit,
i.e. 32 - (23 + 1)):

   while (ex > ey)
     {
       mx <<= 8;
       ex -= 8;
       mx %= my;
     }

The implementation uses the clz and ctz builtins, along with shifts, to
convert hx/hy back to floats.  Unlike the original patch, this patch
assumes that modulo/divide operations are slow, so it uses multiplication
by inverted values instead.

I see the following performance improvements using the fmod benchtests
(only the 'mean' result is shown):

  Architecture     | Input           | master   | patch
  -----------------|-----------------|----------|--------
  x86_64 (Ryzen 9) | subnormals      | 17.2549  | 12.0318
  x86_64 (Ryzen 9) | normal          | 85.4096  | 49.9641
  x86_64 (Ryzen 9) | close-exponents | 19.1072  | 15.8224
  aarch64 (N1)     | subnormal       | 10.2182  | 6.81778
  aarch64 (N1)     | normal          | 60.0616  | 20.3667
  aarch64 (N1)     | close-exponents | 11.5256  | 8.39685

I also see similar improvements on arm-linux-gnueabihf when running on
the N1 aarch64 chips, where it uses a lot of soft-fp implementations (for
modulo and multiplication):

  Architecture     | Input           | master   | patch
  -----------------|-----------------|----------|--------
  armhf (N1)       | subnormal       | 11.6662  | 10.8955
  armhf (N1)       | normal          | 69.2759  | 34.1524
  armhf (N1)       | close-exponents | 13.6472  | 18.2131

Instead of using the math_private.h definitions, I used math_config.h,
which is used by the newer math implementations.

Co-authored-by: kirill <kirill.okhotnikov@gmail.com>

[1] https://sourceware.org/pipermail/libc-alpha/2020-November/119794.html
Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
2023-04-03 16:45:18 -03:00

215 lines
5.1 KiB
C

/* Configuration for math routines.
Copyright (C) 2017-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _MATH_CONFIG_H
#define _MATH_CONFIG_H
#include <math.h>
#include <math_private.h>
#include <nan-high-order-bit.h>
#include <stdint.h>
/* Build-time configuration knobs.  Each one is wrapped in #ifndef, so a
   definition made before this point takes precedence over the default
   given here.  */
#ifndef WANT_ROUNDING
/* Correct special case results in non-nearest rounding modes. */
# define WANT_ROUNDING 1
#endif
#ifndef WANT_ERRNO
/* Set errno according to ISO C with (math_errhandling & MATH_ERRNO) != 0. */
# define WANT_ERRNO 1
#endif
#ifndef WANT_ERRNO_UFLOW
/* Set errno to ERANGE if result underflows to 0 (in all rounding modes).
   By default this is enabled only when both WANT_ROUNDING and WANT_ERRNO
   are nonzero.  */
# define WANT_ERRNO_UFLOW (WANT_ROUNDING && WANT_ERRNO)
#endif
#ifndef TOINT_INTRINSICS
/* When set, the roundtoint and converttoint functions are provided with
the semantics documented below.  The default is 0 (not provided).  */
# define TOINT_INTRINSICS 0
#endif
/* The two declarations below are only visible when TOINT_INTRINSICS is
   nonzero.  They are declarations only; no definition appears in this
   header.  */
#if TOINT_INTRINSICS
/* Round x to nearest int in all rounding modes, ties have to be rounded
consistently with converttoint so the results match. If the result
would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */
static inline double_t
roundtoint (double_t x);
/* Convert x to nearest int in all rounding modes, ties have to be rounded
consistently with roundtoint. If the result is not representable in an
int32_t then the semantics is unspecified. */
static inline int32_t
converttoint (double_t x);
#endif
/* Return the object representation of the float F as a 32-bit unsigned
   integer.  The bits are read back through a union, so no numeric
   conversion takes place.  */
static inline uint32_t
asuint (float f)
{
  union
  {
    uint32_t bits;
    float value;
  } pun;

  pun.value = f;
  return pun.bits;
}
/* Return the float whose object representation is the 32-bit pattern I.
   The bits are reinterpreted through a union, so no numeric conversion
   takes place.  */
static inline float
asfloat (uint32_t i)
{
  union
  {
    float value;
    uint32_t bits;
  } pun;

  pun.bits = i;
  return pun.value;
}
/* Return the object representation of the double F as a 64-bit unsigned
   integer.  The bits are read back through a union, so no numeric
   conversion takes place.  */
static inline uint64_t
asuint64 (double f)
{
  union
  {
    uint64_t bits;
    double value;
  } pun;

  pun.value = f;
  return pun.bits;
}
/* Return the double whose object representation is the 64-bit pattern I.
   The bits are reinterpreted through a union, so no numeric conversion
   takes place.  */
static inline double
asdouble (uint64_t i)
{
  union
  {
    double value;
    uint64_t bits;
  } pun;

  pun.bits = i;
  return pun.value;
}
/* Return nonzero if X is a signaling NaN, judged purely from its bit
   pattern.  HIGH_ORDER_BIT_IS_SET_FOR_SNAN selects which state of the
   quiet bit (0x00400000) marks a NaN as signaling.  */
static inline int
issignalingf_inline (float x)
{
  const uint32_t bits = asuint (x);

  if (!HIGH_ORDER_BIT_IS_SET_FOR_SNAN)
    /* Quiet bit clear means signaling.  Flipping the quiet bit and
       doubling (which discards the sign bit in 32-bit arithmetic) puts
       exactly the signaling NaNs above the doubled 0x7fc00000 bound;
       infinity lands on the bound and quiet NaNs fall below it.  */
    return 2 * (bits ^ 0x00400000) > 2 * 0x7fc00000UL;

  /* Quiet bit set means signaling: exponent all ones plus that bit.  */
  return (bits & 0x7fc00000) == 0x7fc00000;
}
/* Layout constants for the 32-bit float bit pattern (the value produced
   by asuint).  */
/* Total number of bits in the pattern.  */
#define BIT_WIDTH 32
/* Number of stored mantissa bits (bits 0..22).  */
#define MANTISSA_WIDTH 23
/* Number of exponent bits (bits 23..30).  */
#define EXPONENT_WIDTH 8
/* Selects the mantissa field.  */
#define MANTISSA_MASK 0x007fffff
/* Selects the exponent field.  */
#define EXPONENT_MASK 0x7f800000
/* Selects exponent and mantissa together, i.e. everything but bit 31.  */
#define EXP_MANT_MASK 0x7fffffff
/* Most significant mantissa bit (bit 22).  */
#define QUIET_NAN_MASK 0x00400000
/* Bit 31, the sign bit.  */
#define SIGN_MASK 0x80000000
/* Return true if the bit pattern X encodes a NaN: with the sign bit
   masked off, the value is greater than EXPONENT_MASK, i.e. the exponent
   field is all ones and the mantissa field is nonzero.  */
static inline bool
is_nan (uint32_t x)
{
  const uint32_t magnitude = x & EXP_MANT_MASK;

  return magnitude > EXPONENT_MASK;
}
/* Return the mantissa field of the bit pattern X, i.e. X with every bit
   outside MANTISSA_MASK cleared.  */
static inline uint32_t
get_mantissa (uint32_t x)
{
  const uint32_t fraction = x & MANTISSA_MASK;

  return fraction;
}
/* Build a float from the integer significand X, the exponent EP and the
   sign bits S:

     result = X * 2^(EP+1 - exponent_bias)

   X is first shifted left until its leading set bit sits at bit
   MANTISSA_WIDTH, and EP is reduced by the same amount.  That leading bit
   is then added into the exponent field together with EP, which is where
   the "+1" in the formula comes from.  If EP drops below zero, X is
   shifted back right by the deficit and the exponent field is left at
   zero.  S is added in unchanged, so the caller must pass it already
   positioned (e.g. 0 or SIGN_MASK).

   NB: zero is not supported (__builtin_clz (0) is undefined), and X must
   not have any bit set above bit MANTISSA_WIDTH, otherwise the
   normalising shift count would be negative.

   The value is assembled with asfloat, so the function returns float
   rather than double; a float converts implicitly and exactly to double,
   so callers that store the result in a double are unaffected.  */
static inline float
make_float (uint32_t x, int ep, uint32_t s)
{
  /* Left shift that moves the leading set bit of X to bit MANTISSA_WIDTH.  */
  int lz = __builtin_clz (x) - EXPONENT_WIDTH;
  x <<= lz;
  ep -= lz;

  if (__glibc_unlikely (ep < 0 || x == 0))
    {
      /* Exponent underflow: denormalise the significand instead.  */
      x >>= -ep;
      ep = 0;
    }

  /* The arithmetic is done in uint32_t; a normalised X carries into the
     exponent field through its bit MANTISSA_WIDTH.  */
  return asfloat (s + x + (ep << MANTISSA_WIDTH));
}
/* Shorthand for the GCC noinline function attribute.  */
#define NOINLINE __attribute__ ((noinline))
/* Hidden-visibility helpers that are only declared here; their
   definitions are not part of this header.
   NOTE(review): judging by the names these produce the overflow,
   underflow, possible-underflow, divide-by-zero and invalid-operation
   results for float routines, and the uint32_t argument is presumably a
   sign selector -- confirm against the definitions.  */
attribute_hidden float __math_oflowf (uint32_t);
attribute_hidden float __math_uflowf (uint32_t);
attribute_hidden float __math_may_uflowf (uint32_t);
attribute_hidden float __math_divzerof (uint32_t);
attribute_hidden float __math_invalidf (float);
/* Shared between expf, exp2f, exp10f, and powf. */
/* The lookup table has 1 << EXP2F_TABLE_BITS (32) entries and each
   polynomial array has EXP2F_POLY_ORDER (3) coefficients.  */
#define EXP2F_TABLE_BITS 5
#define EXP2F_POLY_ORDER 3
/* Read-only data object, declared here with hidden visibility and defined
   elsewhere.  The member order is part of the layout and must match that
   definition.
   NOTE(review): the "_scaled" members look like variants of shift / poly
   pre-multiplied by a table-size factor -- confirm against the definition
   and the routines that use them.  */
extern const struct exp2f_data
{
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
} __exp2f_data attribute_hidden;
/* The lookup table has 1 << LOGF_TABLE_BITS (16) entries; the polynomial
   array stores LOGF_POLY_ORDER - 1 (3) coefficients.  */
#define LOGF_TABLE_BITS 4
#define LOGF_POLY_ORDER 4
/* Read-only data object, declared here with hidden visibility and defined
   elsewhere.  The member order is part of the layout and must match that
   definition.
   NOTE(review): invc / logc are presumably 1/c and log(c) for a per-entry
   centre value c, and ln2 the constant log(2) -- confirm against the
   definition.  */
extern const struct logf_data
{
struct
{
double invc, logc;
} tab[1 << LOGF_TABLE_BITS];
double ln2;
double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */
} __logf_data attribute_hidden;
/* The lookup table has 1 << LOG2F_TABLE_BITS (16) entries and the
   polynomial array has LOG2F_POLY_ORDER (4) coefficients.  */
#define LOG2F_TABLE_BITS 4
#define LOG2F_POLY_ORDER 4
/* Read-only data object, declared here with hidden visibility and defined
   elsewhere.  The member order is part of the layout and must match that
   definition.
   NOTE(review): invc / logc are presumably 1/c and log2(c) for a per-entry
   centre value c -- confirm against the definition.  */
extern const struct log2f_data
{
struct
{
double invc, logc;
} tab[1 << LOG2F_TABLE_BITS];
double poly[LOG2F_POLY_ORDER];
} __log2f_data attribute_hidden;
/* The lookup table has 1 << POWF_LOG2_TABLE_BITS (16) entries and the
   polynomial array has POWF_LOG2_POLY_ORDER (5) coefficients.  */
#define POWF_LOG2_TABLE_BITS 4
#define POWF_LOG2_POLY_ORDER 5
/* POWF_SCALE_BITS equals EXP2F_TABLE_BITS when TOINT_INTRINSICS is
   enabled and 0 otherwise.  */
#if TOINT_INTRINSICS
# define POWF_SCALE_BITS EXP2F_TABLE_BITS
#else
# define POWF_SCALE_BITS 0
#endif
/* 2^POWF_SCALE_BITS as a double.  */
#define POWF_SCALE ((double) (1 << POWF_SCALE_BITS))
/* Read-only data object, declared here with hidden visibility and defined
   elsewhere.  The member order is part of the layout and must match that
   definition.
   NOTE(review): invc / logc are presumably 1/c and a (possibly
   POWF_SCALE-scaled) log2(c) for a per-entry centre value c -- confirm
   against the definition.  */
extern const struct powf_log2_data
{
struct
{
double invc, logc;
} tab[1 << POWF_LOG2_TABLE_BITS];
double poly[POWF_LOG2_POLY_ORDER];
} __powf_log2_data attribute_hidden;
#endif