Mirror of git://sourceware.org/git/glibc.git, synced 2025-03-06 20:58:33 +01:00
The CORE-MATH implementation is correctly rounded (for any rounding mode) and shows better performance compared to the generic tanf. The code was adapted to glibc style, to use the definitions from math_config.h, to remove errno handling, and to use a generic 128 bit routine for ABIs that do not support it natively.

Benchtest on x86_64 (Ryzen 9 5900X, gcc 14.2.1), aarch64 (neoverse1, gcc 13.2.1), and powerpc (POWER10, gcc 13.2.1):

latency                 master    patched   improvement
x86_64                  82.3961   54.8052   33.49%
x86_64v2                82.3415   54.8052   33.44%
x86_64v3                69.3661   50.4864   27.22%
i686                    219.271   45.5396   79.23%
aarch64                 29.2127   19.1951   34.29%
power10                 19.5060   16.2760   16.56%

reciprocal-throughput   master    patched   improvement
x86_64                  28.3976   19.7334   30.51%
x86_64v2                28.4568   19.7334   30.65%
x86_64v3                21.1815   16.1811   23.61%
i686                    105.016   15.1426   85.58%
aarch64                 18.1573   10.7681   40.70%
power10                 8.7207    8.7097    0.13%

Signed-off-by: Alexei Sibidanov <sibid@uvic.ca>
Signed-off-by: Paul Zimmermann <Paul.Zimmermann@inria.fr>
Signed-off-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: DJ Delorie <dj@redhat.com>
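For context, "correctly rounded" means the returned float equals the infinitely precise tangent rounded to float in the current rounding mode. A minimal, non-exhaustive spot check against a higher precision reference could look like the sketch below; it is not part of the patch, the input value is arbitrary, and tanl only serves as a sanity reference, not a proof of correct rounding:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Arbitrary test input; a real check would sweep many values and
         rounding modes (e.g. via fesetround).  */
      float x = 0x1.5bf0a8p+1f;
      float got = tanf (x);
      float ref = (float) tanl ((long double) x);
      printf ("tanf(%a) = %a, long double reference rounds to %a\n",
              x, got, ref);
      return got == ref ? 0 : 1;
    }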
/* Internal 128 bit int support.
   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#ifndef _MATH_INT128_H
#define _MATH_INT128_H
/* Limited support for an internal 128 bit integer type, used by some math
   implementations.  It uses the compiler builtin type if supported,
   otherwise it is emulated.  Only the unsigned type and some operations
   are currently supported:

   - u128: the 128 bit unsigned type.
   - u128_high: return the high part of the number.
   - u128_low: return the low part of the number.
   - u128_from_u64: create a 128 bit number from a 64 bit one.
   - u128_mul: multiply two 128 bit numbers.
   - u128_add: add two 128 bit numbers.
   - u128_lshift: left shift a number.
   - u128_rshift: right shift a number.
 */
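
/* Illustrative example (not part of this header): the helpers above can be
   used to obtain the high 64 bits of a 64x64->128 bit product, regardless
   of whether the 128 bit type is native or emulated:

     static inline uint64_t
     mul_high (uint64_t a, uint64_t b)
     {
       u128 p = u128_mul (u128_from_u64 (a), u128_from_u64 (b));
       return u128_high (p);
     }
*/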

#if defined __BITINT_MAXWIDTH__ && __BITINT_MAXWIDTH__ >= 128
typedef unsigned _BitInt(128) u128;
# define __MATH_INT128_BUILTIN_TYPE 1
#elif defined __SIZEOF_INT128__
typedef unsigned __int128 u128;
# define __MATH_INT128_BUILTIN_TYPE 1
#else
# define __MATH_INT128_BUILTIN_TYPE 0
#endif
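
/* With a compiler provided 128 bit type the helpers map directly to the
   builtin operators; otherwise the value is emulated as a pair of 64 bit
   words.  */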
#if __MATH_INT128_BUILTIN_TYPE
# define u128_high(__x)        (uint64_t)((__x) >> 64)
# define u128_low(__x)         (uint64_t)(__x)
# define u128_from_u64(__x)    (u128)(__x)
# define u128_mul(__x, __y)    (__x) * (__y)
# define u128_add(__x, __y)    (__x) + (__y)
# define u128_lshift(__x, __y) (__x) << (__y)
# define u128_rshift(__x, __y) (__x) >> (__y)
#else
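/* Fallback representation: the 128 bit value is split into explicit low
   and high 64 bit words.  */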
typedef struct
{
  uint64_t low;
  uint64_t high;
} u128;

# define u128_high(__x)     (__x).high
# define u128_low(__x)      (__x).low
# define u128_from_u64(__x) (u128){.low = (__x), .high = 0}

# define MASK32 (UINT64_C(0xffffffff))
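
/* Add two u128 values; the carry out of the low words is detected via the
   unsigned wraparound of x.low + y.low.  */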
static u128 u128_add (u128 x, u128 y)
{
  bool carry = x.low + y.low < x.low;
  return (u128) { .high = x.high + y.high + carry, .low = x.low + y.low };
}
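
/* Shift X left by N bits: for N below 64 bits move across the word
   boundary, for N in [64,127] only the low word contributes, and larger
   shifts yield zero.  */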
static u128 u128_lshift (u128 x, unsigned int n)
{
  switch (n)
    {
    case 0: return x;
    case 1 ... 63: return (u128) { .high = (x.high << n) | (x.low >> (64 - n)),
                                   .low = x.low << n };
    case 64 ... 127: return (u128) { .high = x.low << (n - 64), .low = 0};
    default: return (u128) { .high = 0, .low = 0 };
    }
}
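
/* Shift X right by N bits, the mirror image of u128_lshift: for N in
   [64,127] only the high word contributes, and larger shifts yield
   zero.  */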
static u128 u128_rshift (u128 x, unsigned int n)
{
  switch (n)
    {
    case 0: return x;
    case 1 ... 63: return (u128) { .high = x.high >> n,
                                   .low = (x.high << (64 - n)) | (x.low >> n) };
    case 64 ... 127: return (u128) { .high = 0, .low = x.high >> (n - 64) };
    default: return (u128) { .high = 0, .low = 0 };
    }
}
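
/* Multiply two u128 values by schoolbook multiplication on 32 bit limbs,
   taking a shorter path when both inputs fit in 64 bits.  */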
static u128 u128_mul (u128 x, u128 y)
{
  if (x.high == 0 && y.high == 0)
    {
      uint64_t x0 = x.low & MASK32;
      uint64_t x1 = x.low >> 32;
      uint64_t y0 = y.low & MASK32;
      uint64_t y1 = y.low >> 32;
      u128 x0y0 = { .high = 0, .low = x0 * y0 };
      u128 x0y1 = { .high = 0, .low = x0 * y1 };
      u128 x1y0 = { .high = 0, .low = x1 * y0 };
      u128 x1y1 = { .high = x1 * y1, .low = 0 };
      /* x0y0 + ((x0y1 + x1y0) << 32) + x1y1 */
      return u128_add (u128_add (x0y0,
                                 u128_lshift (u128_add (x0y1, x1y0),
                                              32)),
                       x1y1);
    }
  else
    {
      uint64_t x0 = x.low & MASK32;
      uint64_t x1 = x.low >> 32;
      uint64_t x2 = x.high & MASK32;
      uint64_t x3 = x.high >> 32;
      uint64_t y0 = y.low & MASK32;
      uint64_t y1 = y.low >> 32;
      uint64_t y2 = y.high & MASK32;
      uint64_t y3 = y.high >> 32;
      u128 x0y0 = { .high = 0, .low = x0 * y0 };
      u128 x0y1 = { .high = 0, .low = x0 * y1 };
      u128 x0y2 = { .high = 0, .low = x0 * y2 };
      u128 x0y3 = { .high = 0, .low = x0 * y3 };
      u128 x1y0 = { .high = 0, .low = x1 * y0 };
      u128 x1y1 = { .high = 0, .low = x1 * y1 };
      u128 x1y2 = { .high = 0, .low = x1 * y2 };
      u128 x2y0 = { .high = 0, .low = x2 * y0 };
      u128 x2y1 = { .high = 0, .low = x2 * y1 };
      u128 x3y0 = { .high = 0, .low = x3 * y0 };
      /* x0y0 + ((x0y1 + x1y0) << 32) + ((x0y2 + x1y1 + x2y0) << 64) +
         ((x0y3 + x1y2 + x2y1 + x3y0) << 96) */
      u128 r0 = u128_add (x0y0,
                          u128_lshift (u128_add (x0y1, x1y0),
                                       32));
      u128 r1 = u128_add (u128_lshift (u128_add (u128_add (x0y2, x1y1), x2y0),
                                       64),
                          u128_lshift (u128_add (u128_add (x0y3, x1y2),
                                                 u128_add (x2y1, x3y0)),
                                       96));
      return u128_add (r0, r1);
    }
}
#endif /* __MATH_INT128_BUILTIN_TYPE */

#endif /* _MATH_INT128_H */