mirror of
git://sourceware.org/git/glibc.git
synced 2025-03-06 20:58:33 +01:00
x86: Add separate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always the same. This includes both the exact value and whether non-temporal stores are profitable at all for a given arch. This patch adds `x86_memset_non_temporal_threshold`. Currently we disable non-temporal stores in memset for non-Intel vendors, as the only benchmarks showing their benefit have been on Intel hardware. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
5bf0ab8057
commit
46b5e98ef6
7 changed files with 49 additions and 6 deletions
|
@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
|
||||||
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
|
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
|
||||||
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
|
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
|
||||||
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
||||||
|
glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
||||||
glibc.cpu.x86_shstk:
|
glibc.cpu.x86_shstk:
|
||||||
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
|
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
|
||||||
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
|
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
|
||||||
|
@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages.
|
||||||
@cindex shared_cache_size tunables
|
@cindex shared_cache_size tunables
|
||||||
@cindex tunables, shared_cache_size
|
@cindex tunables, shared_cache_size
|
||||||
@cindex non_temporal_threshold tunables
|
@cindex non_temporal_threshold tunables
|
||||||
@cindex tunables, non_temporal_threshold
|
@cindex memset_non_temporal_threshold tunables
|
||||||
|
@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
|
||||||
|
|
||||||
@deftp {Tunable namespace} glibc.cpu
|
@deftp {Tunable namespace} glibc.cpu
|
||||||
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
|
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
|
||||||
|
@ -574,6 +576,18 @@ like memmove and memcpy.
|
||||||
This tunable is specific to i386 and x86-64.
|
This tunable is specific to i386 and x86-64.
|
||||||
@end deftp
|
@end deftp
|
||||||
|
|
||||||
|
@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
|
||||||
|
The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
|
||||||
|
the user to set the threshold in bytes for non temporal stores in
|
||||||
|
memset. Non temporal stores give a hint to the hardware to move data
|
||||||
|
directly to memory without displacing other data from the cache. This
|
||||||
|
tunable is used by some platforms to determine when to use non
|
||||||
|
temporal stores in memset.
|
||||||
|
|
||||||
|
This tunable is specific to i386 and x86-64.
|
||||||
|
@end deftp
|
||||||
|
|
||||||
|
|
||||||
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
|
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
|
||||||
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
|
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
|
||||||
set threshold in bytes to start using "rep movsb". The value must be
|
set threshold in bytes to start using "rep movsb". The value must be
|
||||||
|
|
|
@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
|
||||||
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
|
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
|
||||||
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
|
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
|
||||||
|
|
||||||
/* Threshold to use non temporal store. */
|
/* Threshold to use non temporal store in memmove. */
|
||||||
long int __x86_shared_non_temporal_threshold attribute_hidden;
|
long int __x86_shared_non_temporal_threshold attribute_hidden;
|
||||||
|
|
||||||
|
/* Threshold to use non temporal store in memset. */
|
||||||
|
long int __x86_memset_non_temporal_threshold attribute_hidden;
|
||||||
|
|
||||||
/* Threshold to use Enhanced REP MOVSB. */
|
/* Threshold to use Enhanced REP MOVSB. */
|
||||||
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
|
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
|
||||||
|
|
||||||
|
@ -77,6 +80,9 @@ init_cacheinfo (void)
|
||||||
__x86_shared_non_temporal_threshold
|
__x86_shared_non_temporal_threshold
|
||||||
= cpu_features->non_temporal_threshold;
|
= cpu_features->non_temporal_threshold;
|
||||||
|
|
||||||
|
__x86_memset_non_temporal_threshold
|
||||||
|
= cpu_features->memset_non_temporal_threshold;
|
||||||
|
|
||||||
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
|
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
|
||||||
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
|
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
|
||||||
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
|
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
|
||||||
|
|
|
@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||||
rep_movsb_threshold = 2112;
|
rep_movsb_threshold = 2112;
|
||||||
|
|
||||||
|
/* Non-temporal stores in memset have only been tested on Intel hardware.
|
||||||
|
Until we have benchmark data on other x86 processors, disable non-temporal
|
||||||
|
stores in memset. */
|
||||||
|
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
if (cpu_features->basic.kind == arch_kind_intel)
|
||||||
|
memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
|
||||||
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
cases slower than the vectorized path (and for some alignments,
|
cases slower than the vectorized path (and for some alignments,
|
||||||
it is really slow, check BZ #30994). */
|
it is really slow, check BZ #30994). */
|
||||||
|
@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
&& tunable_size <= maximum_non_temporal_threshold)
|
&& tunable_size <= maximum_non_temporal_threshold)
|
||||||
non_temporal_threshold = tunable_size;
|
non_temporal_threshold = tunable_size;
|
||||||
|
|
||||||
|
tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
|
||||||
|
if (tunable_size > minimum_non_temporal_threshold
|
||||||
|
&& tunable_size <= maximum_non_temporal_threshold)
|
||||||
|
memset_non_temporal_threshold = tunable_size;
|
||||||
|
|
||||||
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
|
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
|
||||||
if (tunable_size > minimum_rep_movsb_threshold)
|
if (tunable_size > minimum_rep_movsb_threshold)
|
||||||
rep_movsb_threshold = tunable_size;
|
rep_movsb_threshold = tunable_size;
|
||||||
|
@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
||||||
minimum_non_temporal_threshold,
|
minimum_non_temporal_threshold,
|
||||||
maximum_non_temporal_threshold);
|
maximum_non_temporal_threshold);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (
|
||||||
|
x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
|
||||||
|
minimum_non_temporal_threshold, maximum_non_temporal_threshold);
|
||||||
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
||||||
minimum_rep_movsb_threshold, SIZE_MAX);
|
minimum_rep_movsb_threshold, SIZE_MAX);
|
||||||
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
||||||
|
@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
cpu_features->data_cache_size = data;
|
cpu_features->data_cache_size = data;
|
||||||
cpu_features->shared_cache_size = shared;
|
cpu_features->shared_cache_size = shared;
|
||||||
cpu_features->non_temporal_threshold = non_temporal_threshold;
|
cpu_features->non_temporal_threshold = non_temporal_threshold;
|
||||||
|
cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
|
||||||
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
|
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
|
||||||
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
|
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
|
||||||
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
|
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
|
||||||
|
|
|
@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void)
|
||||||
cpu_features->shared_cache_size);
|
cpu_features->shared_cache_size);
|
||||||
print_cpu_features_value ("non_temporal_threshold",
|
print_cpu_features_value ("non_temporal_threshold",
|
||||||
cpu_features->non_temporal_threshold);
|
cpu_features->non_temporal_threshold);
|
||||||
|
print_cpu_features_value ("memset_non_temporal_threshold",
|
||||||
|
cpu_features->memset_non_temporal_threshold);
|
||||||
print_cpu_features_value ("rep_movsb_threshold",
|
print_cpu_features_value ("rep_movsb_threshold",
|
||||||
cpu_features->rep_movsb_threshold);
|
cpu_features->rep_movsb_threshold);
|
||||||
print_cpu_features_value ("rep_movsb_stop_threshold",
|
print_cpu_features_value ("rep_movsb_stop_threshold",
|
||||||
|
|
|
@ -30,6 +30,9 @@ glibc {
|
||||||
x86_non_temporal_threshold {
|
x86_non_temporal_threshold {
|
||||||
type: SIZE_T
|
type: SIZE_T
|
||||||
}
|
}
|
||||||
|
x86_memset_non_temporal_threshold {
|
||||||
|
type: SIZE_T
|
||||||
|
}
|
||||||
x86_rep_movsb_threshold {
|
x86_rep_movsb_threshold {
|
||||||
type: SIZE_T
|
type: SIZE_T
|
||||||
# Since there is overhead to set up REP MOVSB operation, REP
|
# Since there is overhead to set up REP MOVSB operation, REP
|
||||||
|
|
|
@ -944,8 +944,10 @@ struct cpu_features
|
||||||
/* Shared cache size for use in memory and string routines, typically
|
/* Shared cache size for use in memory and string routines, typically
|
||||||
L2 or L3 size. */
|
L2 or L3 size. */
|
||||||
unsigned long int shared_cache_size;
|
unsigned long int shared_cache_size;
|
||||||
/* Threshold to use non temporal store. */
|
/* Threshold to use non temporal store in memmove. */
|
||||||
unsigned long int non_temporal_threshold;
|
unsigned long int non_temporal_threshold;
|
||||||
|
/* Threshold to use non temporal store in memset. */
|
||||||
|
unsigned long int memset_non_temporal_threshold;
|
||||||
/* Threshold to use "rep movsb". */
|
/* Threshold to use "rep movsb". */
|
||||||
unsigned long int rep_movsb_threshold;
|
unsigned long int rep_movsb_threshold;
|
||||||
/* Threshold to stop using "rep movsb". */
|
/* Threshold to stop using "rep movsb". */
|
||||||
|
|
|
@ -24,9 +24,9 @@
|
||||||
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
|
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
|
||||||
4 VEC stores and store 4 * VEC at a time until done.
|
4 VEC stores and store 4 * VEC at a time until done.
|
||||||
6. On machines ERMS feature, if size is range
|
6. On machines ERMS feature, if size is range
|
||||||
[__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
|
[__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
|
||||||
then REP STOSB will be used.
|
then REP STOSB will be used.
|
||||||
7. If size >= __x86_shared_non_temporal_threshold, use a
|
7. If size >= __x86_memset_non_temporal_threshold, use a
|
||||||
non-temporal stores. */
|
non-temporal stores. */
|
||||||
|
|
||||||
#include <sysdep.h>
|
#include <sysdep.h>
|
||||||
|
@ -318,7 +318,7 @@ L(return_vzeroupper):
|
||||||
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
||||||
range for 2-byte jump encoding. */
|
range for 2-byte jump encoding. */
|
||||||
L(stosb_local):
|
L(stosb_local):
|
||||||
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
|
||||||
jae L(nt_memset)
|
jae L(nt_memset)
|
||||||
movzbl %sil, %eax
|
movzbl %sil, %eax
|
||||||
mov %RDX_LP, %RCX_LP
|
mov %RDX_LP, %RCX_LP
|
||||||
|
|
Loading…
Add table
Reference in a new issue