x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
This is just a refactor and there should be no behavioral change from this commit. The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob for controlling whether we use non-temporal memset rather than having extra logic based on vendor.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
parent 7da0886247
commit b93dddfaf4
2 changed files with 23 additions and 8 deletions
sysdeps/x86/cpu-features.c

@@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
+  /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+     as of writing this, we only have benchmarks indicatings it profitability
+     on Intel/AMD.  */
+  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+      |= bit_arch_Avoid_Non_Temporal_Memset;
+
   cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
@@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 
       update_active (cpu_features);
 
+      /* Benchmarks indicate non-temporal memset can be profitable on Intel
+         hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+          &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
       if (family == 0x06)
         {
           model += extended_model;
@@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
 
       ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
 
+      /* Benchmarks indicate non-temporal memset can be profitable on AMD
+         hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+          &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
         {
           /* Since the FMA4 bit is in CPUID_INDEX_80000001 and

sysdeps/x86/dl-cacheinfo.h

@@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  /* Non-temporal stores are more performant on Intel and AMD hardware above
-     non_temporal_threshold.  Enable this for both Intel and AMD hardware.  */
-  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
-      && (cpu_features->basic.kind == arch_kind_intel
-          || cpu_features->basic.kind == arch_kind_amd))
-    memset_non_temporal_threshold = non_temporal_threshold;
-
   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
      cases slower than the vectorized path (and for some alignments,
      it is really slow, check BZ #30994).  */
@@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (tunable_size != 0)
     shared = tunable_size;
 
+  /* Non-temporal stores are more performant on some hardware above
+     non_temporal_threshold.  Currently Prefer_Non_Temporal is set for for both
+     Intel and AMD hardware.  */
+  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+    memset_non_temporal_threshold = non_temporal_threshold;
+
   tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
   if (tunable_size > minimum_non_temporal_threshold
       && tunable_size <= maximum_non_temporal_threshold)
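For readers who want the decision this knob controls in isolation, here is a minimal, self-contained C sketch. It is not glibc's actual dispatch code: the helper names (init_knob, memset_dispatch) and the plain global variables are illustrative stand-ins for the real feature bit, tunable, and ifunc machinery. It only shows the shape of the refactor, namely that the vendor check flips a single knob once, and the memset path consults that knob (via the derived threshold) alone.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative stand-ins for the real feature bit and threshold.  */
static int avoid_non_temporal_memset = 1;               /* default: avoid  */
static size_t memset_non_temporal_threshold = SIZE_MAX;

/* Vendor knowledge is applied once, here, to the single knob; nothing
   downstream looks at the vendor again.  */
static void
init_knob (int vendor_is_intel_or_amd, size_t non_temporal_threshold)
{
  if (vendor_is_intel_or_amd)
    avoid_non_temporal_memset = 0;
  if (!avoid_non_temporal_memset)
    memset_non_temporal_threshold = non_temporal_threshold;
}

static void *
memset_dispatch (void *dst, int c, size_t n)
{
  if (n >= memset_non_temporal_threshold)
    {
      /* A real implementation would use non-temporal (streaming) stores
         here; this sketch just falls back to the ordinary path.  */
      return memset (dst, c, n);
    }
  return memset (dst, c, n);   /* temporal (cached) path */
}

int
main (void)
{
  char buf[256];
  init_knob (1 /* Intel/AMD */, 128 /* non_temporal_threshold */);
  memset_dispatch (buf, 0, sizeof buf);
  printf ("memset_non_temporal_threshold = %zu\n",
          memset_non_temporal_threshold);
  return 0;
}

In glibc itself the equivalent decision is made once in dl_init_cacheinfo and the resulting threshold feeds the memset implementation; the sketch merely flattens that into two functions.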