x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation
Some variables in pcpu_hot — currently current_task and top_of_stack — are actually per-thread variables implemented as per-CPU variables, and are thus stable for the duration of the respective task. There is already an attempt to eliminate redundant reads from these variables using the this_cpu_read_stable() asm macro, which hides the dependency on the read memory address. However, the compiler has only a limited ability to eliminate asm common subexpressions, so this approach has had limited success. The solution is to allow more aggressive elimination by aliasing pcpu_hot into a const-qualified const_pcpu_hot, and to read stable per-CPU variables from this constant copy. The current per-CPU infrastructure does not support reads from const-qualified variables. However, when the compiler supports segment qualifiers, it is possible to declare the const-aliased variable in the relevant named address space. The compiler then considers access to a variable declared in this way as a read from a constant location, and will optimize reads from the variable accordingly. By implementing the const-qualified const_pcpu_hot, the compiler can eliminate redundant reads from the constant variables, reducing the number of loads from current_task from 3766 to 3217 on a test build, a -14.6% reduction. The reduction of loads translates to the following code savings: text data bss dec hex filename 25,477,353 4389456 808452 30675261 1d4113d vmlinux-old.o 25,476,074 4389440 808452 30673966 1d40c2e vmlinux-new.o representing a code size reduction of -1279 bytes. [ mingo: Updated the changelog, EXPORT(const_pcpu_hot). ] Co-developed-by: Nadav Amit <namit@vmware.com> Signed-off-by: Nadav Amit <namit@vmware.com> Signed-off-by: Uros Bizjak <ubizjak@gmail.com> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/r/20231020162004.135244-1-ubizjak@gmail.com
This commit is contained in:
parent
59bec00ace
commit
ed2f752e0e
6 changed files with 16 additions and 4 deletions
|
@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);
|
|||
|
||||
DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
|
||||
|
||||
/* const-qualified alias to pcpu_hot, aliased by linker. */
|
||||
DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
|
||||
const_pcpu_hot);
|
||||
|
||||
static __always_inline struct task_struct *get_current(void)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
|
||||
return const_pcpu_hot.current_task;
|
||||
|
||||
return this_cpu_read_stable(pcpu_hot.current_task);
|
||||
}
|
||||
|
||||
|
|
|
@ -413,9 +413,9 @@ do { \
|
|||
* accessed while this_cpu_read_stable() allows the value to be cached.
|
||||
* this_cpu_read_stable() is more efficient and can be used if its value
|
||||
* is guaranteed to be valid across cpus. The current users include
|
||||
* get_current() and get_thread_info() both of which are actually
|
||||
* per-thread variables implemented as per-cpu variables and thus
|
||||
* stable for the duration of the respective task.
|
||||
* pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
|
||||
* actually per-thread variables implemented as per-CPU variables and
|
||||
* thus stable for the duration of the respective task.
|
||||
*/
|
||||
#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
|
||||
#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
|
||||
|
|
|
@ -518,6 +518,9 @@ static __always_inline unsigned long current_top_of_stack(void)
|
|||
* and around vm86 mode and sp0 on x86_64 is special because of the
|
||||
* entry trampoline.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
|
||||
return pcpu_hot.top_of_stack;
|
||||
|
||||
return this_cpu_read_stable(pcpu_hot.top_of_stack);
|
||||
}
|
||||
|
||||
|
|
|
@ -2051,6 +2051,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
|
|||
.top_of_stack = TOP_OF_INIT_STACK,
|
||||
};
|
||||
EXPORT_PER_CPU_SYMBOL(pcpu_hot);
|
||||
EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
|
||||
|
|
|
@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
|
|||
#endif
|
||||
|
||||
jiffies = jiffies_64;
|
||||
const_pcpu_hot = pcpu_hot;
|
||||
|
||||
#if defined(CONFIG_X86_64)
|
||||
/*
|
||||
|
|
|
@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
|
|||
*/
|
||||
#define ___ADDRESSABLE(sym, __attrs) \
|
||||
static void * __used __attrs \
|
||||
__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
|
||||
__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
|
||||
#define __ADDRESSABLE(sym) \
|
||||
___ADDRESSABLE(sym, __section(".discard.addressable"))
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue