Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets to
improve code using cmpxchg64_local().

On 64-bit targets, the generated assembly improves from:

  3e28:	31 c0                	xor    %eax,%eax
  3e2a:	4d 0f b1 7d 00       	cmpxchg %r15,0x0(%r13)
  3e2f:	48 85 c0             	test   %rax,%rax
  3e32:	0f 85 9f 00 00 00    	jne    3ed7 <...>

to:

  3e28:	31 c0                	xor    %eax,%eax
  3e2a:	4d 0f b1 7d 00       	cmpxchg %r15,0x0(%r13)
  3e2f:	0f 85 9f 00 00 00    	jne    3ed4 <...>

where a TEST instruction after CMPXCHG is saved. The improvement for
32-bit targets is even more noticeable, because the double-word compare
after CMPXCHG8B gets eliminated.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/20240414161257.49145-1-ubizjak@gmail.com
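
For illustration, the caller-side difference the commit message describes
looks roughly like the sketch below. This is not taken from the patch: the
function names and the 'stamp' field are hypothetical, and only
arch_cmpxchg64_local() and arch_try_cmpxchg64_local() come from the patched
header. It assumes a kernel build context where <linux/types.h> and
<asm/cmpxchg.h> are already visible.

/*
 * Plain cmpxchg64_local() pattern: the caller has to compare the
 * returned old value itself, which is what emits the extra TEST
 * (or the double-word compare on 32-bit) after CMPXCHG.
 */
static bool update_stamp_cmpxchg(u64 *stamp, u64 old, u64 new)
{
	return arch_cmpxchg64_local(stamp, old, new) == old;
}

/*
 * try-variant pattern: the ZF produced by CMPXCHG is returned directly
 * through the asm's condition-code output, so the compiler can branch
 * on it without re-comparing; on failure, *old is updated with the
 * value currently in memory.
 */
static bool update_stamp_try_cmpxchg(u64 *stamp, u64 *old, u64 new)
{
	return arch_try_cmpxchg64_local(stamp, old, new);
}

Because the second form consumes the compare result straight from the
CMPXCHG flags, the follow-up TEST (64-bit) or double-word compare after
CMPXCHG8B (32-bit) disappears from the generated code.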
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CMPXCHG_64_H
#define _ASM_X86_CMPXCHG_64_H

#define arch_cmpxchg64(ptr, o, n)					\
({									\
	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
	arch_cmpxchg((ptr), (o), (n));					\
})

#define arch_cmpxchg64_local(ptr, o, n)					\
({									\
	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
	arch_cmpxchg_local((ptr), (o), (n));				\
})

#define arch_try_cmpxchg64(ptr, po, n)					\
({									\
	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
	arch_try_cmpxchg((ptr), (po), (n));				\
})

#define arch_try_cmpxchg64_local(ptr, po, n)				\
({									\
	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
	arch_try_cmpxchg_local((ptr), (po), (n));			\
})

union __u128_halves {
	u128 full;
	struct {
		u64 low, high;
	};
};

#define __arch_cmpxchg128(_ptr, _old, _new, _lock)			\
({									\
	union __u128_halves o = { .full = (_old), },			\
			    n = { .full = (_new), };			\
									\
	asm volatile(_lock "cmpxchg16b %[ptr]"				\
		     : [ptr] "+m" (*(_ptr)),				\
		       "+a" (o.low), "+d" (o.high)			\
		     : "b" (n.low), "c" (n.high)			\
		     : "memory");					\
									\
	o.full;								\
})

static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
{
	return __arch_cmpxchg128(ptr, old, new, LOCK_PREFIX);
}
#define arch_cmpxchg128 arch_cmpxchg128

static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new)
{
	return __arch_cmpxchg128(ptr, old, new,);
}
#define arch_cmpxchg128_local arch_cmpxchg128_local

#define __arch_try_cmpxchg128(_ptr, _oldp, _new, _lock)			\
({									\
	union __u128_halves o = { .full = *(_oldp), },			\
			    n = { .full = (_new), };			\
	bool ret;							\
									\
	asm volatile(_lock "cmpxchg16b %[ptr]"				\
		     CC_SET(e)						\
		     : CC_OUT(e) (ret),					\
		       [ptr] "+m" (*(_ptr)),				\
		       "+a" (o.low), "+d" (o.high)			\
		     : "b" (n.low), "c" (n.high)			\
		     : "memory");					\
									\
	if (unlikely(!ret))						\
		*(_oldp) = o.full;					\
									\
	likely(ret);							\
})

static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
{
	return __arch_try_cmpxchg128(ptr, oldp, new, LOCK_PREFIX);
}
#define arch_try_cmpxchg128 arch_try_cmpxchg128

static __always_inline bool arch_try_cmpxchg128_local(volatile u128 *ptr, u128 *oldp, u128 new)
{
	return __arch_try_cmpxchg128(ptr, oldp, new,);
}
#define arch_try_cmpxchg128_local arch_try_cmpxchg128_local

#define system_has_cmpxchg128()		boot_cpu_has(X86_FEATURE_CX16)

#endif /* _ASM_X86_CMPXCHG_64_H */
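
For completeness, a hedged sketch of how the 128-bit primitives above might
be consumed by a caller. The wrapper function and its names are illustrative
and not part of the header; only arch_try_cmpxchg128() and
system_has_cmpxchg128() come from the code above, and a kernel build context
is assumed.

/*
 * Hypothetical usage sketch: atomically replace a 16-byte (two-field)
 * record, gated on CMPXCHG16B support. The u128 slot must be 16-byte
 * aligned, as CMPXCHG16B requires.
 */
static bool replace_record(u128 *slot, u128 expected, u128 new)
{
	if (!system_has_cmpxchg128())
		return false;	/* caller must fall back to a lock */

	/* On failure, 'expected' is refreshed from memory. */
	return arch_try_cmpxchg128(slot, &expected, new);
}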