49a7d46a06 ("powerpc: Implement smp_cond_load_relaxed()") added
busy-wait pausing with a preferred SMT priority pattern, lowering the
priority (reducing decode cycles) during the whole loop slowpath.

However, data shows that while this pattern works well with simple
spinlocks, queued spinlocks benefit more from being kept at medium
priority, using a cpu_relax() instead (a low+medium priority combo on
powerpc).

Data is from three benchmarks on a Power9 (9008-22L): 64 CPUs,
2 sockets, 8 threads per core.

1. locktorture.

This is data for the lowest and most artificial/pathological level,
with increasing thread counts pounding on the lock. Metrics are total
ops/minute. Despite some small hits in the 4-8 range, scenarios are
either neutral or favorable to this patch.

+=========+==========+==========+=======+
| # tasks | vanilla  | dirty    | %diff |
+=========+==========+==========+=======+
| 2       | 46718565 | 48751350 | 4.35  |
+---------+----------+----------+-------+
| 4       | 51740198 | 50369082 | -2.65 |
+---------+----------+----------+-------+
| 8       | 63756510 | 62568821 | -1.86 |
+---------+----------+----------+-------+
| 16      | 67824531 | 70966546 | 4.63  |
+---------+----------+----------+-------+
| 32      | 53843519 | 61155508 | 13.58 |
+---------+----------+----------+-------+
| 64      | 53005778 | 53104412 | 0.18  |
+---------+----------+----------+-------+
| 128     | 53331980 | 54606910 | 2.39  |
+=========+==========+==========+=======+

2. sockperf (tcp throughput)

Here a client does one-way throughput tests against a localhost
server with increasing message sizes, contending on the sk_lock.
This patch brings the performance of the qspinlock back to par with
that of the simple lock:

                      simple-spinlock            vanilla                  dirty
Hmean     14          73.50 (   0.00%)       54.44 * -25.93%*       73.45 *  -0.07%*
Hmean     100        654.47 (   0.00%)      385.61 * -41.08%*      771.43 *  17.87%*
Hmean     300       2719.39 (   0.00%)     2181.67 * -19.77%*     2666.50 *  -1.94%*
Hmean     500       4400.59 (   0.00%)     3390.77 * -22.95%*     4322.14 *  -1.78%*
Hmean     850       6726.21 (   0.00%)     5264.03 * -21.74%*     6863.12 *   2.04%*

3. dbench (tmpfs)

Configured to run with up to ncpusx8 clients, it shows both latency
and throughput metrics. For the latency, with the exception of the
64-client case, there is really nothing to go by:

                                  vanilla                dirty
Amean     latency-1          1.67 (   0.00%)        1.67 *   0.09%*
Amean     latency-2          2.15 (   0.00%)        2.08 *   3.36%*
Amean     latency-4          2.50 (   0.00%)        2.56 *  -2.27%*
Amean     latency-8          2.49 (   0.00%)        2.48 *   0.31%*
Amean     latency-16         2.69 (   0.00%)        2.72 *  -1.37%*
Amean     latency-32         2.96 (   0.00%)        3.04 *  -2.60%*
Amean     latency-64         7.78 (   0.00%)        8.17 *  -5.07%*
Amean     latency-512      186.91 (   0.00%)      186.41 *   0.27%*

For the dbench4 throughput (misleading but traditional) there is a
small but rather constant improvement:

                                  vanilla                dirty
Hmean     1          849.13 (   0.00%)      851.51 *   0.28%*
Hmean     2         1664.03 (   0.00%)     1663.94 *  -0.01%*
Hmean     4         3073.70 (   0.00%)     3104.29 *   1.00%*
Hmean     8         5624.02 (   0.00%)     5694.16 *   1.25%*
Hmean     16        9169.49 (   0.00%)     9324.43 *   1.69%*
Hmean     32       11969.37 (   0.00%)    12127.09 *   1.32%*
Hmean     64       15021.12 (   0.00%)    15243.14 *   1.48%*
Hmean     512      14891.27 (   0.00%)    15162.11 *   1.82%*

Measuring the dbench4 per-VFS-operation latency shows only very minor
differences, within the noise level (0-1% range).

Fixes: 49a7d46a06 ("powerpc: Implement smp_cond_load_relaxed()")
Acked-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210318204702.71417-1-dave@stgolabs.net
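For context, the two busy-wait shapes being weighed against each other look
roughly like the sketch below. This is a simplified rendering, not verbatim
kernel source, and the macro names are made up for illustration: the first
mirrors the spin_begin()/spin_end() SMT-priority pattern that 49a7d46a06 used
for powerpc's smp_cond_load_relaxed(), holding low priority for the entire
wait; the second mirrors the generic asm-generic/barrier.h fallback, which
calls cpu_relax() on every iteration (on powerpc a short HMT_low()/HMT_medium()
pulse, i.e. the low+medium combo mentioned above).

/* Sketch only: SMT-priority wait pattern being moved away from. */
#define smp_cond_load_relaxed_smt_prio(ptr, cond_expr) ({	\
	typeof(ptr) __PTR = (ptr);				\
	__unqual_scalar_typeof(*ptr) VAL;			\
	VAL = READ_ONCE(*__PTR);				\
	if (unlikely(!(cond_expr))) {				\
		spin_begin();	/* drop to low SMT priority */	\
		do {						\
			VAL = READ_ONCE(*__PTR);		\
		} while (!(cond_expr));				\
		spin_end();	/* back to medium priority */	\
	}							\
	(typeof(*ptr))VAL;					\
})

/* Sketch only: generic fallback shape now used by the qspinlock path. */
#define smp_cond_load_relaxed_generic(ptr, cond_expr) ({	\
	typeof(ptr) __PTR = (ptr);				\
	__unqual_scalar_typeof(*ptr) VAL;			\
	for (;;) {						\
		VAL = READ_ONCE(*__PTR);			\
		if (cond_expr)					\
			break;					\
		cpu_relax();	/* powerpc: low+medium pulse */	\
	}							\
	(typeof(*ptr))VAL;					\
})

With this change the qspinlock slowpath busy-waits with the generic loop,
which is the behaviour the benchmark numbers above favour.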
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_POWERPC_QSPINLOCK_H
#define _ASM_POWERPC_QSPINLOCK_H

#include <asm-generic/qspinlock_types.h>
#include <asm/paravirt.h>

#define _Q_PENDING_LOOPS	(1 << 9) /* not tuned */

#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_queued_spin_unlock(struct qspinlock *lock);

static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	if (!is_shared_processor())
		native_queued_spin_lock_slowpath(lock, val);
	else
		__pv_queued_spin_lock_slowpath(lock, val);
}

#define queued_spin_unlock queued_spin_unlock
static inline void queued_spin_unlock(struct qspinlock *lock)
{
	if (!is_shared_processor())
		smp_store_release(&lock->locked, 0);
	else
		__pv_queued_spin_unlock(lock);
}

#else
extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
#endif

static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	u32 val = 0;

	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
		return;

	queued_spin_lock_slowpath(lock, val);
}
#define queued_spin_lock queued_spin_lock

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define SPIN_THRESHOLD (1<<15) /* not tuned */

static __always_inline void pv_wait(u8 *ptr, u8 val)
{
	if (*ptr != val)
		return;
	yield_to_any();
	/*
	 * We could pass in a CPU here if waiting in the queue and yield to
	 * the previous CPU in the queue.
	 */
}

static __always_inline void pv_kick(int cpu)
{
	prod_cpu(cpu);
}

extern void __pv_init_lock_hash(void);

static inline void pv_spinlocks_init(void)
{
	__pv_init_lock_hash();
}

#endif

/*
 * Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait,
 * which was found to have performance problems if implemented with
 * the preferred spin_begin()/spin_end() SMT priority pattern. Use the
 * generic version instead.
 */

#include <asm-generic/qspinlock.h>

#endif /* _ASM_POWERPC_QSPINLOCK_H */
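As a usage note: the spinning this header tunes happens in the generic
slowpath (kernel/locking/qspinlock.c) pulled in via asm-generic/qspinlock.h.
The excerpt below is a rough paraphrase from memory, not verbatim source, with
variable names as in that file: atomic_cond_read_relaxed() expands to
smp_cond_load_relaxed() on the lock word, so this is one of the loops affected
by the busy-wait choice discussed above, and _Q_PENDING_LOOPS from this header
bounds it.

	/*
	 * Rough excerpt (from memory) of the generic slowpath's bounded
	 * wait for a pending->locked hand-over; after _Q_PENDING_LOOPS
	 * iterations the CPU gives up spinning here and queues instead.
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}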