linux/arch/powerpc/include/asm/qspinlock.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_POWERPC_QSPINLOCK_H
#define _ASM_POWERPC_QSPINLOCK_H

#include <asm-generic/qspinlock_types.h>
#include <asm/paravirt.h>

#define _Q_PENDING_LOOPS	(1 << 9) /* not tuned */

#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_queued_spin_unlock(struct qspinlock *lock);

static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	if (!is_shared_processor())
		native_queued_spin_lock_slowpath(lock, val);
	else
		__pv_queued_spin_lock_slowpath(lock, val);
}

#define queued_spin_unlock queued_spin_unlock
static inline void queued_spin_unlock(struct qspinlock *lock)
{
	if (!is_shared_processor())
		smp_store_release(&lock->locked, 0);
	else
		__pv_queued_spin_unlock(lock);
}

#else
extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
#endif

static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	u32 val = 0;

	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
		return;

	queued_spin_lock_slowpath(lock, val);
}
#define queued_spin_lock queued_spin_lock

#define smp_mb__after_spinlock()   smp_mb()

static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
	/*
	 * This barrier was added to simple spinlocks by commit 51d7d5205d338,
	 * but it should now be possible to remove it, asm arm64 has done with
	 * commit c6f5d02b6a0f.
	 */
	smp_mb();
	return atomic_read(&lock->val);
}
#define queued_spin_is_locked queued_spin_is_locked

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define SPIN_THRESHOLD (1<<15) /* not tuned */

static __always_inline void pv_wait(u8 *ptr, u8 val)
{
	if (*ptr != val)
		return;
	yield_to_any();
	/*
	 * We could pass in a CPU here if waiting in the queue and yield to
	 * the previous CPU in the queue.
	 */
}

static __always_inline void pv_kick(int cpu)
{
	prod_cpu(cpu);
}

extern void __pv_init_lock_hash(void);

static inline void pv_spinlocks_init(void)
{
	__pv_init_lock_hash();
}

#endif

#include <asm-generic/qspinlock.h>

#endif /* _ASM_POWERPC_QSPINLOCK_H */
powerpc/64s: Implement queued spinlocks and rwlocks These have shown significantly improved performance and fairness when spinlock contention is moderate to high on very large systems. With this series including subsequent patches, on a 16 socket 1536 thread POWER9, a stress test such as same-file open/close from all CPUs gets big speedups, 11620op/s aggregate with simple spinlocks vs 384158op/s (33x faster), where the difference in throughput between the fastest and slowest thread goes from 7x to 1.4x. Thanks to the fast path being identical in terms of atomics and barriers (after a subsequent optimisation patch), single threaded performance is not changed (no measurable difference). On smaller systems, performance and fairness seems to be generally improved. Using dbench on tmpfs as a test (that starts to run into kernel spinlock contention), a 2-socket OpenPOWER POWER9 system was tested with bare metal and KVM guest configurations. Results can be found here: https://github.com/linuxppc/issues/issues/305#issuecomment-663487453 Observations are: - Queued spinlocks are equal when contention is insignificant, as expected and as measured with microbenchmarks. - When there is contention, on bare metal queued spinlocks have better throughput and max latency at all points. - When virtualised, queued spinlocks are slightly worse approaching peak throughput, but significantly better throughput and max latency at all points beyond peak, until queued spinlock maximum latency rises when clients are 2x vCPUs. The regressions haven't been analysed very well yet, there are a lot of things that can be tuned, particularly the paravirtualised locking, but the numbers already look like a good net win even on relatively small systems. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-4-npiggin@gmail.com 2020-07-24 21:14:20 +08:00			`/* SPDX-License-Identifier: GPL-2.0 */`
			`#ifndef _ASM_POWERPC_QSPINLOCK_H`
			`#define _ASM_POWERPC_QSPINLOCK_H`

			`#include <asm-generic/qspinlock_types.h>`
powerpc/pseries: Implement paravirt qspinlocks for SPLPAR This implements the generic paravirt qspinlocks using H_PROD and H_CONFER to kick and wait. This uses an un-directed yield to any CPU rather than the directed yield to a pre-empted lock holder that paravirtualised simple spinlocks use, that requires no kick hcall. This is something that could be investigated and improved in future. Performance results can be found in the commit which added queued spinlocks. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-5-npiggin@gmail.com 2020-07-24 21:14:21 +08:00			`#include <asm/paravirt.h>`
powerpc/64s: Implement queued spinlocks and rwlocks These have shown significantly improved performance and fairness when spinlock contention is moderate to high on very large systems. With this series including subsequent patches, on a 16 socket 1536 thread POWER9, a stress test such as same-file open/close from all CPUs gets big speedups, 11620op/s aggregate with simple spinlocks vs 384158op/s (33x faster), where the difference in throughput between the fastest and slowest thread goes from 7x to 1.4x. Thanks to the fast path being identical in terms of atomics and barriers (after a subsequent optimisation patch), single threaded performance is not changed (no measurable difference). On smaller systems, performance and fairness seems to be generally improved. Using dbench on tmpfs as a test (that starts to run into kernel spinlock contention), a 2-socket OpenPOWER POWER9 system was tested with bare metal and KVM guest configurations. Results can be found here: https://github.com/linuxppc/issues/issues/305#issuecomment-663487453 Observations are: - Queued spinlocks are equal when contention is insignificant, as expected and as measured with microbenchmarks. - When there is contention, on bare metal queued spinlocks have better throughput and max latency at all points. - When virtualised, queued spinlocks are slightly worse approaching peak throughput, but significantly better throughput and max latency at all points beyond peak, until queued spinlock maximum latency rises when clients are 2x vCPUs. The regressions haven't been analysed very well yet, there are a lot of things that can be tuned, particularly the paravirtualised locking, but the numbers already look like a good net win even on relatively small systems. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-4-npiggin@gmail.com 2020-07-24 21:14:20 +08:00
			`#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */`

powerpc/pseries: Implement paravirt qspinlocks for SPLPAR This implements the generic paravirt qspinlocks using H_PROD and H_CONFER to kick and wait. This uses an un-directed yield to any CPU rather than the directed yield to a pre-empted lock holder that paravirtualised simple spinlocks use, that requires no kick hcall. This is something that could be investigated and improved in future. Performance results can be found in the commit which added queued spinlocks. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-5-npiggin@gmail.com 2020-07-24 21:14:21 +08:00			`#ifdef CONFIG_PARAVIRT_SPINLOCKS`
			`extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);`
			`extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);`
			`extern void __pv_queued_spin_unlock(struct qspinlock *lock);`

			`static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)`
			`{`
			`if (!is_shared_processor())`
			`native_queued_spin_lock_slowpath(lock, val);`
			`else`
			`__pv_queued_spin_lock_slowpath(lock, val);`
			`}`

			`#define queued_spin_unlock queued_spin_unlock`
			`static inline void queued_spin_unlock(struct qspinlock *lock)`
			`{`
			`if (!is_shared_processor())`
			`smp_store_release(&lock->locked, 0);`
			`else`
			`__pv_queued_spin_unlock(lock);`
			`}`

			`#else`
			`extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);`
			`#endif`

			`static __always_inline void queued_spin_lock(struct qspinlock *lock)`
			`{`
			`u32 val = 0;`

powerpc/qspinlock: Optimised atomic_try_cmpxchg_lock() that adds the lock hint This brings the behaviour of the uncontended fast path back to roughly equivalent to simple spinlocks -- a single atomic op with lock hint. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-6-npiggin@gmail.com 2020-07-24 21:14:22 +08:00			`if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))`
powerpc/pseries: Implement paravirt qspinlocks for SPLPAR This implements the generic paravirt qspinlocks using H_PROD and H_CONFER to kick and wait. This uses an un-directed yield to any CPU rather than the directed yield to a pre-empted lock holder that paravirtualised simple spinlocks use, that requires no kick hcall. This is something that could be investigated and improved in future. Performance results can be found in the commit which added queued spinlocks. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-5-npiggin@gmail.com 2020-07-24 21:14:21 +08:00			`return;`

			`queued_spin_lock_slowpath(lock, val);`
			`}`
			`#define queued_spin_lock queued_spin_lock`

powerpc/64s: Implement queued spinlocks and rwlocks These have shown significantly improved performance and fairness when spinlock contention is moderate to high on very large systems. With this series including subsequent patches, on a 16 socket 1536 thread POWER9, a stress test such as same-file open/close from all CPUs gets big speedups, 11620op/s aggregate with simple spinlocks vs 384158op/s (33x faster), where the difference in throughput between the fastest and slowest thread goes from 7x to 1.4x. Thanks to the fast path being identical in terms of atomics and barriers (after a subsequent optimisation patch), single threaded performance is not changed (no measurable difference). On smaller systems, performance and fairness seems to be generally improved. Using dbench on tmpfs as a test (that starts to run into kernel spinlock contention), a 2-socket OpenPOWER POWER9 system was tested with bare metal and KVM guest configurations. Results can be found here: https://github.com/linuxppc/issues/issues/305#issuecomment-663487453 Observations are: - Queued spinlocks are equal when contention is insignificant, as expected and as measured with microbenchmarks. - When there is contention, on bare metal queued spinlocks have better throughput and max latency at all points. - When virtualised, queued spinlocks are slightly worse approaching peak throughput, but significantly better throughput and max latency at all points beyond peak, until queued spinlock maximum latency rises when clients are 2x vCPUs. The regressions haven't been analysed very well yet, there are a lot of things that can be tuned, particularly the paravirtualised locking, but the numbers already look like a good net win even on relatively small systems. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-4-npiggin@gmail.com 2020-07-24 21:14:20 +08:00			`#define smp_mb__after_spinlock() smp_mb()`

			`static __always_inline int queued_spin_is_locked(struct qspinlock *lock)`
			`{`
			`/*`
			`* This barrier was added to simple spinlocks by commit 51d7d5205d338,`
			`* but it should now be possible to remove it, asm arm64 has done with`
			`* commit c6f5d02b6a0f.`
			`*/`
			`smp_mb();`
			`return atomic_read(&lock->val);`
			`}`
			`#define queued_spin_is_locked queued_spin_is_locked`

powerpc/pseries: Implement paravirt qspinlocks for SPLPAR This implements the generic paravirt qspinlocks using H_PROD and H_CONFER to kick and wait. This uses an un-directed yield to any CPU rather than the directed yield to a pre-empted lock holder that paravirtualised simple spinlocks use, that requires no kick hcall. This is something that could be investigated and improved in future. Performance results can be found in the commit which added queued spinlocks. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-5-npiggin@gmail.com 2020-07-24 21:14:21 +08:00			`#ifdef CONFIG_PARAVIRT_SPINLOCKS`
			`#define SPIN_THRESHOLD (1<<15) /* not tuned */`

			`static __always_inline void pv_wait(u8 *ptr, u8 val)`
			`{`
			`if (*ptr != val)`
			`return;`
			`yield_to_any();`
			`/*`
			`* We could pass in a CPU here if waiting in the queue and yield to`
			`* the previous CPU in the queue.`
			`*/`
			`}`

			`static __always_inline void pv_kick(int cpu)`
			`{`
			`prod_cpu(cpu);`
			`}`

			`extern void __pv_init_lock_hash(void);`

			`static inline void pv_spinlocks_init(void)`
			`{`
			`__pv_init_lock_hash();`
			`}`

			`#endif`

powerpc/64s: Implement queued spinlocks and rwlocks These have shown significantly improved performance and fairness when spinlock contention is moderate to high on very large systems. With this series including subsequent patches, on a 16 socket 1536 thread POWER9, a stress test such as same-file open/close from all CPUs gets big speedups, 11620op/s aggregate with simple spinlocks vs 384158op/s (33x faster), where the difference in throughput between the fastest and slowest thread goes from 7x to 1.4x. Thanks to the fast path being identical in terms of atomics and barriers (after a subsequent optimisation patch), single threaded performance is not changed (no measurable difference). On smaller systems, performance and fairness seems to be generally improved. Using dbench on tmpfs as a test (that starts to run into kernel spinlock contention), a 2-socket OpenPOWER POWER9 system was tested with bare metal and KVM guest configurations. Results can be found here: https://github.com/linuxppc/issues/issues/305#issuecomment-663487453 Observations are: - Queued spinlocks are equal when contention is insignificant, as expected and as measured with microbenchmarks. - When there is contention, on bare metal queued spinlocks have better throughput and max latency at all points. - When virtualised, queued spinlocks are slightly worse approaching peak throughput, but significantly better throughput and max latency at all points beyond peak, until queued spinlock maximum latency rises when clients are 2x vCPUs. The regressions haven't been analysed very well yet, there are a lot of things that can be tuned, particularly the paravirtualised locking, but the numbers already look like a good net win even on relatively small systems. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200724131423.1362108-4-npiggin@gmail.com 2020-07-24 21:14:20 +08:00			`#include <asm-generic/qspinlock.h>`

			`#endif /* _ASM_POWERPC_QSPINLOCK_H */`