mirror of https://gitee.com/openkylin/linux.git
smp: Avoid using two cache lines for struct call_single_data
struct call_single_data is used in IPIs to transfer information between CPUs. Its size is bigger than sizeof(unsigned long) and less than cache line size. Currently it is not allocated with any explicit alignment requirements. This makes it possible for allocated call_single_data to cross two cache lines, which results in double the number of the cache lines that need to be transferred among CPUs. This can be fixed by requiring call_single_data to be aligned with the size of call_single_data. Currently the size of call_single_data is the power of 2. If we add new fields to call_single_data, we may need to add padding to make sure the size of new definition is the power of 2 as well. Fortunately, this is enforced by GCC, which will report bad sizes. To set alignment requirements of call_single_data to the size of call_single_data, a struct definition and a typedef is used. To test the effect of the patch, I used the vm-scalability multiple thread swap test case (swap-w-seq-mt). The test will create multiple threads and each thread will eat memory until all RAM and part of swap is used, so that huge number of IPIs are triggered when unmapping memory. In the test, the throughput of memory writing improves ~5% compared with misaligned call_single_data, because of faster IPIs. Suggested-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Huang, Ying <ying.huang@intel.com> [ Add call_single_data_t and align with size of call_single_data. ] Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Aaron Lu <aaron.lu@intel.com> Cc: Borislav Petkov <bp@suse.de> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Juergen Gross <jgross@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
f52be57080
commit
966a967116
arch/mips/kernel
block
drivers
include/linux
kernel
|
@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
|
||||||
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
|
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
|
||||||
|
|
||||||
static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
|
static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
|
||||||
static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
|
static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
|
||||||
|
|
||||||
void tick_broadcast(const struct cpumask *mask)
|
void tick_broadcast(const struct cpumask *mask)
|
||||||
{
|
{
|
||||||
atomic_t *count;
|
atomic_t *count;
|
||||||
struct call_single_data *csd;
|
call_single_data_t *csd;
|
||||||
int cpu;
|
int cpu;
|
||||||
|
|
||||||
for_each_cpu(cpu, mask) {
|
for_each_cpu(cpu, mask) {
|
||||||
|
@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
|
||||||
|
|
||||||
static int __init tick_broadcast_init(void)
|
static int __init tick_broadcast_init(void)
|
||||||
{
|
{
|
||||||
struct call_single_data *csd;
|
call_single_data_t *csd;
|
||||||
int cpu;
|
int cpu;
|
||||||
|
|
||||||
for (cpu = 0; cpu < NR_CPUS; cpu++) {
|
for (cpu = 0; cpu < NR_CPUS; cpu++) {
|
||||||
|
|
|
@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
|
||||||
static int raise_blk_irq(int cpu, struct request *rq)
|
static int raise_blk_irq(int cpu, struct request *rq)
|
||||||
{
|
{
|
||||||
if (cpu_online(cpu)) {
|
if (cpu_online(cpu)) {
|
||||||
struct call_single_data *data = &rq->csd;
|
call_single_data_t *data = &rq->csd;
|
||||||
|
|
||||||
data->func = trigger_softirq;
|
data->func = trigger_softirq;
|
||||||
data->info = rq;
|
data->info = rq;
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
struct nullb_cmd {
|
struct nullb_cmd {
|
||||||
struct list_head list;
|
struct list_head list;
|
||||||
struct llist_node ll_list;
|
struct llist_node ll_list;
|
||||||
struct call_single_data csd;
|
call_single_data_t csd;
|
||||||
struct request *rq;
|
struct request *rq;
|
||||||
struct bio *bio;
|
struct bio *bio;
|
||||||
unsigned int tag;
|
unsigned int tag;
|
||||||
|
|
|
@ -119,13 +119,13 @@ struct cpuidle_coupled {
|
||||||
|
|
||||||
#define CPUIDLE_COUPLED_NOT_IDLE (-1)
|
#define CPUIDLE_COUPLED_NOT_IDLE (-1)
|
||||||
|
|
||||||
static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
|
static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The cpuidle_coupled_poke_pending mask is used to avoid calling
|
* The cpuidle_coupled_poke_pending mask is used to avoid calling
|
||||||
* __smp_call_function_single with the per cpu call_single_data struct already
|
* __smp_call_function_single with the per cpu call_single_data_t struct already
|
||||||
* in use. This prevents a deadlock where two cpus are waiting for each others
|
* in use. This prevents a deadlock where two cpus are waiting for each others
|
||||||
* call_single_data struct to be available
|
* call_single_data_t struct to be available
|
||||||
*/
|
*/
|
||||||
static cpumask_t cpuidle_coupled_poke_pending;
|
static cpumask_t cpuidle_coupled_poke_pending;
|
||||||
|
|
||||||
|
@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
|
||||||
*/
|
*/
|
||||||
static void cpuidle_coupled_poke(int cpu)
|
static void cpuidle_coupled_poke(int cpu)
|
||||||
{
|
{
|
||||||
struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
|
call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
|
||||||
|
|
||||||
if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
|
if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
|
||||||
smp_call_function_single_async(cpu, csd);
|
smp_call_function_single_async(cpu, csd);
|
||||||
|
@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
|
||||||
{
|
{
|
||||||
int cpu;
|
int cpu;
|
||||||
struct cpuidle_device *other_dev;
|
struct cpuidle_device *other_dev;
|
||||||
struct call_single_data *csd;
|
call_single_data_t *csd;
|
||||||
struct cpuidle_coupled *coupled;
|
struct cpuidle_coupled *coupled;
|
||||||
|
|
||||||
if (cpumask_empty(&dev->coupled_cpus))
|
if (cpumask_empty(&dev->coupled_cpus))
|
||||||
|
|
|
@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
|
||||||
if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
|
if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
|
||||||
napi_schedule_irqoff(&droq->napi);
|
napi_schedule_irqoff(&droq->napi);
|
||||||
} else {
|
} else {
|
||||||
struct call_single_data *csd = &droq->csd;
|
call_single_data_t *csd = &droq->csd;
|
||||||
|
|
||||||
csd->func = napi_schedule_wrapper;
|
csd->func = napi_schedule_wrapper;
|
||||||
csd->info = &droq->napi;
|
csd->info = &droq->napi;
|
||||||
|
|
|
@ -328,7 +328,7 @@ struct octeon_droq {
|
||||||
|
|
||||||
u32 cpu_id;
|
u32 cpu_id;
|
||||||
|
|
||||||
struct call_single_data csd;
|
call_single_data_t csd;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define OCT_DROQ_SIZE (sizeof(struct octeon_droq))
|
#define OCT_DROQ_SIZE (sizeof(struct octeon_droq))
|
||||||
|
|
|
@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t;
|
||||||
struct request {
|
struct request {
|
||||||
struct list_head queuelist;
|
struct list_head queuelist;
|
||||||
union {
|
union {
|
||||||
struct call_single_data csd;
|
call_single_data_t csd;
|
||||||
u64 fifo_time;
|
u64 fifo_time;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -2774,7 +2774,7 @@ struct softnet_data {
|
||||||
unsigned int input_queue_head ____cacheline_aligned_in_smp;
|
unsigned int input_queue_head ____cacheline_aligned_in_smp;
|
||||||
|
|
||||||
/* Elements below can be accessed between CPUs for RPS/RFS */
|
/* Elements below can be accessed between CPUs for RPS/RFS */
|
||||||
struct call_single_data csd ____cacheline_aligned_in_smp;
|
call_single_data_t csd ____cacheline_aligned_in_smp;
|
||||||
struct softnet_data *rps_ipi_next;
|
struct softnet_data *rps_ipi_next;
|
||||||
unsigned int cpu;
|
unsigned int cpu;
|
||||||
unsigned int input_queue_tail;
|
unsigned int input_queue_tail;
|
||||||
|
|
|
@ -14,13 +14,17 @@
|
||||||
#include <linux/llist.h>
|
#include <linux/llist.h>
|
||||||
|
|
||||||
typedef void (*smp_call_func_t)(void *info);
|
typedef void (*smp_call_func_t)(void *info);
|
||||||
struct call_single_data {
|
struct __call_single_data {
|
||||||
struct llist_node llist;
|
struct llist_node llist;
|
||||||
smp_call_func_t func;
|
smp_call_func_t func;
|
||||||
void *info;
|
void *info;
|
||||||
unsigned int flags;
|
unsigned int flags;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
|
||||||
|
typedef struct __call_single_data call_single_data_t
|
||||||
|
__aligned(sizeof(struct __call_single_data));
|
||||||
|
|
||||||
/* total number of cpus in this system (may exceed NR_CPUS) */
|
/* total number of cpus in this system (may exceed NR_CPUS) */
|
||||||
extern unsigned int total_cpus;
|
extern unsigned int total_cpus;
|
||||||
|
|
||||||
|
@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
||||||
smp_call_func_t func, void *info, bool wait,
|
smp_call_func_t func, void *info, bool wait,
|
||||||
gfp_t gfp_flags);
|
gfp_t gfp_flags);
|
||||||
|
|
||||||
int smp_call_function_single_async(int cpu, struct call_single_data *csd);
|
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
|
|
||||||
|
|
|
@ -769,7 +769,7 @@ struct rq {
|
||||||
#ifdef CONFIG_SCHED_HRTICK
|
#ifdef CONFIG_SCHED_HRTICK
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
int hrtick_csd_pending;
|
int hrtick_csd_pending;
|
||||||
struct call_single_data hrtick_csd;
|
call_single_data_t hrtick_csd;
|
||||||
#endif
|
#endif
|
||||||
struct hrtimer hrtick_timer;
|
struct hrtimer hrtick_timer;
|
||||||
#endif
|
#endif
|
||||||
|
|
32
kernel/smp.c
32
kernel/smp.c
|
@ -28,7 +28,7 @@ enum {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct call_function_data {
|
struct call_function_data {
|
||||||
struct call_single_data __percpu *csd;
|
call_single_data_t __percpu *csd;
|
||||||
cpumask_var_t cpumask;
|
cpumask_var_t cpumask;
|
||||||
cpumask_var_t cpumask_ipi;
|
cpumask_var_t cpumask_ipi;
|
||||||
};
|
};
|
||||||
|
@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
|
||||||
free_cpumask_var(cfd->cpumask);
|
free_cpumask_var(cfd->cpumask);
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
}
|
||||||
cfd->csd = alloc_percpu(struct call_single_data);
|
cfd->csd = alloc_percpu(call_single_data_t);
|
||||||
if (!cfd->csd) {
|
if (!cfd->csd) {
|
||||||
free_cpumask_var(cfd->cpumask);
|
free_cpumask_var(cfd->cpumask);
|
||||||
free_cpumask_var(cfd->cpumask_ipi);
|
free_cpumask_var(cfd->cpumask_ipi);
|
||||||
|
@ -103,12 +103,12 @@ void __init call_function_init(void)
|
||||||
* previous function call. For multi-cpu calls its even more interesting
|
* previous function call. For multi-cpu calls its even more interesting
|
||||||
* as we'll have to ensure no other cpu is observing our csd.
|
* as we'll have to ensure no other cpu is observing our csd.
|
||||||
*/
|
*/
|
||||||
static __always_inline void csd_lock_wait(struct call_single_data *csd)
|
static __always_inline void csd_lock_wait(call_single_data_t *csd)
|
||||||
{
|
{
|
||||||
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
|
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
|
||||||
}
|
}
|
||||||
|
|
||||||
static __always_inline void csd_lock(struct call_single_data *csd)
|
static __always_inline void csd_lock(call_single_data_t *csd)
|
||||||
{
|
{
|
||||||
csd_lock_wait(csd);
|
csd_lock_wait(csd);
|
||||||
csd->flags |= CSD_FLAG_LOCK;
|
csd->flags |= CSD_FLAG_LOCK;
|
||||||
|
@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
|
||||||
/*
|
/*
|
||||||
* prevent CPU from reordering the above assignment
|
* prevent CPU from reordering the above assignment
|
||||||
* to ->flags with any subsequent assignments to other
|
* to ->flags with any subsequent assignments to other
|
||||||
* fields of the specified call_single_data structure:
|
* fields of the specified call_single_data_t structure:
|
||||||
*/
|
*/
|
||||||
smp_wmb();
|
smp_wmb();
|
||||||
}
|
}
|
||||||
|
|
||||||
static __always_inline void csd_unlock(struct call_single_data *csd)
|
static __always_inline void csd_unlock(call_single_data_t *csd)
|
||||||
{
|
{
|
||||||
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
|
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
|
||||||
|
|
||||||
|
@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
|
||||||
smp_store_release(&csd->flags, 0);
|
smp_store_release(&csd->flags, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
|
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Insert a previously allocated call_single_data element
|
* Insert a previously allocated call_single_data_t element
|
||||||
* for execution on the given CPU. data must already have
|
* for execution on the given CPU. data must already have
|
||||||
* ->func, ->info, and ->flags set.
|
* ->func, ->info, and ->flags set.
|
||||||
*/
|
*/
|
||||||
static int generic_exec_single(int cpu, struct call_single_data *csd,
|
static int generic_exec_single(int cpu, call_single_data_t *csd,
|
||||||
smp_call_func_t func, void *info)
|
smp_call_func_t func, void *info)
|
||||||
{
|
{
|
||||||
if (cpu == smp_processor_id()) {
|
if (cpu == smp_processor_id()) {
|
||||||
|
@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||||
{
|
{
|
||||||
struct llist_head *head;
|
struct llist_head *head;
|
||||||
struct llist_node *entry;
|
struct llist_node *entry;
|
||||||
struct call_single_data *csd, *csd_next;
|
call_single_data_t *csd, *csd_next;
|
||||||
static bool warned;
|
static bool warned;
|
||||||
|
|
||||||
WARN_ON(!irqs_disabled());
|
WARN_ON(!irqs_disabled());
|
||||||
|
@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||||
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
|
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
|
||||||
int wait)
|
int wait)
|
||||||
{
|
{
|
||||||
struct call_single_data *csd;
|
call_single_data_t *csd;
|
||||||
struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
|
call_single_data_t csd_stack = {
|
||||||
|
.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
|
||||||
|
};
|
||||||
int this_cpu;
|
int this_cpu;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
|
@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
|
||||||
* NOTE: Be careful, there is unfortunately no current debugging facility to
|
* NOTE: Be careful, there is unfortunately no current debugging facility to
|
||||||
* validate the correctness of this serialization.
|
* validate the correctness of this serialization.
|
||||||
*/
|
*/
|
||||||
int smp_call_function_single_async(int cpu, struct call_single_data *csd)
|
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
|
||||||
{
|
{
|
||||||
int err = 0;
|
int err = 0;
|
||||||
|
|
||||||
|
@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
|
||||||
|
|
||||||
cpumask_clear(cfd->cpumask_ipi);
|
cpumask_clear(cfd->cpumask_ipi);
|
||||||
for_each_cpu(cpu, cfd->cpumask) {
|
for_each_cpu(cpu, cfd->cpumask) {
|
||||||
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
|
call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
|
||||||
|
|
||||||
csd_lock(csd);
|
csd_lock(csd);
|
||||||
if (wait)
|
if (wait)
|
||||||
|
@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
|
||||||
|
|
||||||
if (wait) {
|
if (wait) {
|
||||||
for_each_cpu(cpu, cfd->cpumask) {
|
for_each_cpu(cpu, cfd->cpumask) {
|
||||||
struct call_single_data *csd;
|
call_single_data_t *csd;
|
||||||
|
|
||||||
csd = per_cpu_ptr(cfd->csd, cpu);
|
csd = per_cpu_ptr(cfd->csd, cpu);
|
||||||
csd_lock_wait(csd);
|
csd_lock_wait(csd);
|
||||||
|
|
|
@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(smp_call_function_single);
|
EXPORT_SYMBOL(smp_call_function_single);
|
||||||
|
|
||||||
int smp_call_function_single_async(int cpu, struct call_single_data *csd)
|
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue