From 2a67e741bbbc022e0fadf8c6dbc3a76019ecd0cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 8 Oct 2015 12:24:23 +0200
Subject: [PATCH] rcu: Create transitive rnp->lock acquisition functions

Providing RCU's memory-ordering guarantees requires that the rcu_node
tree's locking provide transitive memory ordering, which the Linux
kernel's spinlocks currently do not provide unless
smp_mb__after_unlock_lock() is used.  Having a separate
smp_mb__after_unlock_lock() after each and every lock acquisition is
error-prone, hard to read, and a bit annoying, so this commit provides
wrapper functions that pull in the smp_mb__after_unlock_lock()
invocations.

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.c        | 78 ++++++++++++++--------------------------
 kernel/rcu/tree.h        | 39 ++++++++++++++++++++
 kernel/rcu/tree_plugin.h | 18 ++++------
 3 files changed, 71 insertions(+), 64 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f07343b54fe5..daf17e248757 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1534,10 +1534,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
	 * hold it, acquire the root rcu_node structure's lock in order to
	 * start one (if needed).
	 */
-	if (rnp != rnp_root) {
-		raw_spin_lock(&rnp_root->lock);
-		smp_mb__after_unlock_lock();
-	}
+	if (rnp != rnp_root)
+		raw_spin_lock_rcu_node(rnp_root);
 
	/*
	 * Get a new grace-period number.  If there really is no grace
@@ -1786,11 +1784,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
	if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
	     rdp->completed == READ_ONCE(rnp->completed) &&
	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
-	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+	    !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
		local_irq_restore(flags);
		return;
	}
-	smp_mb__after_unlock_lock();
	needwake = __note_gp_changes(rsp, rnp, rdp);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	if (needwake)
@@ -1814,8 +1811,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
	struct rcu_node *rnp = rcu_get_root(rsp);
 
	WRITE_ONCE(rsp->gp_activity, jiffies);
-	raw_spin_lock_irq(&rnp->lock);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irq_rcu_node(rnp);
	if (!READ_ONCE(rsp->gp_flags)) {
		/* Spurious wakeup, tell caller to go back to sleep.  */
		raw_spin_unlock_irq(&rnp->lock);
@@ -1847,8 +1843,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
	 */
	rcu_for_each_leaf_node(rsp, rnp) {
		rcu_gp_slow(rsp, gp_preinit_delay);
-		raw_spin_lock_irq(&rnp->lock);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irq_rcu_node(rnp);
		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
		    !rnp->wait_blkd_tasks) {
			/* Nothing to do on this leaf rcu_node structure. */
@@ -1904,8 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
	 */
	rcu_for_each_node_breadth_first(rsp, rnp) {
		rcu_gp_slow(rsp, gp_init_delay);
-		raw_spin_lock_irq(&rnp->lock);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irq_rcu_node(rnp);
		rdp = this_cpu_ptr(rsp->rda);
		rcu_preempt_check_blocked_tasks(rnp);
		rnp->qsmask = rnp->qsmaskinit;
@@ -1973,8 +1967,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
	}
	/* Clear flag to prevent immediate re-entry. */
	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
-		raw_spin_lock_irq(&rnp->lock);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irq_rcu_node(rnp);
		WRITE_ONCE(rsp->gp_flags,
			   READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
		raw_spin_unlock_irq(&rnp->lock);
@@ -1993,8 +1986,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
	struct rcu_node *rnp = rcu_get_root(rsp);
 
	WRITE_ONCE(rsp->gp_activity, jiffies);
-	raw_spin_lock_irq(&rnp->lock);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irq_rcu_node(rnp);
	gp_duration = jiffies - rsp->gp_start;
	if (gp_duration > rsp->gp_max)
		rsp->gp_max = gp_duration;
@@ -2019,8 +2011,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
	 * grace period is recorded in any of the rcu_node structures.
	 */
	rcu_for_each_node_breadth_first(rsp, rnp) {
-		raw_spin_lock_irq(&rnp->lock);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irq_rcu_node(rnp);
		WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
		WARN_ON_ONCE(rnp->qsmask);
		WRITE_ONCE(rnp->completed, rsp->gpnum);
@@ -2035,8 +2026,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
		rcu_gp_slow(rsp, gp_cleanup_delay);
	}
	rnp = rcu_get_root(rsp);
-	raw_spin_lock_irq(&rnp->lock);
-	smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+	raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
	rcu_nocb_gp_set(rnp, nocb);
 
	/* Declare grace period done. */
@@ -2284,8 +2274,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		rnp_c = rnp;
		rnp = rnp->parent;
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		oldmask = rnp_c->qsmask;
	}
 
@@ -2332,8 +2321,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
	gps = rnp->gpnum;
	mask = rnp->grpmask;
	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
-	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_rcu_node(rnp_p);	/* irqs already disabled. */
	rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
 }
 
@@ -2355,8 +2343,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
	struct rcu_node *rnp;
 
	rnp = rdp->mynode;
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if ((rdp->cpu_no_qs.b.norm &&
	     rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
	    rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
@@ -2582,8 +2569,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
		rnp = rnp->parent;
		if (!rnp)
			break;
-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-		smp_mb__after_unlock_lock(); /* GP memory ordering. */
+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		rnp->qsmaskinit &= ~mask;
		rnp->qsmask &= ~mask;
		if (rnp->qsmaskinit) {
@@ -2611,8 +2597,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 
	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
	mask = rdp->grpmask;
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
	rnp->qsmaskinitnext &= ~mask;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -2809,8 +2794,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
	rcu_for_each_leaf_node(rsp, rnp) {
		cond_resched_rcu_qs();
		mask = 0;
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if (rnp->qsmask == 0) {
			if (rcu_state_p == &rcu_sched_state ||
			    rsp != rcu_state_p ||
@@ -2881,8 +2865,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
	/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
 
	/* Reached the root of the rcu_node tree, acquire lock. */
-	raw_spin_lock_irqsave(&rnp_old->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
	raw_spin_unlock(&rnp_old->fqslock);
	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
		rsp->n_force_qs_lh++;
@@ -3005,8 +2988,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
		if (!rcu_gp_in_progress(rsp)) {
			struct rcu_node *rnp_root = rcu_get_root(rsp);
 
-			raw_spin_lock(&rnp_root->lock);
-			smp_mb__after_unlock_lock();
+			raw_spin_lock_rcu_node(rnp_root);
			needwake = rcu_start_gp(rsp);
			raw_spin_unlock(&rnp_root->lock);
			if (needwake)
@@ -3426,8 +3408,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
	 * CPUs for the current rcu_node structure up the rcu_node tree.
	 */
	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if (rnp->expmaskinit == rnp->expmaskinitnext) {
			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			continue;  /* No new CPUs, nothing to do. */
@@ -3447,8 +3428,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
		rnp_up = rnp->parent;
		done = false;
		while (rnp_up) {
-			raw_spin_lock_irqsave(&rnp_up->lock, flags);
-			smp_mb__after_unlock_lock();
+			raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
			if (rnp_up->expmaskinit)
				done = true;
			rnp_up->expmaskinit |= mask;
@@ -3472,8 +3452,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
 
	sync_exp_reset_tree_hotplug(rsp);
	rcu_for_each_node_breadth_first(rsp, rnp) {
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		WARN_ON_ONCE(rnp->expmask);
		rnp->expmask = rnp->expmaskinit;
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3531,8 +3510,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
		mask = rnp->grpmask;
		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
		rnp = rnp->parent;
-		raw_spin_lock(&rnp->lock); /* irqs already disabled */
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
		WARN_ON_ONCE(!(rnp->expmask & mask));
		rnp->expmask &= ~mask;
	}
@@ -3549,8 +3527,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 {
	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	__rcu_report_exp_rnp(rsp, rnp, wake, flags);
 }
 
@@ -3564,8 +3541,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 {
	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if (!(rnp->expmask & mask)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;
@@ -3708,8 +3684,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 
	sync_exp_reset_tree(rsp);
	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
		/* Each pass checks a CPU for identity, offline, and idle. */
		mask_ofl_test = 0;
@@ -4198,8 +4173,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
	 */
	rnp = rdp->mynode;
	mask = rdp->grpmask;
-	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */
	rnp->qsmaskinitnext |= mask;
	rnp->expmaskinitnext |= mask;
	if (!rdp->beenonline)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9fb4e238d4dc..f32bebb6bc90 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -664,3 +664,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 #else /* #ifdef CONFIG_PPC */
 #define smp_mb__after_unlock_lock()	do { } while (0)
 #endif /* #else #ifdef CONFIG_PPC */
+
+/*
+ * Wrappers for the rcu_node::lock acquire.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values, this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ */
+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
+{
+	raw_spin_lock(&rnp->lock);
+	smp_mb__after_unlock_lock();
+}
+
+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
+{
+	raw_spin_lock_irq(&rnp->lock);
+	smp_mb__after_unlock_lock();
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags)	\
+do {							\
+	typecheck(unsigned long, flags);		\
+	raw_spin_lock_irqsave(&(rnp)->lock, flags);	\
+	smp_mb__after_unlock_lock();			\
+} while (0)
+
+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
+{
+	bool locked = raw_spin_trylock(&rnp->lock);
+
+	if (locked)
+		smp_mb__after_unlock_lock();
+	return locked;
+}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 630c19772630..fa0e3b96a9ed 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -301,8 +301,7 @@ static void rcu_preempt_note_context_switch(void)
		/* Possibly blocking in an RCU read-side critical section. */
		rdp = this_cpu_ptr(rcu_state_p->rda);
		rnp = rdp->mynode;
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		t->rcu_read_unlock_special.b.blocked = true;
		t->rcu_blocked_node = rnp;
 
@@ -457,8 +456,7 @@ void rcu_read_unlock_special(struct task_struct *t)
		 */
		for (;;) {
			rnp = t->rcu_blocked_node;
-			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-			smp_mb__after_unlock_lock();
+			raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
			if (rnp == t->rcu_blocked_node)
				break;
			WARN_ON_ONCE(1);
@@ -989,8 +987,7 @@ static int rcu_boost(struct rcu_node *rnp)
	    READ_ONCE(rnp->boost_tasks) == NULL)
		return 0;  /* Nothing left to boost. */
 
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
	/*
	 * Recheck under the lock: all tasks in need of boosting
@@ -1176,8 +1173,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
			   "rcub/%d", rnp_index);
	if (IS_ERR(t))
		return PTR_ERR(t);
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->boost_kthread_task = t;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	sp.sched_priority = kthread_prio;
@@ -1567,8 +1563,7 @@ static void rcu_prepare_for_idle(void)
		if (!*rdp->nxttail[RCU_DONE_TAIL])
			continue;
		rnp = rdp->mynode;
-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-		smp_mb__after_unlock_lock();
+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
		if (needwake)
@@ -2068,8 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
	bool needwake;
	struct rcu_node *rnp = rdp->mynode;
 
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	needwake = rcu_start_future_gp(rnp, rdp, &c);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	if (needwake)
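For readers outside the kernel tree, the wrapper pattern introduced by this patch can be sketched in ordinary userspace C. The following is an illustration only, not part of the patch: the names node_lock, node_trylock, and mb_after_unlock_lock are hypothetical, and the C11 atomic_thread_fence(memory_order_seq_cst) stands in for the kernel's smp_mb__after_unlock_lock(); POSIX mutexes already order more strongly than the weak UNLOCK+LOCK sequence the commit message describes, so the explicit fence here mirrors the structure of the kernel wrappers rather than fixing a real pthread deficiency.

	/* Userspace sketch of the rnp->lock wrapper pattern (assumed names). */
	#include <pthread.h>
	#include <stdatomic.h>

	struct node {
		pthread_mutex_t lock;	/* init with PTHREAD_MUTEX_INITIALIZER */
	};

	/* Stand-in for smp_mb__after_unlock_lock(): a full memory barrier
	 * issued immediately after a lock acquisition. */
	static inline void mb_after_unlock_lock(void)
	{
		atomic_thread_fence(memory_order_seq_cst);
	}

	/* Analogue of raw_spin_lock_rcu_node(): acquire and strengthen in
	 * one call, so no caller can forget the barrier. */
	static inline void node_lock(struct node *np)
	{
		pthread_mutex_lock(&np->lock);
		mb_after_unlock_lock();
	}

	/* Analogue of raw_spin_trylock_rcu_node(): the barrier is emitted
	 * only when the lock was actually acquired. */
	static inline int node_trylock(struct node *np)
	{
		int locked = (pthread_mutex_trylock(&np->lock) == 0);

		if (locked)
			mb_after_unlock_lock();
		return locked;
	}

The design point is the same one the commit message makes: callers use node_lock()/node_trylock() everywhere they previously wrote a bare acquisition followed by a barrier, so the ordering requirement lives in one place and cannot be silently dropped at any individual call site.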