From 28728dd310d48834cd486dac3cac9ae96b9deb96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 08:33:37 -0800 Subject: [PATCH 01/45] rcu: Make expedited RCU-sched grace period immediately detect idle Currently, sync_sched_exp_handler() will force a reschedule unless this CPU has already checked in or unless a reschedule has already been called for. This is clearly wasteful if sync_sched_exp_handler() interrupted an idle CPU, so this commit immediately reports the quiescent state in that case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328076bd..5f4336fadc28 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3649,6 +3649,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } From 251c617c75f48e03523c43c4ce1dff44bc3ae2bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Jan 2016 10:52:35 -0800 Subject: [PATCH 02/45] rcu: Make expedited RCU-preempt stall warnings count accurately Currently, synchronize_sched_expedited_wait() simply sets the ndetected variable to the rcu_print_task_exp_stall() return value. This means that if the last rcu_node structure has no stalled tasks, record of any stalled tasks in previous rcu_node structures is lost, which can in turn result in failure to dump out the blocking rcu_node structures. Or could, had the test been correct. This commit therefore adds the return value of rcu_print_task_exp_stall() to ndetected and corrects the later test for ndetected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f4336fadc28..687d8a5f35c7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3778,7 +3778,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3797,7 +3797,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) From a1e1224849d9610b50fd1dd7d6f44308a59e46af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Jan 2016 13:57:54 -0800 Subject: [PATCH 03/45] rcu: Make cond_resched_rcu_qs() supply RCU-sched expedited QS Although cond_resched_rcu_qs() supplies quiescent states to all flavors of normal RCU grace periods, it does nothing for expedited RCU-sched grace periods. This commit therefore adds a check for a need for a quiescent state from the current CPU by an expedited RCU-sched grace period, and invokes rcu_sched_qs() to supply that quiescent state if so. Note that the check is racy in that we might be migrated to some other CPU just after checking the per-CPU variable. This is OK because the act of migration will do a context switch, which will supply the needed quiescent state. The only downside is that we might do an unnecessary call to rcu_sched_qs(), but the probability is low and the overhead is small. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 687d8a5f35c7..178575c01d09 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -370,6 +370,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } From 26ece8ef6eca97f19eb5ad5186b8c1a29ab25d76 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 18:48:37 -0800 Subject: [PATCH 04/45] rcu: Fix synchronize_rcu_expedited() header comment This commit brings the synchronize_rcu_expedited() function's header comment into line with the new implementation. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce12..a2ac2628ef8e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -722,13 +722,19 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. */ void synchronize_rcu_expedited(void) { From e087816db9423fdc49302d3cd7ec01e487477a71 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:25:09 -0800 Subject: [PATCH 05/45] rcu: Add event tracing definitions for expedited grace periods Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 78 +++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ef72c4aada56..aacc172eba7e 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -171,6 +171,76 @@ TRACE_EVENT(rcu_grace_period_init, __entry->grplo, __entry->grphi, __entry->qsmask) ); +/* + * Tracepoint for expedited grace-period events. Takes a string identifying + * the RCU flavor, the expedited grace-period sequence number, and a string + * identifying the grace-period-related event as follows: + * + * "snap": Captured snapshot of expedited grace period sequence number. + * "start": Started a real expedited grace period. + * "end": Ended a real expedited grace period. + * "done": Someone else did the expedited grace period for us. + */ +TRACE_EVENT(rcu_exp_grace_period, + + TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent), + + TP_ARGS(rcuname, gpseq, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, gpseq) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->gpseq = gpseq; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %lu %s", + __entry->rcuname, __entry->gpseq, __entry->gpevent) +); + +/* + * Tracepoint for expedited grace-period funnel-locking events. Takes a + * string identifying the RCU flavor, an integer identifying the rcu_node + * combining-tree level, another pair of integers identifying the lowest- + * and highest-numbered CPU associated with the current rcu_node structure, + * and a string. identifying the grace-period-related event as follows: + * + * "acq": Acquired a level of funnel lock + * "rel": Released a level of funnel lock + */ +TRACE_EVENT(rcu_exp_funnel_lock, + + TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi, + const char *gpevent), + + TP_ARGS(rcuname, level, grplo, grphi, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(u8, level) + __field(int, grplo) + __field(int, grphi) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->level = level; + __entry->grplo = grplo; + __entry->grphi = grphi; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %d %d %d %s", + __entry->rcuname, __entry->level, __entry->grplo, + __entry->grphi, __entry->gpevent) +); + /* * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended * to assist debugging of these handoffs. @@ -704,11 +774,15 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ - qsmask) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) +#define trace_rcu_exp_grace_period(rcuname, gqseq, gpevent) \ + do { } while (0) +#define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ + do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) From bea2de44ae647698dc848a671fdee6e53c192423 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:30:06 -0800 Subject: [PATCH 06/45] rcu: Add funnel-locking tracing for expedited grace periods Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++++++---- kernel/rcu/tree_plugin.h | 3 +++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 178575c01d09..79e9206a7b11 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,10 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) + if (rnp) { mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("rel")); + } else if (rdp) { mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, + rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, + TPS("rel")); + } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3619,6 +3627,9 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (sync_exp_work_done(rsp, rnp0, NULL, &rdp->expedited_workdone0, s)) return NULL; + trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, + rnp0->grplo, rnp0->grphi, + TPS("acq")); return rnp0; } } @@ -3634,16 +3645,28 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, TPS("acq")); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) + trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, + rnp0->grplo, rnp0->grphi, TPS("acq")); + if (rnp1) { mutex_unlock(&rnp1->exp_funnel_mutex); - else + trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, + rnp1->grplo, rnp1->grphi, + TPS("rel")); + } else { mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, + rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, + TPS("rel")); + } rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a2ac2628ef8e..cd2dae43ff48 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -767,6 +767,9 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp_unlock->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level, + rnp_unlock->grplo, rnp_unlock->grphi, + TPS("rel")); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 4f41530245c7fd4837152e264d120d05ae940eb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:49:49 -0800 Subject: [PATCH 07/45] rcu: Add expedited-grace-period event tracing Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++------- kernel/rcu/tree_plugin.h | 3 +++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79e9206a7b11..524026fd9dd7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,17 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); if (rnp) { - mutex_unlock(&rnp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("rel")); + mutex_unlock(&rnp->exp_funnel_mutex); } else if (rdp) { - mutex_unlock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("rel")); + mutex_unlock(&rdp->exp_funnel_mutex); } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ @@ -3624,12 +3625,12 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp0 = rcu_get_root(rsp); if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, rnp0->grplo, rnp0->grphi, TPS("acq")); + if (sync_exp_work_done(rsp, rnp0, NULL, + &rdp->expedited_workdone0, s)) + return NULL; return rnp0; } } @@ -3656,16 +3657,16 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, rnp0->grplo, rnp0->grphi, TPS("acq")); if (rnp1) { - mutex_unlock(&rnp1->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, rnp1->grplo, rnp1->grphi, TPS("rel")); + mutex_unlock(&rnp1->exp_funnel_mutex); } else { - mutex_unlock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("rel")); + mutex_unlock(&rdp->exp_funnel_mutex); } rnp1 = rnp0; } @@ -3895,16 +3896,21 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); rnp = exp_funnel_lock(rsp, s); if (rnp == NULL) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, TPS("rel")); mutex_unlock(&rnp->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cd2dae43ff48..36e94aed38a7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -750,12 +750,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); rnp_unlock = exp_funnel_lock(rsp, s); if (rnp_unlock == NULL) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); @@ -766,6 +768,7 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); mutex_unlock(&rnp_unlock->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level, rnp_unlock->grplo, rnp_unlock->grphi, From e2fd9d35847d1936398d44c4df68dceb3d7f64e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 17:23:19 -0800 Subject: [PATCH 08/45] rcu: Remove expedited GP funnel-lock bypass Commit #cdacbe1f91264 ("rcu: Add fastpath bypassing funnel locking") turns out to be a pessimization at high load because it forces a tree full of tasks to wait for an expedited grace period that they probably do not need. This commit therefore removes this optimization. Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 10 +++++----- kernel/rcu/tree.c | 19 ------------------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_trace.c | 7 +++---- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ec6998b1b6d0..00a3a38b375a 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: o "s" is the sequence number, with an odd number indicating that an expedited grace period is in progress. -o "wd0", "wd1", "wd2", and "wd3" are the number of times that an - attempt to start an expedited grace period found that someone - else had completed an expedited grace period that satisfies the - attempted request. "Our work is done." +o "wd1", "wd2", and "wd3" are the number of times that an attempt + to start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the attempted + request. "Our work is done." o "n" is number of times that a concurrent CPU-hotplug operation forced a fallback to a normal grace period. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 524026fd9dd7..62e73e0a929f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3616,25 +3616,6 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; - /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. - */ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, - rnp0->grplo, rnp0->grphi, - TPS("acq")); - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; - } - } - /* * Each pass through the following loop works its way * up the rcu_node tree, returning if others have done the diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..ac9a7b0c36ae 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -388,7 +388,6 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_workdone3; /* # done by others #3. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..d149c412a4e5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); s1 += atomic_long_read(&rdp->expedited_workdone1); s2 += atomic_long_read(&rdp->expedited_workdone2); s3 += atomic_long_read(&rdp->expedited_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); From ec3833ed02ae6ef2a933ece9de7cbab0c64c699e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jan 2016 16:29:29 -0800 Subject: [PATCH 09/45] rcu: Force boolean subscript for expedited stall warnings The cpu_online() function can return values other than 0 and 1, which can result in subscript overflow when applied to a two-element array. This commit allows for this behavior by using "!!" on the return value from cpu_online() when used as a subscript. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62e73e0a929f..64c2e3288551 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3808,7 +3808,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } From d40a4f09a448382961fa9b1a2f7d4f34813f0273 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2016 14:43:44 -0800 Subject: [PATCH 10/45] rcu: Shorten expedited_workdone* to exp_workdone* Just a name change to save a few lines and a bit of typing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++----- kernel/rcu/tree.h | 6 +++--- kernel/rcu/tree_trace.c | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 64c2e3288551..89f028767765 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3624,15 +3624,14 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("acq")); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, @@ -3651,8 +3650,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } rnp1 = rnp0; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s)) return NULL; return rnp1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ac9a7b0c36ae..6a8f09446924 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -388,9 +388,9 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d149c412a4e5..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -189,9 +189,9 @@ static int show_rcuexp(struct seq_file *m, void *v) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s1, s2, s3, From f6a12f34a448cc8a624070fd365c29c890138a48 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 17:57:35 -0800 Subject: [PATCH 11/45] rcu: Enforce expedited-GP fairness via funnel wait queue The current mutex-based funnel-locking approach used by expedited grace periods is subject to severe unfairness. The problem arises when a few tasks, making a path from leaves to root, all wake up before other tasks do. A new task can then follow this path all the way to the root, which needlessly delays tasks whose grace period is done, but who do not happen to acquire the lock quickly enough. This commit avoids this problem by maintaining per-rcu_node wait queues, along with a per-rcu_node counter that tracks the latest grace period sought by an earlier task to visit this node. If that grace period would satisfy the current task, instead of proceeding up the tree, it waits on the current rcu_node structure using a pair of wait queues provided for that purpose. This decouples awakening of old tasks from the arrival of new tasks. If the wakeups prove to be a bottleneck, additional kthreads can be brought to bear for that purpose. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 5 +- kernel/rcu/tree.c | 157 +++++++++++++++++++------------------ kernel/rcu/tree.h | 10 +-- kernel/rcu/tree_plugin.h | 16 ++-- 4 files changed, 94 insertions(+), 94 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index aacc172eba7e..d3e756539d44 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -179,6 +179,7 @@ TRACE_EVENT(rcu_grace_period_init, * "snap": Captured snapshot of expedited grace period sequence number. * "start": Started a real expedited grace period. * "end": Ended a real expedited grace period. + * "endwake": Woke piggybackers up. * "done": Someone else did the expedited grace period for us. */ TRACE_EVENT(rcu_exp_grace_period, @@ -210,8 +211,8 @@ TRACE_EVENT(rcu_exp_grace_period, * and highest-numbered CPU associated with the current rcu_node structure, * and a string. identifying the grace-period-related event as follows: * - * "acq": Acquired a level of funnel lock - * "rel": Released a level of funnel lock + * "nxtlvl": Advance to next level of rcu_node funnel + * "wait": Wait for someone else to do expedited GP */ TRACE_EVENT(rcu_exp_funnel_lock, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 89f028767765..bd2658edce00 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,7 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -3484,7 +3485,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3500,8 +3501,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3538,7 +3539,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3551,8 +3552,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3570,7 +3571,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3579,24 +3579,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - if (rnp) { - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("rel")); - mutex_unlock(&rnp->exp_funnel_mutex); - } else if (rdp) { - trace_rcu_exp_funnel_lock(rsp->name, - rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, - TPS("rel")); - mutex_unlock(&rdp->exp_funnel_mutex); - } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3606,53 +3593,53 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, TPS("acq")); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, - rnp0->grplo, rnp0->grphi, TPS("acq")); - if (rnp1) { - trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, - rnp1->grplo, rnp1->grphi, - TPS("rel")); - mutex_unlock(&rnp1->exp_funnel_mutex); - } else { - trace_rcu_exp_funnel_lock(rsp->name, - rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, - TPS("rel")); - mutex_unlock(&rdp->exp_funnel_mutex); + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x1], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } - rnp1 = rnp0; + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s)) - return NULL; - return rnp1; + mutex_lock(&rsp->exp_mutex); + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -3841,6 +3828,27 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); + } +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3860,7 +3868,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3877,20 +3884,23 @@ void synchronize_sched_expedited(void) s = rcu_exp_gp_seq_snap(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, TPS("rel")); - mutex_unlock(&rnp->exp_funnel_mutex); + rcu_exp_wake(rsp, s); + + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4190,7 +4200,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4448,10 +4457,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4510,9 +4517,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a8f09446924..f9d4fbb1e014 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,7 +248,9 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[2]; } ____cacheline_internodealigned_in_smp; /* @@ -387,7 +385,6 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ @@ -504,6 +501,7 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 36e94aed38a7..c82c3640493f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -738,8 +738,6 @@ static void sync_rcu_exp_handler(void *info) */ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -752,8 +750,7 @@ void synchronize_rcu_expedited(void) s = rcu_exp_gp_seq_snap(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); @@ -763,16 +760,13 @@ void synchronize_rcu_expedited(void) sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level, - rnp_unlock->grplo, rnp_unlock->grphi, - TPS("rel")); + rcu_exp_wake(rsp, s); + + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 356051e1de3cf65575da4ee92d1f5cee86677ee2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 13:22:53 -0700 Subject: [PATCH 12/45] rcu: Add exp_funnel_lock() fastpath This commit speeds up the low-contention case, especially for systems with large rcu_node trees, by attempting to directly acquire the ->exp_mutex. This fastpath checks the leaves and root first in order to avoid excessive memory contention on the mutex itself. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd2658edce00..892a140ae7b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3603,6 +3603,15 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* * Each pass through the following loop works its way up @@ -3635,6 +3644,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grphi, TPS("nxtlvl")); } mutex_lock(&rsp->exp_mutex); +fastpath: if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { mutex_unlock(&rsp->exp_mutex); return true; From 4ea3e85b113ab37a2d55cfabf0d709ddec088bb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:22:25 -0700 Subject: [PATCH 13/45] rcu: Consolidate expedited GP code into rcu_exp_wait_wake() Currently, synchronize_rcu_expedited() and rcu_sched_expedited() have significant duplicate code. This commit therefore consolidates some of this code into rcu_exp_wake(), which is now renamed to rcu_exp_wait_wake() in recognition of its added responsibilities. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- kernel/rcu/tree_plugin.h | 10 ++-------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 892a140ae7b6..fd86eca9478e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3839,14 +3839,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } /* - * Wake up everyone who piggybacked on the just-completed expedited + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited * grace period. Also update all the ->exp_seq_rq counters as needed * in order to avoid counter-wrap problems. */ -static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) { struct rcu_node *rnp; + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); @@ -3857,6 +3861,8 @@ static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) } wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } /** @@ -3904,13 +3910,7 @@ void synchronize_sched_expedited(void) sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); /* Wait and clean up, including waking everyone. */ - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - rcu_exp_wake(rsp, s); - - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c82c3640493f..b6d5dde6eab9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -759,14 +759,8 @@ void synchronize_rcu_expedited(void) /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - rcu_exp_wake(rsp, s); - - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 179e5dcd1e5bdfac1128431d131b31322aedd2bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:27:44 -0700 Subject: [PATCH 14/45] rcu: Consolidate expedited GP tracing into rcu_exp_gp_seq_snap() This commit moves some duplicate code from synchronize_rcu_expedited() and synchronize_sched_expedited() into rcu_exp_gp_seq_snap(). This doesn't save lines of code, but does eliminate a "tell me twice" issue. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- kernel/rcu/tree_plugin.h | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fd86eca9478e..5b1c8fd89af0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3392,8 +3392,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3898,8 +3902,6 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b6d5dde6eab9..529a44085a63 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -748,8 +748,6 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ From aff12cdf86e6fa891d1c30c0fad112d138bd7b10 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:32:24 -0700 Subject: [PATCH 15/45] rcu: Consolidate expedited GP code into exp_funnel_lock() This commit pulls the grace-period-start counter adjustment and tracing from synchronize_rcu_expedited() and synchronize_sched_expedited() into exp_funnel_lock(), thus eliminating some code duplication. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++--- kernel/rcu/tree_plugin.h | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5b1c8fd89af0..e8fff14e417b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3653,6 +3653,8 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_mutex); return true; } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); return false; } @@ -3905,9 +3907,6 @@ void synchronize_sched_expedited(void) if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 529a44085a63..ff1cd4e1188d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -751,9 +751,6 @@ void synchronize_rcu_expedited(void) if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); From 3b5f668e715bc19610ad967ef97a7e8c55a186ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:47:55 -0700 Subject: [PATCH 16/45] rcu: Overlap wakeups with next expedited grace period The current expedited grace-period implementation makes subsequent grace periods wait on wakeups for the prior grace period. This does not fit the dictionary definition of "expedited", so this commit allows these two phases to overlap. Doing this requires four waitqueues rather than two because tasks can now be waiting on the previous, current, and next grace periods. The fourth waitqueue makes the bit masking work out nicely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++++++--- kernel/rcu/tree.h | 3 ++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e8fff14e417b..1df100cb7a62 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -3637,7 +3638,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x1], + wait_event(rnp->exp_wq[(s >> 1) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -3857,6 +3858,14 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); @@ -3865,10 +3874,10 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + mutex_unlock(&rsp->exp_wake_mutex); } /** @@ -4530,6 +4539,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rcu_init_one_nocb(rnp); init_waitqueue_head(&rnp->exp_wq[0]); init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f9d4fbb1e014..1194ab0da56a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -250,7 +250,7 @@ struct rcu_node { spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; - wait_queue_head_t exp_wq[2]; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -502,6 +502,7 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ From 86057b80ae31d37fcbdb5f57d15aaf1148c69f96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 08:48:36 -0800 Subject: [PATCH 17/45] rcu: Awaken grace-period kthread when stalled Recent kernels can fail to awaken the grace-period kthread for quiescent-state forcing. This commit is a crude hack that does a wakeup any time a stall is detected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328076bd..a327a253c178 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1224,8 +1224,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } From fcfd0a237bfcf0c314005007e9d76e55a25e2bad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Jan 2016 16:42:18 -0800 Subject: [PATCH 18/45] rcu: Make FQS schedule advance only if FQS happened Currently, the force-quiescent-state (FQS) code in rcu_gp_kthread() can advance the next FQS even if one was not executed last time. This can happen due timeout-duration uncertainty. This commit therefore avoids advancing the FQS schedule unless an FQS was just executed. In the corner case where an FQS was not executed, but is due now, the code does a one-jiffy wait. This change prepares for kthread kicking. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a327a253c178..6116cfad18ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2146,6 +2146,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2154,14 +2163,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } From 8c7c4829a81c1838f18c12ce5a3a5c29a08bf0a8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Jan 2016 20:29:57 -0800 Subject: [PATCH 19/45] rcu: Awaken grace-period kthread if too long since FQS Recent kernels can fail to awaken the grace-period kthread for quiescent-state forcing. This commit is a crude hack that does a wakeup if a scheduling-clock interrupt sees that it has been too long since force-quiescent-state (FQS) processing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 ++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6116cfad18ff..a739292be605 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -385,9 +385,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -1251,6 +1253,24 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1262,6 +1282,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1335,6 +1360,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1379,8 +1409,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2119,8 +2151,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..34d3973f7223 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -513,6 +513,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ From 293e2421fe25839500207eda123cc4475f8d17b8 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 23 Mar 2016 23:11:48 +0800 Subject: [PATCH 20/45] rcu: Remove superfluous versions of rcu_read_lock_sched_held() Currently, we have four versions of rcu_read_lock_sched_held(), depending on the combined choices on PREEMPT_COUNT and DEBUG_LOCK_ALLOC. However, there is an existing function preemptible() that already distinguishes between the PREEMPT_COUNT=y and PREEMPT_COUNT=n cases, and allows these four implementations to be consolidated down to two. This commit therefore uses preemptible() to achieve this consolidation. Note that there could be a small performance regression in the case of CONFIG_DEBUG_LOCK_ALLOC=y && PREEMPT_COUNT=n. However, given the overhead associated with CONFIG_DEBUG_LOCK_ALLOC=y, this should be down in the noise. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 17 +---------------- kernel/rcu/update.c | 4 ++-- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 45de591657a6..5f1533e3d032 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void); * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. */ -#ifdef CONFIG_PREEMPT_COUNT int rcu_read_lock_sched_held(void); -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { - return preempt_count() != 0 || irqs_disabled(); + return !preemptible(); } -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938..3ccdc8eebc5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif From 5dffed1e5721f6deae4fd67d32386ef037c5fc56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2016 11:54:28 -0800 Subject: [PATCH 21/45] rcu: Dump ftrace buffer when kicking grace-period kthread If it is necessary to kick the grace-period kthread, that is a good time to dump the trace buffer in order to learn why kicking was needed. This commit therefore does the dump. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a739292be605..86edb92276d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1266,6 +1266,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) j = READ_ONCE(rsp->jiffies_kick_kthreads); if (time_after(jiffies, j) && rsp->gp_kthread) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); } From fd35be623a1534bde57029c429b206d6c22a1ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jan 2016 13:13:12 -0800 Subject: [PATCH 22/45] rcutorture: Update scripting to accommodate rcuperf This commit adds the scripting changes to add support for the shiny new rcuperf kernel module. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-recheck-rcuperf.sh | 82 +++++++++++++++++++ .../selftests/rcutorture/bin/kvm-recheck.sh | 5 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- .../rcutorture/configs/rcuperf/CFLIST | 1 + .../rcutorture/configs/rcuperf/CFcommon | 2 + .../selftests/rcutorture/configs/rcuperf/TREE | 19 +++++ .../configs/rcuperf/ver_functions.sh | 52 ++++++++++++ 7 files changed, 161 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TREE create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh new file mode 100755 index 000000000000..e5b28174fda0 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements. +# +# Usage: kvm-recheck-rcuperf.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +if test -d $i +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +. tools/testing/selftests/rcutorture/bin/functions.sh + +configfile=`echo $i | sed -e 's/^.*\///'` + +grep -e '-perf:.*writer-duration' $i/console.log | sed -e 's/^\[[^]]*]//' | +awk ' +{ + gptimes[++n] = $5 / 1000.; + sum += $5 / 1000.; +} + +END { + if (NR <= 0) { + print "No rcuperf records found???" + exit; + } + asort(gptimes); + pct50 = int(NR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(NR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(NR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= NR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Average grace-period duration: " sum / NR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[NR]; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index d86bdd6b6cc2..f659346d3358 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -48,7 +48,10 @@ do cat $i/Make.oldconfig.err fi parse-build.sh $i/Make.out $configfile - parse-torture.sh $i/console.log $configfile + if test "$TORTURE_SUITE" != rcuperf + then + parse-torture.sh $i/console.log $configfile + fi parse-console.sh $i/console.log $configfile if test -r $i/Warnings then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 4a431767f77a..c33cb582b3dc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -156,7 +156,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' TORTURE_SUITE=$2 shift ;; diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST new file mode 100644 index 000000000000..c9f56cf20775 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST @@ -0,0 +1 @@ +TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon new file mode 100644 index 000000000000..a09816b8c0f3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_PERF_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE new file mode 100644 index 000000000000..614e107f6db5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -0,0 +1,19 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh new file mode 100644 index 000000000000..34f2a1b35ee5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney + +# rcuperf_param_nreaders bootparam-string +# +# Adds nreaders rcuperf module parameter if not already specified. +rcuperf_param_nreaders () { + if ! echo "$1" | grep -q "rcuperf.nreaders" + then + echo rcuperf.nreaders=-1 + fi +} + +# rcuperf_param_nwriters bootparam-string +# +# Adds nwriters rcuperf module parameter if not already specified. +rcuperf_param_nwriters () { + if ! echo "$1" | grep -q "rcuperf.nwriters" + then + echo rcuperf.nwriters=-1 + fi +} + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. +per_version_boot_params () { + echo $1 `rcuperf_param_nreaders "$1"` \ + `rcuperf_param_nwriters "$1"` \ + rcuperf.perf_runnable=1 \ + rcuperf.shutdown=1 \ + rcuperf.verbose=1 +} From 9efafb8849f732a3497f46f178b350c9ff7cfe27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 18:11:47 -0800 Subject: [PATCH 23/45] rcutorture: Allow for rcupdate.rcu_normal Currently, rcu_torture_writer() checks only for rcu_gp_is_expedited() when deciding whether or not to do dynamic control of RCU expediting. This means that if rcupdate.rcu_normal is specified, rcu_torture_writer() will attempt to dynamically control RCU expediting, but will nonetheless only test normal RCU grace periods. This commit therefore adds a check for !rcu_gp_is_normal(), and prints a message and desists from testing dynamic control of RCU expediting when doing so is futile. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 463867c43221..9234e75b106a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +932,7 @@ rcu_torture_writer(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", From 291783b8ad77a83a6fdf91d55eee7f1ad72ed4d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 13:43:30 -0800 Subject: [PATCH 24/45] rcutorture: Expedited-GP batch progress access to torturing This commit provides rcu_exp_batches_completed() and rcu_exp_batches_completed_sched() functions to allow torture-test modules to check how many expedited grace period batches have completed. These are analogous to the existing rcu_batches_completed(), rcu_batches_completed_bh(), and rcu_batches_completed_sched() functions. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 16 ++++++++++++++++ include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 64809aea661c..93aea75029fb 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void) return 0; } +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index ad1eda9fa4da..5043cb823fb2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328076bd..88df64087dfe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -459,6 +459,28 @@ unsigned long rcu_batches_completed_bh(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +/* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + /* * Force a quiescent state. */ From 8704baab9bc848b58c129fed6b591bb84ec02f41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 18:33:22 -0800 Subject: [PATCH 25/45] rcutorture: Add RCU grace-period performance tests This commit adds a new rcuperf module that carries out simple performance tests of RCU grace periods. Signed-off-by: Paul E. McKenney --- kernel/rcu/Makefile | 1 + kernel/rcu/rcuperf.c | 637 +++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 33 +++ 3 files changed, 671 insertions(+) create mode 100644 kernel/rcu/rcuperf.c diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..9d54a57bee7d --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,637 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. McKenney + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) +#define RCUPERF_RUNNABLE_INIT 1 +#else +#define RCUPERF_RUNNABLE_INIT 0 +#endif +static int perf_runnable = RCUPERF_RUNNABLE_INIT; +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. + */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. + */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. + */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); + WARN_ON(rcu_gp_is_normal() && gp_exp); + WARN_ON(!wdpp); + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, NULL, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) + goto unwind; + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca..f4b797a690ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,39 @@ config TORTURE_TEST tristate default n +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_PERF_TEST_RUNNABLE + bool "performance tests for RCU runnable by default" + depends on RCU_PERF_TEST = y + default n + help + This option provides a way to build the RCU performance tests + directly into the kernel without them starting up at boot time. + You can use /sys/module to manually override this setting. + This /proc file is available only when the RCU performance + tests have been built into the kernel. + + Say Y here if you want the RCU performance tests to start during + boot (you probably don't). + Say N here if you want the RCU performance tests to start only + after being manually enabled via /sys/module. + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL From bdea9e347783c2724997db7c5d5b45a301e2dc90 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Jan 2016 13:47:19 -0800 Subject: [PATCH 26/45] rcutorture: Documentation for rcuperf kernel parameters This commit adds documentation for the new rcuperf module's kernel boot parameters. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ecc74fa4bfde..951af481da5a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3284,6 +3284,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcuperf.gp_exp= [KNL] + Measure performance of expedited synchronous + grace-period primitives. + + rcuperf.nreaders= [KNL] + Set number of RCU readers. The value -1 selects + N, where N is the number of CPUs. A value + "n" less than -1 selects N-n+1, where N is again + the number of CPUs. For example, -2 selects N + (the number of CPUs), -3 selects N+1, and so on. + A value of "n" less than or equal to -N selects + a single reader. + + rcuperf.nwriters= [KNL] + Set number of RCU writers. The values operate + the same as for rcuperf.nreaders. + N, where N is the number of CPUs + + rcuperf.perf_runnable= [BOOT] + Start rcuperf running at boot time. + + rcuperf.shutdown= [KNL] + Shut the system down after performance tests + complete. This is useful for hands-off automated + testing. + + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + + rcuperf.verbose= [KNL] + Enable additional printk() statements. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. From 6b558c4c7a4ba410e39dbcb9d4c2b6e928c09308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 14:15:40 -0800 Subject: [PATCH 27/45] rcutorture: Bind rcuperf reader/writer kthreads to CPUs This commit forces more deterministic behavior by binding rcuperf's rcu_perf_reader() and rcu_perf_writer() kthreads to their respective CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 9d54a57bee7d..7a1edf417d18 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -328,8 +328,10 @@ rcu_perf_reader(void *arg) { unsigned long flags; int idx; + long me = (long)arg; VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_rcu_perf_reader_started); @@ -362,6 +364,7 @@ rcu_perf_writer(void *arg) WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; @@ -594,7 +597,7 @@ rcu_perf_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, NULL, + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; From 2094c99558d9e9374210898f65f5862f7a2e8bed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 15:17:21 -0800 Subject: [PATCH 28/45] rcutorture: Set rcuperf writer kthreads to real-time priority This commit forces more deterministic update-side behavior by setting rcuperf's rcu_perf_writer() kthreads to real-time priority. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a1edf417d18..e18d016a9888 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -355,6 +355,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -365,6 +366,8 @@ rcu_perf_writer(void *arg) WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; From e588f35492227cc4ab2cbfe95fd5f993a5086f9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 17:26:35 -0800 Subject: [PATCH 29/45] rcutorture: Print measure of batching efficiency This commit adds a line giving the number of grace periods, the number of batches, and the ratio. The larger the ratio, the greater the batching efficiency. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-recheck-rcuperf.sh | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh index e5b28174fda0..1f72df8eedc7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -34,33 +34,38 @@ fi configfile=`echo $i | sed -e 's/^.*\///'` -grep -e '-perf:.*writer-duration' $i/console.log | sed -e 's/^\[[^]]*]//' | +sed -e 's/^\[[^]]*]//' < $i/console.log | awk ' -{ +/-perf: .* gps: .* batches:/ { + ngps = $9; + nbatches = $11; +} + +/-perf: .*writer-duration/ { gptimes[++n] = $5 / 1000.; sum += $5 / 1000.; } END { - if (NR <= 0) { + newNR = asort(gptimes); + if (newNR <= 0) { print "No rcuperf records found???" exit; } - asort(gptimes); - pct50 = int(NR * 50 / 100); + pct50 = int(newNR * 50 / 100); if (pct50 < 1) pct50 = 1; - pct90 = int(NR * 90 / 100); + pct90 = int(newNR * 90 / 100); if (pct90 < 1) pct90 = 1; - pct99 = int(NR * 99 / 100); + pct99 = int(newNR * 99 / 100); if (pct99 < 1) pct99 = 1; div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; print "Histogram bucket size: " div; last = gptimes[1] - 10; count = 0; - for (i = 1; i <= NR; i++) { + for (i = 1; i <= newNR; i++) { current = div * int(gptimes[i] / div); if (last == current) { count++; @@ -73,10 +78,11 @@ END { } if (count > 0) print last, count; - print "Average grace-period duration: " sum / NR " microseconds"; + print "Average grace-period duration: " sum / newNR " microseconds"; print "Minimum grace-period duration: " gptimes[1]; print "50th percentile grace-period duration: " gptimes[pct50]; print "90th percentile grace-period duration: " gptimes[pct90]; print "99th percentile grace-period duration: " gptimes[pct99]; - print "Maximum grace-period duration: " gptimes[NR]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; }' From ac2bb275e8e5abddb0815ff2b7aa383ed6d007a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 Jan 2016 14:58:17 -0800 Subject: [PATCH 30/45] rcutorture: Make rcuperf collect expedited event-trace data This commit enables ftrace in the rcuperf TREE kernel build and adds an ftrace_dump() at the end of rcuperf processing. This data will be used to measure the actual durations of the expedited grace periods without the added delays inherent in the kernel-module measurements. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 1 + tools/testing/selftests/rcutorture/configs/rcuperf/TREE | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e18d016a9888..12561f96f0a2 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -404,6 +404,7 @@ rcu_perf_writer(void *arg) perf_type, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { + rcu_ftrace_dump(DUMP_ALL); PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; if (gp_exp) { diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE index 614e107f6db5..a312f671a29a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -17,3 +17,4 @@ CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y From 2b03d038457fc8d694d34981cb0a2f1702ba35d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 16:51:36 -0800 Subject: [PATCH 31/45] rcutorture: Make scripts analyze rcuperf trace data, if present The rcuperf event-trace data is more accurate than are the rcuperf printk()s because locking keeps things ordered. This commit therefore parses and analyzes this event-trace data if present, and falls back on the printk()s otherwise. Signed-off-by: Paul E. McKenney --- .../bin/kvm-recheck-rcuperf-ftrace.sh | 121 ++++++++++++++++++ .../rcutorture/bin/kvm-recheck-rcuperf.sh | 8 ++ 2 files changed, 129 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh new file mode 100755 index 000000000000..f79b0e9e84fc --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements, +# looking for ftrace data. Exits with 0 if data was found, analyzed, and +# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after +# argument checking. +# +# Usage: kvm-recheck-rcuperf-ftrace.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +. tools/testing/selftests/rcutorture/bin/functions.sh + +if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 +then + exit 10 +fi + +sed -e 's/^\[[^]]*]//' < $i/console.log | +grep 'us : rcu_exp_grace_period' | +sed -e 's/us : / : /' | +tr -d '\015' | +awk ' +$8 == "start" { + if (starttask != "") + nlost++; + starttask = $1; + starttime = $3; + startseq = $7; +} + +$8 == "end" { + if (starttask == $1 && startseq == $7) { + curgpdur = $3 - starttime; + gptimes[++n] = curgpdur; + gptaskcnt[starttask]++; + sum += curgpdur; + if (curgpdur > 1000) + print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; + starttask = ""; + } else { + # Lost a message or some such, reset. + starttask = ""; + nlost++; + } +} + +$8 == "done" { + piggybackcnt[$1]++; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No ftrace records found???" + exit 10; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Distribution of grace periods across tasks:"; + for (i in gptaskcnt) { + print "\t" i, gptaskcnt[i]; + nbatches += gptaskcnt[i]; + } + ngps = nbatches; + print "Distribution of piggybacking across tasks:"; + for (i in piggybackcnt) { + print "\t" i, piggybackcnt[i]; + ngps += piggybackcnt[i]; + } + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; + print "Computed from ftrace data."; +}' +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh index 1f72df8eedc7..8f3121afc716 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -30,8 +30,15 @@ else echo Unreadable results directory: $i exit 1 fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . tools/testing/selftests/rcutorture/bin/functions.sh +if kvm-recheck-rcuperf-ftrace.sh $i +then + # ftrace data was successfully analyzed, call it good! + exit 0 +fi + configfile=`echo $i | sed -e 's/^.*\///'` sed -e 's/^\[[^]]*]//' < $i/console.log | @@ -85,4 +92,5 @@ END { print "99th percentile grace-period duration: " gptimes[pct99]; print "Maximum grace-period duration: " gptimes[newNR]; print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; + print "Computed from rcuperf printk output."; }' From df37e66bfdbb57e8cae7dbf39a0c66b1b8701338 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 20:56:38 -0800 Subject: [PATCH 32/45] rcutorture: Add rcuperf holdoff boot parameter to reduce interference Boot-time activity can legitimately grab CPUs for extended time periods, so the commit adds a boot parameter to delay the start of the performance test until boot has completed. Defaults to 10 seconds. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++++++ kernel/rcu/rcuperf.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 951af481da5a..da9ee466789b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3288,6 +3288,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Measure performance of expedited synchronous grace-period primitives. + rcuperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 12561f96f0a2..278600143bb6 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -59,6 +59,7 @@ MODULE_AUTHOR("Paul E. McKenney "); do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); @@ -368,6 +369,10 @@ rcu_perf_writer(void *arg) set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; From 620316e52a923811fe9a77ceb43eebf5f507d375 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 21:32:09 -0800 Subject: [PATCH 33/45] rcutorture: Avoid RCU CPU stall warning and RT throttling Running rcuperf can result in RCU CPU stall warnings and RT throttling. These occur because on of the real-time writer processes does ftrace_dump() while still running at real-time priority. This commit therefore prevents these problems by setting the writer thread back to SCHED_NORMAL (AKA SCHED_OTHER) before doing ftrace_dump(). In addition, this commit adds a small fixed delay before dumping ftrace buffer in order to decrease the probability that this dumping will interfere with other writers' grace periods. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 278600143bb6..4c0572859ff0 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -404,11 +404,15 @@ rcu_perf_writer(void *arg) started = true; if (!done && i >= MIN_MEAS) { done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); pr_alert("%s" PERF_FLAG "rcu_perf_writer %ld has %d measurements\n", perf_type, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { + schedule_timeout_interruptible(10); rcu_ftrace_dump(DUMP_ALL); PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; From dba6f1bab8920a6f78b0dc21976afdecf82fba3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Feb 2016 16:39:38 -0800 Subject: [PATCH 34/45] rcutorture: Add largish-system rcuperf scenario This commit adds an rcuperf scenario named TREE54 that uses 54 CPUs and provides a four-level rcu_node combining tree. Signed-off-by: Paul E. McKenney --- .../rcutorture/configs/rcuperf/TREE54 | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 new file mode 100644 index 000000000000..985fb170d13c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 @@ -0,0 +1,23 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=54 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_FANOUT=3 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y From e6fb1fc1085e5b5155bc8f3d3385c48b8bdde95e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Sun, 7 Feb 2016 13:31:39 +0100 Subject: [PATCH 35/45] rcuperf: Do not wake up shutdown wait queue if "shutdown" is false. After finishing its tests rcuperf tries to wake up shutdown_wq even if "shutdown" param is set to false, resulting in a wake_up() call on an unitialized wait_queue_head_t which leads to "BUG: spinlock bad magic" and "BUG: unable to handle kernel NULL pointer dereference". Fix by checking "shutdown" param before waking up the queue. Signed-off-by: Artem Savkov --- kernel/rcu/rcuperf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 4c0572859ff0..3cee0d8393ed 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -423,8 +423,10 @@ rcu_perf_writer(void *arg) b_rcu_perf_writer_finished = cur_ops->completed(); } - smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } } } if (done && !alldone && From 67522beecfc75d133514dda64107ee19125a74b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Mar 2016 08:52:19 -0800 Subject: [PATCH 36/45] rcutorture: Remove redundant initialization to zero The current code initializes the global per-CPU variables rcu_torture_count and rcu_torture_batch to zero. However, C does this initialization by default, and explicit initialization of per-CPU variables now needs a different syntax if "make tags" is to work. This commit therefore removes the initialization. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9234e75b106a..52b49fe90919 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; From de26ca19a530d2d822a6816834d22022e94b2e53 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 17 Mar 2016 11:14:35 +0100 Subject: [PATCH 37/45] rcutorture: Consider FROZEN hotplug notifier transitions The hotplug notifier rcutorture_cpu_notify() doesn't consider the corresponding CPU_XXX_FROZEN transitions. They occur on suspend/resume and are usually handled the same way as the corresponding non frozen transitions. Mask the switch case action argument with '~CPU_TASKS_FROZEN' to map CPU_XXX_FROZEN hotplug transitions on corresponding non-frozen transitions. Cc: Josh Triplett Cc: "Paul E. McKenney" Signed-off-by: Anna-Maria Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 52b49fe90919..633a68a09440 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1585,7 +1585,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); From 9eb5188a0704bd21eb7e4aef83b904fad43d3ec8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Mar 2016 15:36:40 -0700 Subject: [PATCH 38/45] torture: Clarify refusal to run more than one torture test This commit clarifies error messages -- you only get to run one torture test at a time! Signed-off-by: Paul E. McKenney --- kernel/torture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..e912ccd960f0 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -602,8 +602,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } From fb2c66af10f92bc83659c4d8a32e02287f0e5dda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Mar 2016 14:44:42 -0700 Subject: [PATCH 39/45] torture: Kill qemu, not parent process The current hang-check machinery in the rcutorture scripts uses "$!" of a parenthesized bash statement to capture the pid. Unfortunately, this captures not qemu's pid, but rather that of its parent that implements the parenthesized statement. This commit therefore adjusts things so as to capture qemu's actual pid, which then allows the script to actually kill qemu in event of a kernel hang. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0f80eefb0bfd..2eb8fefbe7d9 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -168,14 +168,25 @@ then fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & -qemu_pid=$! +( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 -echo Monitoring qemu job at pid $qemu_pid +sleep 10 # Give qemu's pid a chance to reach the file +if test -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid +else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid +fi while : do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then if test $kruntime -ge $seconds then @@ -195,12 +206,16 @@ do ps -fp $killpid >> $resdir/Warnings 2>&1 fi else - echo ' ---' `date`: Kernel done + echo ' ---' `date`: "Kernel done" fi break fi done -if test $commandcompleted -eq 0 +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" then echo Grace period for qemu job at pid $qemu_pid while : @@ -220,6 +235,9 @@ then fi sleep 1 done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command fi parse-torture.sh $resdir/console.log $title From 480b1eb659f65be8ed039f1a9db3f762c41c9770 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Mar 2016 10:50:38 -0700 Subject: [PATCH 40/45] rcutorture: Convert test duration to seconds early This commit converts test duration from minutes to seconds early on in order to prepare for upcoming OS-jitter-injection changes. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 5 ++--- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 2eb8fefbe7d9..73a265668421 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -6,7 +6,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args # # qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with # arguments specifying the number of CPUs and other @@ -123,8 +123,7 @@ while test -f $builddir.ready do sleep 1 done -minutes=$4 -seconds=$(($minutes * 60)) +seconds=$4 qemu_args=$5 boot_args=$6 diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index c33cb582b3dc..704e219f67a7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -34,7 +34,7 @@ T=/tmp/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T -dur=30 +dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH @@ -116,7 +116,7 @@ do ;; --duration) checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$2 + dur=$(($2*60)) shift ;; --interactive) From 6e524a603f0b72281019e4ec29b1022388f9f231 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Mar 2016 14:22:26 -0700 Subject: [PATCH 41/45] rcutorture: Add OS-jitter capability This commit adds a --jitter OS-jitter capability to expose bugs based on no-delay assumptions. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/jitter.sh | 90 +++++++++++++++++++ tools/testing/selftests/rcutorture/bin/kvm.sh | 18 ++++ 2 files changed, 108 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/jitter.sh diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh new file mode 100755 index 000000000000..3633828375e3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Alternate sleeping and spinning on randomly selected CPUs. The purpose +# of this script is to inflict random OS jitter on a concurrently running +# test. +# +# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# +# me: Random-number-generator seed salt. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +me=$(($1 * 1000)) +duration=$2 +sleepmax=${3-1000000} +spinmax=${4-1000} + +n=1 + +starttime=`awk 'BEGIN { print systime(); }' < /dev/null` + +while : +do + # Check for done. + t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null` + if test "$t" -gt "$duration" + then + exit 0; + fi + + # Set affinity to randomly selected CPU + cpus=`ls /sys/devices/system/cpu/*/online | + sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' | + grep -v '^0*$'` + cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { + srand(n + me + systime()); + ncpus = split(cpus, ca); + curcpu = ca[int(rand() * ncpus + 1)]; + mask = lshift(1, curcpu); + if (mask + 0 <= 0) + mask = 1; + printf("%#x\n", mask); + }' < /dev/null` + n=$(($n+1)) + if ! taskset -p $cpumask $$ > /dev/null 2>&1 + then + echo taskset failure: '"taskset -p ' $cpumask $$ '"' + exit 1 + fi + + # Sleep a random duration + sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * sleepmax)); + }' < /dev/null` + n=$(($n+1)) + sleep .$sleeptime + + # Spin a random duration + limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * spinmax)); + }' < /dev/null` + n=$(($n+1)) + for i in {1..$limit} + do + echo > /dev/null + done +done + +exit 1 diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 704e219f67a7..0d598145873e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -48,6 +48,7 @@ resdir="" configs="" cpus=0 ds=`date +%Y.%m.%d-%H:%M:%S` +jitter=0 . functions.sh @@ -63,6 +64,7 @@ usage () { echo " --dryrun sched|script" echo " --duration minutes" echo " --interactive" + echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --no-initrd" @@ -122,6 +124,11 @@ do --interactive) TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE ;; + --jitter) + checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$' + jitter="$2" + shift + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -299,6 +306,7 @@ awk < $T/cfgcpu.pack \ -v CONFIGDIR="$CONFIGFRAG/" \ -v KVM="$KVM" \ -v ncpus=$cpus \ + -v jitter="$jitter" \ -v rd=$resdir/$ds/ \ -v dur=$dur \ -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \ @@ -359,6 +367,16 @@ function dump(first, pastlast, batchnum) print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log"; print "fi" } + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + for (j = 0; j < njitter; j++) + print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "wait" print "if test -z \"$TORTURE_BUILDONLY\"" print "then" From acc1adf5572205c5b3fc9e6983ca8dfb06c94520 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2016 10:48:06 -0700 Subject: [PATCH 42/45] rcutorture: Don't rebuild identical kernel Currently, if the user specifies multiple runs of a given test configuration, the scripting does multiple kernel builds. This wastes both time and disk space, so this commit makes the scripting use the first build for all runs of a given test configuration. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 73a265668421..4109f306d855 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -91,25 +91,33 @@ fi # CONFIG_PCMCIA=n # CONFIG_CARDBUS=n # CONFIG_YENTA=n -if kvm-build.sh $config_template $builddir $T +base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` +if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux then + # Rerunning previous test, so use that test's kernel. + QEMU="`identify_qemu $base_resdir/vmlinux`" + KERNEL=$base_resdir/bzImage + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh +elif kvm-build.sh $config_template $builddir $T +then + # Had to build a kernel for this test. QEMU="`identify_qemu $builddir/vmlinux`" BOOT_IMAGE="`identify_boot_image $QEMU`" cp $builddir/Make*.out $resdir + cp $builddir/vmlinux $resdir cp $builddir/.config $resdir if test -n "$BOOT_IMAGE" then cp $builddir/$BOOT_IMAGE $resdir + KERNEL=$resdir/bzImage else echo No identifiable boot image, not running KVM, see $resdir. echo Do the torture scripts know about your architecture? fi parse-build.sh $resdir/Make.out $title - if test -f $builddir.wait - then - mv $builddir.wait $builddir.ready - fi else + # Build failed. cp $builddir/Make*.out $resdir cp $builddir/.config $resdir || : echo Build failed, not running KVM, see $resdir. @@ -119,6 +127,10 @@ else fi exit 1 fi +if test -f $builddir.wait +then + mv $builddir.wait $builddir.ready +fi while test -f $builddir.ready do sleep 1 @@ -166,8 +178,8 @@ then exit 0 fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +echo $QEMU $qemu_args -m 512 -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd +( $QEMU $qemu_args -m 512 -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 sleep 10 # Give qemu's pid a chance to reach the file if test -s "$resdir/qemu_pid" From e9fb365a8847dfe8a9fccae0dce77abf7276b5da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2016 11:20:48 -0700 Subject: [PATCH 43/45] rcutorture: Dump trace buffer upon shutdown When running from the scripts, rcutorture is completely headless, so there is no way to to manually dump the trace buffer. This commit therefore unconditionally dumps the trace buffer upon timed shutdown. However, if you are using rmmod to end the test, it is still up to you to manually dump the trace buffer. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/torture.c b/kernel/torture.c index e912ccd960f0..fa0bdeee17ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } From 0aa67e75b3d59cfe412bfa54ca23797e6c2e3270 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2016 11:40:44 -0700 Subject: [PATCH 44/45] rcutorture: Add irqs-disabled test for call_rcu() Mutation testing carried out by Iftekhar Ahmed of Oregon State University showed that rcutorture is failing to test invocations of call_rcu() having interrupts disabled. This commit therefore adds interrupt disabling around one of the existing invocations of call_rcu() (and friends). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 633a68a09440..084a28a732eb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1478,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); From a54062c0d95921d4fb0edc8d268021bf387e6c75 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2016 14:16:22 -0700 Subject: [PATCH 45/45] rcutorture: Add boot-time adjustment of leaf fanout Currently, the rcutorture scripts do not test boot-time adjustment of leaf fanout (via the rcutree.rcu_fanout_leaf boot parameter), as was noted during testing carried out by Iftekhar Ahmed of Oregon State University. This commit therefore adjusts TREE04's CONFIG_RCU_FANOUT_LEAF from 4 to 3, and also adds rcutree.rcu_fanout_leaf=4 to its boot parameters. This change forces RCU's boot-time geometry-change code to be exercised. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE04 | 2 +- tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 39a2c6d7d7ec..17cbe098b115 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -14,7 +14,7 @@ CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 -CONFIG_RCU_FANOUT_LEAF=4 +CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 0fc8a3428938..e34c33430447 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4