From c06aed0e31008a248c1841f1b7fc80e9ee242a31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Jul 2018 11:25:23 -0700 Subject: [PATCH] rcu: Compute jiffies_till_sched_qs from other kernel parameters The jiffies_till_sched_qs value used to determine how old a grace period must be before RCU enlists the help of the scheduler to force a quiescent state on the holdout CPU. Currently, this defaults to HZ/10 regardless of system size and may be set only at boot time. This can be a problem for very large systems, because if the values of the jiffies_till_first_fqs and jiffies_till_next_fqs kernel parameters are left at their defaults, they are calculated to increase as the number of CPUs actually configured on the system increases. Thus, on a sufficiently large system, RCU would enlist the help of the scheduler before the grace-period kthread had a chance to scan for idle CPUs, which wastes CPU time. This commit therefore allows jiffies_till_sched_qs to be set, if desired, but if left as default, computes is as jiffies_till_first_fqs plus twice jiffies_till_next_fqs, thus allowing three force-quiescent-state scans for idle CPUs. This scales with the number of CPUs, providing sensible default values. Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 9 ++- kernel/rcu/tree.c | 63 ++++++++++++++----- kernel/rcu/tree_plugin.h | 2 + 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index aa96e669bcb8..6153fb62abe1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3595,7 +3595,14 @@ Set required age in jiffies for a given grace period before RCU starts soliciting quiescent-state help from - rcu_note_context_switch(). + rcu_note_context_switch(). If not specified, the + kernel will calculate a value based on the most + recent settings of rcutree.jiffies_till_first_fqs + and rcutree.jiffies_till_next_fqs. + This calculated value may be viewed in + rcutree.jiffies_to_sched_qs. Any attempt to + set rcutree.jiffies_to_sched_qs will be + cheerfully overwritten. rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bc42c600027c..6bd0951a5f3a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -396,13 +396,47 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; +/* + * How long the grace period must be before we start recruiting + * quiescent-state help from rcu_note_context_switch(). + */ +static ulong jiffies_till_sched_qs = ULONG_MAX; +module_param(jiffies_till_sched_qs, ulong, 0444); +static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ + +/* + * Make sure that we give the grace-period kthread time to detect any + * idle CPUs before taking active measures to force quiescent states. + * However, don't go below 100 milliseconds, adjusted upwards for really + * large systems. + */ +static void adjust_jiffies_till_sched_qs(void) +{ + unsigned long j; + + /* If jiffies_till_sched_qs was specified, respect the request. */ + if (jiffies_till_sched_qs != ULONG_MAX) { + WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); + return; + } + j = READ_ONCE(jiffies_till_first_fqs) + + 2 * READ_ONCE(jiffies_till_next_fqs); + if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) + j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j); + WRITE_ONCE(jiffies_to_sched_qs, j); +} + static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) { ulong j; int ret = kstrtoul(val, 0, &j); - if (!ret) + if (!ret) { WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); + adjust_jiffies_till_sched_qs(); + } return ret; } @@ -411,8 +445,10 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param ulong j; int ret = kstrtoul(val, 0, &j); - if (!ret) + if (!ret) { WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + adjust_jiffies_till_sched_qs(); + } return ret; } @@ -430,13 +466,6 @@ module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_fi module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); -/* - * How long the grace period must be before we start recruiting - * quiescent-state help from rcu_note_context_switch(). - */ -static ulong jiffies_till_sched_qs = HZ / 10; -module_param(jiffies_till_sched_qs, ulong, 0444); - static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); static void force_quiescent_state(void); static int rcu_pending(void); @@ -1041,16 +1070,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) /* * A CPU running for an extended time within the kernel can - * delay RCU grace periods: (1) At age jiffies_till_sched_qs, - * set .rcu_urgent_qs, (2) At age 2*jiffies_till_sched_qs, set + * delay RCU grace periods: (1) At age jiffies_to_sched_qs, + * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the * unsynchronized assignments to the per-CPU rcu_need_heavy_qs * variable are safe because the assignments are repeated if this * CPU failed to pass through a quiescent state. This code - * also checks .jiffies_resched in case jiffies_till_sched_qs + * also checks .jiffies_resched in case jiffies_to_sched_qs * is set way high. */ - jtsq = jiffies_till_sched_qs; + jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && @@ -1236,7 +1265,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) gpa = READ_ONCE(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - jiffies_till_next_fqs, + READ_ONCE(jiffies_till_next_fqs), rcu_get_root()->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); @@ -1874,7 +1903,7 @@ static void rcu_gp_fqs_loop(void) struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; - j = jiffies_till_first_fqs; + j = READ_ONCE(jiffies_till_first_fqs); ret = 0; for (;;) { if (!ret) { @@ -1908,7 +1937,7 @@ static void rcu_gp_fqs_loop(void) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ - j = jiffies_till_next_fqs; + j = READ_ONCE(jiffies_till_next_fqs); } else { /* Deal with stray signal. */ cond_resched_tasks_rcu_qs(); @@ -3579,6 +3608,8 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; + if (jiffies_till_sched_qs == ULONG_MAX) + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 726d57708849..7ec366268e2e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -105,6 +105,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (jiffies_till_sched_qs != ULONG_MAX) + pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs); if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))