2009-08-23 04:56:45 +08:00
|
|
|
/*
|
|
|
|
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
|
|
|
|
* Internal non-public definitions.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2008
|
|
|
|
*
|
|
|
|
* Author: Ingo Molnar <mingo@elte.hu>
|
|
|
|
* Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/cache.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/seqlock.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
|
|
|
|
* In theory, it should be possible to add more levels straightforwardly.
|
2010-12-15 08:07:52 +08:00
|
|
|
* In practice, this did work well going from three levels to four.
|
|
|
|
* Of course, your mileage may vary.
|
2009-08-23 04:56:45 +08:00
|
|
|
*/
|
2009-12-03 04:10:14 +08:00
|
|
|
#define MAX_RCU_LVLS 4
|
2010-12-15 08:07:52 +08:00
|
|
|
#if CONFIG_RCU_FANOUT > 16
|
|
|
|
#define RCU_FANOUT_LEAF 16
|
|
|
|
#else /* #if CONFIG_RCU_FANOUT > 16 */
|
|
|
|
#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
|
|
|
|
#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
|
|
|
|
#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
|
|
|
|
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
|
|
|
|
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
|
|
|
|
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
|
2009-08-23 04:56:45 +08:00
|
|
|
|
2010-12-15 08:07:52 +08:00
|
|
|
#if NR_CPUS <= RCU_FANOUT_1
|
2009-08-23 04:56:45 +08:00
|
|
|
# define NUM_RCU_LVLS 1
|
|
|
|
# define NUM_RCU_LVL_0 1
|
|
|
|
# define NUM_RCU_LVL_1 (NR_CPUS)
|
|
|
|
# define NUM_RCU_LVL_2 0
|
|
|
|
# define NUM_RCU_LVL_3 0
|
2009-12-03 04:10:14 +08:00
|
|
|
# define NUM_RCU_LVL_4 0
|
2010-12-15 08:07:52 +08:00
|
|
|
#elif NR_CPUS <= RCU_FANOUT_2
|
2009-08-23 04:56:45 +08:00
|
|
|
# define NUM_RCU_LVLS 2
|
|
|
|
# define NUM_RCU_LVL_0 1
|
2010-12-15 08:07:52 +08:00
|
|
|
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
2009-08-23 04:56:45 +08:00
|
|
|
# define NUM_RCU_LVL_2 (NR_CPUS)
|
|
|
|
# define NUM_RCU_LVL_3 0
|
2009-12-03 04:10:14 +08:00
|
|
|
# define NUM_RCU_LVL_4 0
|
2010-12-15 08:07:52 +08:00
|
|
|
#elif NR_CPUS <= RCU_FANOUT_3
|
2009-08-23 04:56:45 +08:00
|
|
|
# define NUM_RCU_LVLS 3
|
|
|
|
# define NUM_RCU_LVL_0 1
|
2010-12-15 08:07:52 +08:00
|
|
|
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
|
|
|
|
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
|
|
|
# define NUM_RCU_LVL_3 (NR_CPUS)
|
2009-12-03 04:10:14 +08:00
|
|
|
# define NUM_RCU_LVL_4 0
|
2010-12-15 08:07:52 +08:00
|
|
|
#elif NR_CPUS <= RCU_FANOUT_4
|
2009-12-03 04:10:14 +08:00
|
|
|
# define NUM_RCU_LVLS 4
|
|
|
|
# define NUM_RCU_LVL_0 1
|
2010-12-15 08:07:52 +08:00
|
|
|
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
|
|
|
|
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
|
|
|
|
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
|
|
|
# define NUM_RCU_LVL_4 (NR_CPUS)
|
2009-08-23 04:56:45 +08:00
|
|
|
#else
|
|
|
|
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
|
2010-12-15 08:07:52 +08:00
|
|
|
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
2009-12-03 04:10:14 +08:00
|
|
|
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
|
2009-08-23 04:56:45 +08:00
|
|
|
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Dynticks per-CPU state.
|
|
|
|
*/
|
|
|
|
struct rcu_dynticks {
|
rcu: Decrease memory-barrier usage based on semi-formal proof
(Note: this was reverted, and is now being re-applied in pieces, with
this being the fifth and final piece. See below for the reason that
it is now felt to be safe to re-apply this.)
Commit d09b62d fixed grace-period synchronization, but left some smp_mb()
invocations in rcu_process_callbacks() that are no longer needed, but
sheer paranoia prevented them from being removed. This commit removes
them and provides a proof of correctness in their absence. It also adds
a memory barrier to rcu_report_qs_rsp() immediately before the update to
rsp->completed in order to handle the theoretical possibility that the
compiler or CPU might move massive quantities of code into a lock-based
critical section. This also proves that the sheer paranoia was not
entirely unjustified, at least from a theoretical point of view.
In addition, the old dyntick-idle synchronization depended on the fact
that grace periods were many milliseconds in duration, so that it could
be assumed that no dyntick-idle CPU could reorder a memory reference
across an entire grace period. Unfortunately for this design, the
addition of expedited grace periods breaks this assumption, which has
the unfortunate side-effect of requiring atomic operations in the
functions that track dyntick-idle state for RCU. (There is some hope
that the algorithms used in user-level RCU might be applied here, but
some work is required to handle the NMIs that user-space applications
can happily ignore. For the short term, better safe than sorry.)
This proof assumes that neither compiler nor CPU will allow a lock
acquisition and release to be reordered, as doing so can result in
deadlock. The proof is as follows:
1. A given CPU declares a quiescent state under the protection of
its leaf rcu_node's lock.
2. If there is more than one level of rcu_node hierarchy, the
last CPU to declare a quiescent state will also acquire the
->lock of the next rcu_node up in the hierarchy, but only
after releasing the lower level's lock. The acquisition of this
lock clearly cannot occur prior to the acquisition of the leaf
node's lock.
3. Step 2 repeats until we reach the root rcu_node structure.
Please note again that only one lock is held at a time through
this process. The acquisition of the root rcu_node's ->lock
must occur after the release of that of the leaf rcu_node.
4. At this point, we set the ->completed field in the rcu_state
structure in rcu_report_qs_rsp(). However, if the rcu_node
hierarchy contains only one rcu_node, then in theory the code
preceding the quiescent state could leak into the critical
section. We therefore precede the update of ->completed with a
memory barrier. All CPUs will therefore agree that any updates
preceding any report of a quiescent state will have happened
before the update of ->completed.
5. Regardless of whether a new grace period is needed, rcu_start_gp()
will propagate the new value of ->completed to all of the leaf
rcu_node structures, under the protection of each rcu_node's ->lock.
If a new grace period is needed immediately, this propagation
will occur in the same critical section that ->completed was
set in, but courtesy of the memory barrier in #4 above, is still
seen to follow any pre-quiescent-state activity.
6. When a given CPU invokes __rcu_process_gp_end(), it becomes
aware of the end of the old grace period and therefore makes
any RCU callbacks that were waiting on that grace period eligible
for invocation.
If this CPU is the same one that detected the end of the grace
period, and if there is but a single rcu_node in the hierarchy,
we will still be in the single critical section. In this case,
the memory barrier in step #4 guarantees that all callbacks will
be seen to execute after each CPU's quiescent state.
On the other hand, if this is a different CPU, it will acquire
the leaf rcu_node's ->lock, and will again be serialized after
each CPU's quiescent state for the old grace period.
On the strength of this proof, this commit therefore removes the memory
barriers from rcu_process_callbacks() and adds one to rcu_report_qs_rsp().
The effect is to reduce the number of memory barriers by one and to
reduce the frequency of execution from about once per scheduling tick
per CPU to once per grace period.
This was reverted due to hangs found during testing by Yinghai Lu and
Ingo Molnar. Frederic Weisbecker supplied Yinghai with tracing that
located the underlying problem, and Frederic also provided the fix.
The underlying problem was that the HARDIRQ_ENTER() macro from
lib/locking-selftest.c invoked irq_enter(), which in turn invokes
rcu_irq_enter(), but HARDIRQ_EXIT() invoked __irq_exit(), which
does not invoke rcu_irq_exit(). This situation resulted in calls
to rcu_irq_enter() that were not balanced by the required calls to
rcu_irq_exit(). Therefore, after these locking selftests completed,
RCU's dyntick-idle nesting count was a large number (for example,
72), which caused RCU to conclude that the affected CPU was not in
dyntick-idle mode when in fact it was.
RCU would therefore incorrectly wait for this dyntick-idle CPU, resulting
in hangs.
In contrast, with Frederic's patch, which replaces the irq_enter()
in HARDIRQ_ENTER() with an __irq_enter(), these tests don't ever call
either rcu_irq_enter() or rcu_irq_exit(), which works because the CPU
running the test is already marked as not being in dyntick-idle mode.
This means that the rcu_irq_enter() and rcu_irq_exit() calls are balanced, and RCU
then has no problem working out which CPUs are in dyntick-idle mode and
which are not.
The reason that the imbalance was not noticed before the barrier patch
was applied is that the old implementation of rcu_enter_nohz() ignored
the nesting depth. This could still result in delays, but much shorter
ones. Whenever there was a delay, RCU would IPI the CPU with the
unbalanced nesting level, which would eventually result in rcu_enter_nohz()
being called, which in turn would force RCU to see that the CPU was in
dyntick-idle mode.
The reason that very few people noticed the problem is that the mismatched
irq_enter() vs. __irq_exit() occurred only when the kernel was built with
CONFIG_DEBUG_LOCKING_API_SELFTESTS.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2010-09-08 01:38:22 +08:00
|
|
|
int dynticks_nesting; /* Track irq/process nesting level. */
|
|
|
|
int dynticks_nmi_nesting; /* Track NMI nesting level. */
|
|
|
|
atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
|
2009-08-23 04:56:45 +08:00
|
|
|
};
|
|
|
|
|
2011-03-30 08:48:28 +08:00
|
|
|
/* RCU's kthread states for tracing. */
|
|
|
|
#define RCU_KTHREAD_STOPPED 0
|
|
|
|
#define RCU_KTHREAD_RUNNING 1
|
|
|
|
#define RCU_KTHREAD_WAITING 2
|
2011-04-07 07:01:16 +08:00
|
|
|
#define RCU_KTHREAD_OFFCPU 3
|
|
|
|
#define RCU_KTHREAD_YIELDING 4
|
|
|
|
#define RCU_KTHREAD_MAX 4
|
2011-03-30 08:48:28 +08:00
|
|
|
|
2009-08-23 04:56:45 +08:00
|
|
|
/*
|
|
|
|
* Definition for node within the RCU grace-period-detection hierarchy.
|
|
|
|
*/
|
|
|
|
struct rcu_node {
|
2010-02-23 09:05:02 +08:00
|
|
|
raw_spinlock_t lock; /* Root rcu_node's lock protects some */
|
2009-09-24 00:50:42 +08:00
|
|
|
/* rcu_state fields as well as following. */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long gpnum; /* Current grace period for this node. */
|
2009-08-28 06:00:12 +08:00
|
|
|
/* This will either be equal to or one */
|
|
|
|
/* behind the root rcu_node's gpnum. */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long completed; /* Last GP completed for this node. */
|
2009-11-03 05:52:28 +08:00
|
|
|
/* This will either be equal to or one */
|
|
|
|
/* behind the root rcu_node's gpnum. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long qsmask; /* CPUs or groups that need to switch in */
|
|
|
|
/* order for current grace period to proceed.*/
|
2009-09-24 00:50:42 +08:00
|
|
|
/* In leaf rcu_node, each bit corresponds to */
|
|
|
|
/* an rcu_data structure, otherwise, each */
|
|
|
|
/* bit corresponds to a child rcu_node */
|
|
|
|
/* structure. */
|
2010-11-30 13:56:39 +08:00
|
|
|
unsigned long expmask; /* Groups that have ->blkd_tasks */
|
2009-12-03 04:10:15 +08:00
|
|
|
/* elements that need to drain to allow the */
|
|
|
|
/* current expedited grace period to */
|
|
|
|
/* complete (only for TREE_PREEMPT_RCU). */
|
rcu: Avoid acquiring rcu_node locks in timer functions
This commit switches manipulations of the rcu_node ->wakemask field
to atomic operations, which allows rcu_cpu_kthread_timer() to avoid
acquiring the rcu_node lock. This should avoid the following lockdep
splat reported by Valdis Kletnieks:
[ 12.872150] usb 1-4: new high speed USB device number 3 using ehci_hcd
[ 12.986667] usb 1-4: New USB device found, idVendor=413c, idProduct=2513
[ 12.986679] usb 1-4: New USB device strings: Mfr=0, Product=0, SerialNumber=0
[ 12.987691] hub 1-4:1.0: USB hub found
[ 12.987877] hub 1-4:1.0: 3 ports detected
[ 12.996372] input: PS/2 Generic Mouse as /devices/platform/i8042/serio1/input/input10
[ 13.071471] udevadm used greatest stack depth: 3984 bytes left
[ 13.172129]
[ 13.172130] =======================================================
[ 13.172425] [ INFO: possible circular locking dependency detected ]
[ 13.172650] 2.6.39-rc6-mmotm0506 #1
[ 13.172773] -------------------------------------------------------
[ 13.172997] blkid/267 is trying to acquire lock:
[ 13.173009] (&p->pi_lock){-.-.-.}, at: [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa
[ 13.173009]
[ 13.173009] but task is already holding lock:
[ 13.173009] (rcu_node_level_0){..-...}, at: [<ffffffff810901cc>] rcu_cpu_kthread_timer+0x27/0x58
[ 13.173009]
[ 13.173009] which lock already depends on the new lock.
[ 13.173009]
[ 13.173009]
[ 13.173009] the existing dependency chain (in reverse order) is:
[ 13.173009]
[ 13.173009] -> #2 (rcu_node_level_0){..-...}:
[ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[ 13.173009] [<ffffffff815697f1>] _raw_spin_lock+0x36/0x45
[ 13.173009] [<ffffffff81090794>] rcu_read_unlock_special+0x8c/0x1d5
[ 13.173009] [<ffffffff8109092c>] __rcu_read_unlock+0x4f/0xd7
[ 13.173009] [<ffffffff81027bd3>] rcu_read_unlock+0x21/0x23
[ 13.173009] [<ffffffff8102cc34>] cpuacct_charge+0x6c/0x75
[ 13.173009] [<ffffffff81030cc6>] update_curr+0x101/0x12e
[ 13.173009] [<ffffffff810311d0>] check_preempt_wakeup+0xf7/0x23b
[ 13.173009] [<ffffffff8102acb3>] check_preempt_curr+0x2b/0x68
[ 13.173009] [<ffffffff81031d40>] ttwu_do_wakeup+0x76/0x128
[ 13.173009] [<ffffffff81031e49>] ttwu_do_activate.constprop.63+0x57/0x5c
[ 13.173009] [<ffffffff81031e96>] scheduler_ipi+0x48/0x5d
[ 13.173009] [<ffffffff810177d5>] smp_reschedule_interrupt+0x16/0x18
[ 13.173009] [<ffffffff815710f3>] reschedule_interrupt+0x13/0x20
[ 13.173009] [<ffffffff810b66d1>] rcu_read_unlock+0x21/0x23
[ 13.173009] [<ffffffff810b739c>] find_get_page+0xa9/0xb9
[ 13.173009] [<ffffffff810b8b48>] filemap_fault+0x6a/0x34d
[ 13.173009] [<ffffffff810d1a25>] __do_fault+0x54/0x3e6
[ 13.173009] [<ffffffff810d447a>] handle_pte_fault+0x12c/0x1ed
[ 13.173009] [<ffffffff810d48f7>] handle_mm_fault+0x1cd/0x1e0
[ 13.173009] [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de
[ 13.173009] [<ffffffff8156a75f>] page_fault+0x1f/0x30
[ 13.173009]
[ 13.173009] -> #1 (&rq->lock){-.-.-.}:
[ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[ 13.173009] [<ffffffff815697f1>] _raw_spin_lock+0x36/0x45
[ 13.173009] [<ffffffff81027e19>] __task_rq_lock+0x8b/0xd3
[ 13.173009] [<ffffffff81032f7f>] wake_up_new_task+0x41/0x108
[ 13.173009] [<ffffffff810376c3>] do_fork+0x265/0x33f
[ 13.173009] [<ffffffff81007d02>] kernel_thread+0x6b/0x6d
[ 13.173009] [<ffffffff8153a9dd>] rest_init+0x21/0xd2
[ 13.173009] [<ffffffff81b1db4f>] start_kernel+0x3bb/0x3c6
[ 13.173009] [<ffffffff81b1d29f>] x86_64_start_reservations+0xaf/0xb3
[ 13.173009] [<ffffffff81b1d393>] x86_64_start_kernel+0xf0/0xf7
[ 13.173009]
[ 13.173009] -> #0 (&p->pi_lock){-.-.-.}:
[ 13.173009] [<ffffffff81067788>] check_prev_add+0x68/0x20e
[ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[ 13.173009] [<ffffffff815698ea>] _raw_spin_lock_irqsave+0x44/0x57
[ 13.173009] [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa
[ 13.173009] [<ffffffff81032f3c>] wake_up_process+0x10/0x12
[ 13.173009] [<ffffffff810901e9>] rcu_cpu_kthread_timer+0x44/0x58
[ 13.173009] [<ffffffff81045286>] call_timer_fn+0xac/0x1e9
[ 13.173009] [<ffffffff8104556d>] run_timer_softirq+0x1aa/0x1f2
[ 13.173009] [<ffffffff8103e487>] __do_softirq+0x109/0x26a
[ 13.173009] [<ffffffff8157144c>] call_softirq+0x1c/0x30
[ 13.173009] [<ffffffff81003207>] do_softirq+0x44/0xf1
[ 13.173009] [<ffffffff8103e8b9>] irq_exit+0x58/0xc8
[ 13.173009] [<ffffffff81017f5a>] smp_apic_timer_interrupt+0x79/0x87
[ 13.173009] [<ffffffff81570fd3>] apic_timer_interrupt+0x13/0x20
[ 13.173009] [<ffffffff810bd51a>] get_page_from_freelist+0x2aa/0x310
[ 13.173009] [<ffffffff810bdf03>] __alloc_pages_nodemask+0x178/0x243
[ 13.173009] [<ffffffff8101fe2f>] pte_alloc_one+0x1e/0x3a
[ 13.173009] [<ffffffff810d27fe>] __pte_alloc+0x22/0x14b
[ 13.173009] [<ffffffff810d48a8>] handle_mm_fault+0x17e/0x1e0
[ 13.173009] [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de
[ 13.173009] [<ffffffff8156a75f>] page_fault+0x1f/0x30
[ 13.173009]
[ 13.173009] other info that might help us debug this:
[ 13.173009]
[ 13.173009] Chain exists of:
[ 13.173009] &p->pi_lock --> &rq->lock --> rcu_node_level_0
[ 13.173009]
[ 13.173009] Possible unsafe locking scenario:
[ 13.173009]
[ 13.173009] CPU0 CPU1
[ 13.173009] ---- ----
[ 13.173009] lock(rcu_node_level_0);
[ 13.173009] lock(&rq->lock);
[ 13.173009] lock(rcu_node_level_0);
[ 13.173009] lock(&p->pi_lock);
[ 13.173009]
[ 13.173009] *** DEADLOCK ***
[ 13.173009]
[ 13.173009] 3 locks held by blkid/267:
[ 13.173009] #0: (&mm->mmap_sem){++++++}, at: [<ffffffff8156cdb4>] do_page_fault+0x1f3/0x5de
[ 13.173009] #1: (&yield_timer){+.-...}, at: [<ffffffff810451da>] call_timer_fn+0x0/0x1e9
[ 13.173009] #2: (rcu_node_level_0){..-...}, at: [<ffffffff810901cc>] rcu_cpu_kthread_timer+0x27/0x58
[ 13.173009]
[ 13.173009] stack backtrace:
[ 13.173009] Pid: 267, comm: blkid Not tainted 2.6.39-rc6-mmotm0506 #1
[ 13.173009] Call Trace:
[ 13.173009] <IRQ> [<ffffffff8154a529>] print_circular_bug+0xc8/0xd9
[ 13.173009] [<ffffffff81067788>] check_prev_add+0x68/0x20e
[ 13.173009] [<ffffffff8100c861>] ? save_stack_trace+0x28/0x46
[ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[ 13.173009] [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa
[ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[ 13.173009] [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa
[ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[ 13.173009] [<ffffffff815698ea>] _raw_spin_lock_irqsave+0x44/0x57
[ 13.173009] [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa
[ 13.173009] [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa
[ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[ 13.173009] [<ffffffff81032f3c>] wake_up_process+0x10/0x12
[ 13.173009] [<ffffffff810901e9>] rcu_cpu_kthread_timer+0x44/0x58
[ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[ 13.173009] [<ffffffff81045286>] call_timer_fn+0xac/0x1e9
[ 13.173009] [<ffffffff810451da>] ? del_timer+0x75/0x75
[ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[ 13.173009] [<ffffffff8104556d>] run_timer_softirq+0x1aa/0x1f2
[ 13.173009] [<ffffffff8103e487>] __do_softirq+0x109/0x26a
[ 13.173009] [<ffffffff8106365f>] ? tick_dev_program_event+0x37/0xf6
[ 13.173009] [<ffffffff810a0e4a>] ? time_hardirqs_off+0x1b/0x2f
[ 13.173009] [<ffffffff8157144c>] call_softirq+0x1c/0x30
[ 13.173009] [<ffffffff81003207>] do_softirq+0x44/0xf1
[ 13.173009] [<ffffffff8103e8b9>] irq_exit+0x58/0xc8
[ 13.173009] [<ffffffff81017f5a>] smp_apic_timer_interrupt+0x79/0x87
[ 13.173009] [<ffffffff81570fd3>] apic_timer_interrupt+0x13/0x20
[ 13.173009] <EOI> [<ffffffff810bd384>] ? get_page_from_freelist+0x114/0x310
[ 13.173009] [<ffffffff810bd51a>] ? get_page_from_freelist+0x2aa/0x310
[ 13.173009] [<ffffffff812220e7>] ? clear_page_c+0x7/0x10
[ 13.173009] [<ffffffff810bd1ef>] ? prep_new_page+0x14c/0x1cd
[ 13.173009] [<ffffffff810bd51a>] get_page_from_freelist+0x2aa/0x310
[ 13.173009] [<ffffffff810bdf03>] __alloc_pages_nodemask+0x178/0x243
[ 13.173009] [<ffffffff810d46b9>] ? __pmd_alloc+0x87/0x99
[ 13.173009] [<ffffffff8101fe2f>] pte_alloc_one+0x1e/0x3a
[ 13.173009] [<ffffffff810d46b9>] ? __pmd_alloc+0x87/0x99
[ 13.173009] [<ffffffff810d27fe>] __pte_alloc+0x22/0x14b
[ 13.173009] [<ffffffff810d48a8>] handle_mm_fault+0x17e/0x1e0
[ 13.173009] [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de
[ 13.173009] [<ffffffff810d915f>] ? sys_brk+0x32/0x10c
[ 13.173009] [<ffffffff810a0e4a>] ? time_hardirqs_off+0x1b/0x2f
[ 13.173009] [<ffffffff81065c4f>] ? trace_hardirqs_off_caller+0x3f/0x9c
[ 13.173009] [<ffffffff812235dd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[ 13.173009] [<ffffffff8156a75f>] page_fault+0x1f/0x30
[ 14.010075] usb 5-1: new full speed USB device number 2 using uhci_hcd
Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-11 20:41:41 +08:00
|
|
|
atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
|
|
|
|
/* Since this has meaning only for leaf */
|
|
|
|
/* rcu_node structures, 32 bits suffices. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long qsmaskinit;
|
2009-12-03 04:10:15 +08:00
|
|
|
/* Per-GP initial value for qsmask & expmask. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long grpmask; /* Mask to apply to parent qsmask. */
|
2009-09-24 00:50:42 +08:00
|
|
|
/* Only one bit will be set in this mask. */
|
2009-08-23 04:56:45 +08:00
|
|
|
int grplo; /* lowest-numbered CPU or group here. */
|
|
|
|
int grphi; /* highest-numbered CPU or group here. */
|
|
|
|
u8 grpnum; /* CPU/group number for next level up. */
|
|
|
|
u8 level; /* root is at level 0. */
|
|
|
|
struct rcu_node *parent;
|
2010-11-30 13:56:39 +08:00
|
|
|
struct list_head blkd_tasks;
|
|
|
|
/* Tasks blocked in RCU read-side critical */
|
|
|
|
/* section. Tasks are placed at the head */
|
|
|
|
/* of this list and age towards the tail. */
|
|
|
|
struct list_head *gp_tasks;
|
|
|
|
/* Pointer to the first task blocking the */
|
|
|
|
/* current grace period, or NULL if there */
|
|
|
|
/* is no such task. */
|
|
|
|
struct list_head *exp_tasks;
|
|
|
|
/* Pointer to the first task blocking the */
|
|
|
|
/* current expedited grace period, or NULL */
|
|
|
|
/* if there is no such task. If there */
|
|
|
|
/* is no current expedited grace period, */
|
|
|
|
/* then there cannot be any such task. */
|
2011-02-08 04:47:15 +08:00
|
|
|
#ifdef CONFIG_RCU_BOOST
|
|
|
|
struct list_head *boost_tasks;
|
|
|
|
/* Pointer to first task that needs to be */
|
|
|
|
/* priority boosted, or NULL if no priority */
|
|
|
|
/* boosting is needed for this rcu_node */
|
|
|
|
/* structure. If there are no tasks */
|
|
|
|
/* queued on this rcu_node structure that */
|
|
|
|
/* are blocking the current grace period, */
|
|
|
|
/* there can be no such task. */
|
|
|
|
unsigned long boost_time;
|
|
|
|
/* When to start boosting (jiffies). */
|
|
|
|
struct task_struct *boost_kthread_task;
|
|
|
|
/* kthread that takes care of priority */
|
|
|
|
/* boosting for this rcu_node structure. */
|
2011-03-30 08:48:28 +08:00
|
|
|
unsigned int boost_kthread_status;
|
|
|
|
/* State of boost_kthread_task for tracing. */
|
2011-02-23 05:42:43 +08:00
|
|
|
unsigned long n_tasks_boosted;
|
|
|
|
/* Total number of tasks boosted. */
|
|
|
|
unsigned long n_exp_boosts;
|
|
|
|
/* Number of tasks boosted for expedited GP. */
|
|
|
|
unsigned long n_normal_boosts;
|
|
|
|
/* Number of tasks boosted for normal GP. */
|
|
|
|
unsigned long n_balk_blkd_tasks;
|
|
|
|
/* Refused to boost: no blocked tasks. */
|
|
|
|
unsigned long n_balk_exp_gp_tasks;
|
|
|
|
/* Refused to boost: nothing blocking GP. */
|
|
|
|
unsigned long n_balk_boost_tasks;
|
|
|
|
/* Refused to boost: already boosting. */
|
|
|
|
unsigned long n_balk_notblocked;
|
|
|
|
/* Refused to boost: RCU RS CS still running. */
|
|
|
|
unsigned long n_balk_notyet;
|
|
|
|
/* Refused to boost: not yet time. */
|
|
|
|
unsigned long n_balk_nos;
|
|
|
|
/* Refused to boost: not sure why, though. */
|
|
|
|
/* This can happen due to race conditions. */
|
2011-02-08 04:47:15 +08:00
|
|
|
#endif /* #ifdef CONFIG_RCU_BOOST */
|
2011-01-13 06:10:23 +08:00
|
|
|
struct task_struct *node_kthread_task;
|
|
|
|
/* kthread that takes care of this rcu_node */
|
|
|
|
/* structure, for example, awakening the */
|
|
|
|
/* per-CPU kthreads as needed. */
|
2011-03-30 08:48:28 +08:00
|
|
|
unsigned int node_kthread_status;
|
|
|
|
/* State of node_kthread_task for tracing. */
|
2009-08-23 04:56:45 +08:00
|
|
|
} ____cacheline_internodealigned_in_smp;
|
|
|
|
|
2009-09-28 22:46:33 +08:00
|
|
|
/*
 * Do a full breadth-first scan of the rcu_node structures for the
 * specified rcu_state structure.
 *
 * @rsp: pointer to the rcu_state structure whose ->node[] array is walked.
 * @rnp: rcu_node pointer used as the loop cursor; it is advanced from
 *	 ->node[0] up to, but not including, ->node[NUM_RCU_NODES].
 */
#define rcu_for_each_node_breadth_first(rsp, rnp) \
	for ((rnp) = &(rsp)->node[0]; \
	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
|
|
|
|
|
2009-12-03 04:10:15 +08:00
|
|
|
/*
 * Do a breadth-first scan of the non-leaf rcu_node structures for the
 * specified rcu_state structure.  Note that if there is a singleton
 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
 *
 * @rsp: pointer to the rcu_state structure to walk.
 * @rnp: rcu_node pointer used as the loop cursor; it is advanced from
 *	 ->node[0] up to, but not including, ->level[NUM_RCU_LVLS - 1].
 */
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
	for ((rnp) = &(rsp)->node[0]; \
	     (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
|
|
|
|
* structure. Note that if there is a singleton rcu_node tree with but
|
|
|
|
* one rcu_node structure, this loop -will- visit the rcu_node structure.
|
|
|
|
* It is still a leaf node, even if it is also the root node.
|
|
|
|
*/
|
2009-09-28 22:46:33 +08:00
|
|
|
/*
 * @rsp: pointer to the structure whose ->level[] and ->node[] arrays
 *	 bound the walk.
 * @rnp: loop cursor; it is advanced from ->level[NUM_RCU_LVLS - 1] up to,
 *	 but not including, ->node[NUM_RCU_NODES].
 */
#define rcu_for_each_leaf_node(rsp, rnp) \
	for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
|
|
|
|
|
2009-08-23 04:56:45 +08:00
|
|
|
/*
 * Index values for the nxttail[] array in struct rcu_data.  Each tail
 * index other than the last also serves as the head of the list segment
 * that follows it.
 */
#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
#define RCU_NEXT_TAIL		3
#define RCU_NEXT_SIZE		4	/* Number of nxttail[] entries. */
|
|
|
|
|
|
|
|
/* Per-CPU data for read-copy update. */
|
|
|
|
struct rcu_data {
|
|
|
|
/* 1) quiescent-state and grace-period handling : */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long completed; /* Track rsp->completed gp number */
|
2009-08-23 04:56:45 +08:00
|
|
|
/* in order to detect GP end. */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long gpnum; /* Highest gp number that this CPU */
|
2009-08-23 04:56:45 +08:00
|
|
|
/* is aware of having started. */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long passed_quiesc_completed;
|
2009-08-23 04:56:45 +08:00
|
|
|
/* Value of completed at time of qs. */
|
|
|
|
bool passed_quiesc; /* User-mode/idle loop etc. */
|
|
|
|
bool qs_pending; /* Core waits for quiesc state. */
|
|
|
|
bool beenonline; /* CPU online at least once. */
|
2011-03-03 05:15:15 +08:00
|
|
|
bool preemptible; /* Preemptible RCU? */
|
2009-08-23 04:56:45 +08:00
|
|
|
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
|
|
|
|
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
|
|
|
|
|
|
|
|
/* 2) batch handling */
|
|
|
|
/*
|
|
|
|
* If nxtlist is not NULL, it is partitioned as follows.
|
|
|
|
* Any of the partitions might be empty, in which case the
|
|
|
|
* pointer to that partition will be equal to the pointer for
|
|
|
|
* the following partition. When the list is empty, all of
|
2009-09-24 00:50:42 +08:00
|
|
|
* the nxttail elements point to the ->nxtlist pointer itself,
|
|
|
|
* which in that case is NULL.
|
2009-08-23 04:56:45 +08:00
|
|
|
*
|
|
|
|
* [nxtlist, *nxttail[RCU_DONE_TAIL]):
|
|
|
|
* Entries that batch # <= ->completed
|
|
|
|
* The grace period for these entries has completed, and
|
|
|
|
* the other grace-period-completed entries may be moved
|
|
|
|
* here temporarily in rcu_process_callbacks().
|
2009-09-24 00:50:42 +08:00
|
|
|
* [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
|
|
|
|
* Entries that batch # <= ->completed - 1: waiting for current GP
|
|
|
|
* [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
|
|
|
|
* Entries known to have arrived before current GP ended
|
|
|
|
* [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
|
|
|
|
* Entries that might have arrived after current GP ended
|
|
|
|
* Note that the value of *nxttail[RCU_NEXT_TAIL] will
|
|
|
|
* always be NULL, as this is the end of the list.
|
2009-08-23 04:56:45 +08:00
|
|
|
*/
|
|
|
|
struct rcu_head *nxtlist;
|
|
|
|
struct rcu_head **nxttail[RCU_NEXT_SIZE];
|
2009-09-19 01:28:19 +08:00
|
|
|
long qlen; /* # of queued callbacks */
|
2009-10-15 01:15:55 +08:00
|
|
|
long qlen_last_fqs_check;
|
|
|
|
/* qlen at last check for QS forcing */
|
2010-09-08 05:23:09 +08:00
|
|
|
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
|
2010-10-20 14:13:06 +08:00
|
|
|
unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
|
|
|
|
unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
|
2009-10-15 01:15:55 +08:00
|
|
|
unsigned long n_force_qs_snap;
|
|
|
|
/* did other CPU force QS recently? */
|
2009-08-23 04:56:45 +08:00
|
|
|
long blimit; /* Upper limit on a processed batch */
|
|
|
|
|
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
/* 3) dynticks interface. */
|
|
|
|
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
|
|
|
|
int dynticks_snap; /* Per-GP tracking for dynticks. */
|
|
|
|
#endif /* #ifdef CONFIG_NO_HZ */
|
|
|
|
|
|
|
|
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
|
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
|
|
|
|
#endif /* #ifdef CONFIG_NO_HZ */
|
|
|
|
unsigned long offline_fqs; /* Kicked due to being offline. */
|
|
|
|
unsigned long resched_ipi; /* Sent a resched IPI. */
|
|
|
|
|
|
|
|
/* 5) __rcu_pending() statistics. */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
|
|
|
|
unsigned long n_rp_qs_pending;
|
2010-04-15 08:39:26 +08:00
|
|
|
unsigned long n_rp_report_qs;
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long n_rp_cb_ready;
|
|
|
|
unsigned long n_rp_cpu_needs_gp;
|
|
|
|
unsigned long n_rp_gp_completed;
|
|
|
|
unsigned long n_rp_gp_started;
|
|
|
|
unsigned long n_rp_need_fqs;
|
|
|
|
unsigned long n_rp_need_nothing;
|
2009-08-23 04:56:45 +08:00
|
|
|
|
|
|
|
int cpu;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Values for signaled field in struct rcu_state. */
|
rcu: Fix long-grace-period race between forcing and initialization
Very long RCU read-side critical sections (50 milliseconds or
so) can cause a race between force_quiescent_state() and
rcu_start_gp() as follows on kernel builds with multi-level
rcu_node hierarchies:
1. CPU 0 calls force_quiescent_state(), sees that there is a
grace period in progress, and acquires ->fqslock.
2. CPU 1 detects the end of the grace period, and so
cpu_quiet_msk_finish() sets rsp->completed to rsp->gpnum.
This operation is carried out under the root rnp->lock,
but CPU 0 has not yet acquired that lock. Note that
rsp->signaled is still RCU_SAVE_DYNTICK from the last
grace period.
3. CPU 1 calls rcu_start_gp(), but no one wants a new grace
period, so it drops the root rnp->lock and returns.
4. CPU 0 acquires the root rnp->lock and picks up rsp->completed
and rsp->signaled, then drops rnp->lock. It then enters the
RCU_SAVE_DYNTICK leg of the switch statement.
5. CPU 2 invokes call_rcu(), and now needs a new grace period.
It calls rcu_start_gp(), which acquires the root rnp->lock, sets
rsp->signaled to RCU_GP_INIT (too bad that CPU 0 is already in
the RCU_SAVE_DYNTICK leg of the switch statement!) and starts
initializing the rcu_node hierarchy. If there are multiple
levels to the hierarchy, it will drop the root rnp->lock and
initialize the lower levels of the hierarchy.
6. CPU 0 notes that rsp->completed has not changed, which permits
both CPU 2 and CPU 0 to try updating it concurrently. If CPU 0's
update prevails, later calls to force_quiescent_state() can
count old quiescent states against the new grace period, which
can in turn result in premature ending of grace periods.
Not good.
This patch adds an RCU_GP_IDLE state for rsp->signaled that is
set initially at boot time and any time a grace period ends.
This prevents CPU 0 from getting into the workings of
force_quiescent_state() in step 4. Additional locking and
checks prevent the concurrent update of rsp->signaled in step 6.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1256742889199-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-28 23:14:49 +08:00
|
|
|
/* States stored in rsp->signaled, in ascending numeric order. */
#define RCU_GP_IDLE		0	/* No grace period in progress. */
#define RCU_GP_INIT		1	/* Grace period being initialized. */
#define RCU_SAVE_DYNTICK	2	/* Need to scan dyntick state. */
#define RCU_FORCE_QS		3	/* Need to force quiescent state. */

/*
 * RCU_SIGNAL_INIT aliases one of the states above: the dyntick-scan
 * state when CONFIG_NO_HZ is set, otherwise the force-QS state.
 * NOTE(review): presumably the state assigned once grace-period
 * initialization is done -- confirm against the users in rcutree.c.
 */
#ifdef CONFIG_NO_HZ
#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
#else /* #ifdef CONFIG_NO_HZ */
#define RCU_SIGNAL_INIT		RCU_FORCE_QS
#endif /* #else #ifdef CONFIG_NO_HZ */
|
|
|
|
|
|
|
|
#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */

/* Extra delay added to the stall-check timeout on CONFIG_PROVE_RCU builds. */
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA	       (5 * HZ)
#else
#define RCU_STALL_DELAY_DELTA	       0
#endif

/*
 * Both timeouts below are multiples of HZ plus an offset, so the values
 * are tick counts despite the "SECONDS" in the names (presumably
 * CONFIG_RCU_CPU_STALL_TIMEOUT is given in seconds -- confirm in Kconfig).
 */
#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
					RCU_STALL_DELAY_DELTA)
						/* for rsp->jiffies_stall */
#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
						/* for rsp->jiffies_stall */
#define RCU_STALL_RAT_DELAY		2	/* Allow other CPUs time */
						/*  to take at least one */
						/*  scheduling clock irq */
						/*  before ratting on them. */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
2011-05-21 07:06:29 +08:00
|
|
|
/*
 * rcu_wait - block the current task until @cond evaluates true.
 * @cond: expression re-evaluated on every pass through the loop, so it
 *	  must be safe to evaluate more than once.
 *
 * The task state is set to TASK_INTERRUPTIBLE before each test of @cond
 * and schedule() is called only if the test fails.  Once @cond holds,
 * the state is put back to TASK_RUNNING.
 * NOTE(review): ordering of the state change against the test of @cond
 * depends on set_current_state(), which is defined elsewhere.
 */
#define rcu_wait(cond)							\
do {									\
	for (;;) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (cond)						\
			break;						\
		schedule();						\
	}								\
	__set_current_state(TASK_RUNNING);				\
} while (0)
|
2009-08-23 04:56:45 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RCU global state, including node hierarchy. This hierarchy is
|
|
|
|
* represented in "heap" form in a dense array. The root (first level)
|
|
|
|
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
|
|
|
|
* level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
|
|
|
|
* and the third level in ->node[m+1] and following (->node[m+1] referenced
|
|
|
|
* by ->level[2]). The number of levels is determined by the number of
|
|
|
|
* CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
|
|
|
|
* consisting of a single rcu_node.
|
|
|
|
*/
|
|
|
|
struct rcu_state {
|
|
|
|
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
|
|
|
|
struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
|
|
|
|
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
|
|
|
|
u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
|
2010-06-28 16:25:04 +08:00
|
|
|
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
|
|
|
/* The following fields are guarded by the root rcu_node's lock. */
|
|
|
|
|
|
|
|
u8 signaled ____cacheline_internodealigned_in_smp;
|
|
|
|
/* Force QS state. */
|
2010-01-05 07:09:02 +08:00
|
|
|
u8 fqs_active; /* force_quiescent_state() */
|
|
|
|
/* is running. */
|
2010-01-05 07:09:09 +08:00
|
|
|
u8 fqs_need_gp; /* A CPU was prevented from */
|
|
|
|
/* starting a new grace */
|
|
|
|
/* period because */
|
|
|
|
/* force_quiescent_state() */
|
|
|
|
/* was running. */
|
2010-02-23 09:05:01 +08:00
|
|
|
unsigned long gpnum; /* Current gp number. */
|
|
|
|
unsigned long completed; /* # of last completed gp. */
|
2009-09-24 00:50:42 +08:00
|
|
|
|
2009-12-03 04:10:15 +08:00
|
|
|
/* End of fields guarded by root rcu_node's lock. */
|
2009-09-24 00:50:42 +08:00
|
|
|
|
2010-02-23 09:05:02 +08:00
|
|
|
raw_spinlock_t onofflock; /* exclude on/offline and */
|
2010-10-20 14:13:06 +08:00
|
|
|
/* starting new GP. */
|
2010-02-23 09:05:02 +08:00
|
|
|
raw_spinlock_t fqslock; /* Only one task forcing */
|
2009-08-23 04:56:45 +08:00
|
|
|
/* quiescent states. */
|
|
|
|
unsigned long jiffies_force_qs; /* Time at which to invoke */
|
|
|
|
/* force_quiescent_state(). */
|
|
|
|
unsigned long n_force_qs; /* Number of calls to */
|
|
|
|
/* force_quiescent_state(). */
|
|
|
|
unsigned long n_force_qs_lh; /* ~Number of calls leaving */
|
|
|
|
/* due to lock unavailable. */
|
|
|
|
unsigned long n_force_qs_ngp; /* Number of calls leaving */
|
|
|
|
/* due to no GP active. */
|
|
|
|
unsigned long gp_start; /* Time at which GP started, */
|
|
|
|
/* but in jiffies. */
|
|
|
|
unsigned long jiffies_stall; /* Time at which to check */
|
|
|
|
/* for CPU stalls. */
|
2011-04-07 07:01:16 +08:00
|
|
|
unsigned long gp_max; /* Maximum GP duration in */
|
|
|
|
/* jiffies. */
|
2010-04-14 07:18:22 +08:00
|
|
|
char *name; /* Name of structure. */
|
2009-08-23 04:56:45 +08:00
|
|
|
};
|
|
|
|
|
2009-12-03 04:10:15 +08:00
|
|
|
/*
 * Return values for rcu_preempt_offline_tasks().  The two values occupy
 * distinct bits (presumably so both conditions can be reported in one
 * return value -- confirm against the callers).
 */

#define RCU_OFL_TASKS_NORM_GP	0x1		/* Tasks blocking normal */
						/*  GP were moved to root. */
#define RCU_OFL_TASKS_EXP_GP	0x2		/* Tasks blocking expedited */
						/*  GP were moved to root. */
|
|
|
|
|
2009-03-25 23:42:24 +08:00
|
|
|
/*
|
|
|
|
* RCU implementation internal declarations:
|
|
|
|
*/
|
2009-08-23 04:56:46 +08:00
|
|
|
extern struct rcu_state rcu_sched_state;
|
|
|
|
DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
|
2009-03-25 23:42:24 +08:00
|
|
|
|
|
|
|
extern struct rcu_state rcu_bh_state;
|
|
|
|
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
|
|
|
|
|
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
|
|
|
/* Preemptible-RCU state, declared only for CONFIG_TREE_PREEMPT_RCU builds. */
#ifdef CONFIG_TREE_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
|
|
|
|
|
2010-01-15 08:10:58 +08:00
|
|
|
#ifndef RCU_TREE_NONCORE
|
2009-08-23 04:56:45 +08:00
|
|
|
|
2009-09-24 00:50:43 +08:00
|
|
|
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
|
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure pass through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs go offline. (The events in step
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->blocked_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_unlock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-23 00:53:48 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
2009-12-03 04:10:13 +08:00
|
|
|
/* Forward declaration; the definition is not in this header. */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
				      unsigned long flags);
|
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure pass through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs go offline. (The events in step
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->blocked_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_unlock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-23 00:53:48 +08:00
|
|
|
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
|
2010-02-23 09:05:05 +08:00
|
|
|
/* Forward declarations: stall reporting and blocked-task checking. */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_stall_reset(void);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang
If the following sequence of events occurs, then
TREE_PREEMPT_RCU will hang waiting for a grace period to
complete, eventually OOMing the system:
o A TREE_PREEMPT_RCU build of the kernel is booted on a system
with more than 64 physical CPUs present (32 on a 32-bit system).
Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted
with RCU_FANOUT set to a sufficiently small value that the
physical CPUs populate two or more leaf rcu_node structures.
o A task is preempted in an RCU read-side critical section
while running on a CPU corresponding to a given leaf rcu_node
structure.
o All CPUs corresponding to this same leaf rcu_node structure
record quiescent states for the current grace period.
o All of these same CPUs go offline (hence the need for enough
physical CPUs to populate more than one leaf rcu_node structure).
This causes the preempted task to be moved to the root rcu_node
structure.
At this point, there is nothing left to cause the quiescent
state to be propagated up the rcu_node tree, so the current
grace period never completes.
The simplest fix, especially after considering the deadlock
possibilities, is to detect this situation when the last CPU is
offlined, and to set that CPU's ->qsmask bit in its leaf
rcu_node structure. This will cause the next invocation of
force_quiescent_state() to end the grace period.
Without this fix, this hang can be triggered in an hour or so on
some machines with rcutorture and random CPU onlining/offlining.
With this fix, these same machines pass a full 10 hours of this
sort of abuse.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-16 00:26:14 +08:00
|
|
|
/*
 * Forward declarations for the CPU-offline path.  The int returned by
 * rcu_preempt_offline_tasks() uses the RCU_OFL_TASKS_* values.
 */
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
				     struct rcu_node *rnp,
				     struct rcu_data *rdp);
static void rcu_preempt_offline_cpu(int cpu);
|
|
|
|
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
|
|
|
|
/* Forward declarations: callback checking and processing. */
static void rcu_preempt_check_callbacks(int cpu);
static void rcu_preempt_process_callbacks(void);
|
rcu: Use softirq to address performance regression
Commit a26ac2455ffcf3(rcu: move TREE_RCU from softirq to kthread)
introduced performance regression. In an AIM7 test, this commit degraded
performance by about 40%.
The commit runs rcu callbacks in a kthread instead of softirq. We observed
high rate of context switch which is caused by this. Our test system has
64 CPUs and HZ is 1000, so we saw more than 64k context switch per second
which is caused by RCU's per-CPU kthread. A trace showed that most of
the time the RCU per-CPU kthread doesn't actually handle any callbacks,
but instead just does a very small amount of work handling grace periods.
This means that RCU's per-CPU kthreads are making the scheduler do quite
a bit of work in order to allow a very small amount of RCU-related
processing to be done.
Alex Shi's analysis determined that this slowdown is due to lock
contention within the scheduler. Unfortunately, as Peter Zijlstra points
out, the scheduler's real-time semantics require global action, which
means that this contention is inherent in real-time scheduling. (Yes,
perhaps someone will come up with a workaround -- otherwise, -rt is not
going to do well on large SMP systems -- but this patch will work around
this issue in the meantime. And "the meantime" might well be forever.)
This patch therefore re-introduces softirq processing to RCU, but only
for core RCU work. RCU callbacks are still executed in kthread context,
so that only a small amount of RCU work runs in softirq context in the
common case. This should minimize ksoftirqd execution, allowing us to
skip boosting of ksoftirqd for CONFIG_RCU_BOOST=y kernels.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Tested-by: "Alex,Shi" <alex.shi@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-06-14 13:26:25 +08:00
|
|
|
static void rcu_preempt_do_callbacks(void);
|
2009-09-24 00:50:43 +08:00
|
|
|
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
|
2009-12-03 04:10:15 +08:00
|
|
|
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
|
|
|
|
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
|
|
|
|
#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
|
2009-09-24 00:50:43 +08:00
|
|
|
static int rcu_preempt_pending(int cpu);
|
|
|
|
static int rcu_preempt_needs_cpu(int cpu);
|
|
|
|
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
|
2010-10-20 14:13:06 +08:00
|
|
|
static void rcu_preempt_send_cbs_to_online(void);
|
2009-09-24 00:50:43 +08:00
|
|
|
static void __init __rcu_init_preempt(void);
|
2010-02-27 08:38:56 +08:00
|
|
|
static void rcu_needs_cpu_flush(void);
|
2011-05-05 12:43:49 +08:00
|
|
|
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
|
2011-02-08 04:47:15 +08:00
|
|
|
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
|
|
|
|
cpumask_var_t cm);
|
|
|
|
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
|
|
|
|
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
|
|
|
|
struct rcu_node *rnp,
|
|
|
|
int rnp_index);
|
2009-09-24 00:50:43 +08:00
|
|
|
|
2010-01-15 08:10:58 +08:00
|
|
|
#endif /* #ifndef RCU_TREE_NONCORE */
|