linux/kernel/rcutree.h

475 lines
19 KiB
C
Raw Normal View History

/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2008
*
* Author: Ingo Molnar <mingo@elte.hu>
* Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
/*
* Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
* In theory, it should be possible to add more levels straightforwardly.
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
#if CONFIG_RCU_FANOUT > 16
#define RCU_FANOUT_LEAF 16
#else /* #if CONFIG_RCU_FANOUT > 16 */
#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
# define NUM_RCU_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_2
# define NUM_RCU_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_3
# define NUM_RCU_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_4
# define NUM_RCU_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_4 (NR_CPUS)
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
/*
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
rcu: Track idleness independent of idle tasks Earlier versions of RCU used the scheduling-clock tick to detect idleness by checking for the idle task, but handled idleness differently for CONFIG_NO_HZ=y. But there are now a number of uses of RCU read-side critical sections in the idle task, for example, for tracing. A more fine-grained detection of idleness is therefore required. This commit presses the old dyntick-idle code into full-time service, so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is always invoked at the beginning of an idle loop iteration. Similarly, rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked at the end of an idle-loop iteration. This allows the idle task to use RCU everywhere except between consecutive rcu_idle_enter() and rcu_idle_exit() calls, in turn allowing architecture maintainers to specify exactly where in the idle loop that RCU may be used. Because some of the userspace upcall uses can result in what looks to RCU like half of an interrupt, it is not possible to expect that the irq_enter() and irq_exit() hooks will give exact counts. This patch therefore expands the ->dynticks_nesting counter to 64 bits and uses two separate bitfields to count process/idle transitions and interrupt entry/exit transitions. It is presumed that userspace upcalls do not happen in the idle loop or from usermode execution (though usermode might do a system call that results in an upcall). The counter is hard-reset on each process/idle transition, which avoids the interrupt entry/exit error from accumulating. Overflow is avoided by the 64-bitness of the ->dyntick_nesting counter. This commit also adds warnings if a non-idle task asks RCU to enter idle state (and these checks will need some adjustment before applying Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246). In addition, validation of ->dynticks and ->dynticks_nesting is added. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-01 03:10:22 +08:00
long long dynticks_nesting; /* Track irq/process nesting level. */
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
};
/* RCU's kthread states for tracing. */
#define RCU_KTHREAD_STOPPED 0
#define RCU_KTHREAD_RUNNING 1
#define RCU_KTHREAD_WAITING 2
#define RCU_KTHREAD_OFFCPU 3
#define RCU_KTHREAD_YIELDING 4
#define RCU_KTHREAD_MAX 4
/*
* Definition for node within the RCU grace-period-detection hierarchy.
*/
struct rcu_node {
raw_spinlock_t lock; /* Root rcu_node's lock protects some */
/* rcu_state fields as well as following. */
unsigned long gpnum; /* Current grace period for this node. */
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
unsigned long completed; /* Last GP completed for this node. */
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
/* order for current grace period to proceed.*/
/* In leaf rcu_node, each bit corresponds to */
/* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */
/* structure. */
unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
rcu: Avoid acquiring rcu_node locks in timer functions This commit switches manipulations of the rcu_node ->wakemask field to atomic operations, which allows rcu_cpu_kthread_timer() to avoid acquiring the rcu_node lock. This should avoid the following lockdep splat reported by Valdis Kletnieks: [ 12.872150] usb 1-4: new high speed USB device number 3 using ehci_hcd [ 12.986667] usb 1-4: New USB device found, idVendor=413c, idProduct=2513 [ 12.986679] usb 1-4: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 12.987691] hub 1-4:1.0: USB hub found [ 12.987877] hub 1-4:1.0: 3 ports detected [ 12.996372] input: PS/2 Generic Mouse as /devices/platform/i8042/serio1/input/input10 [ 13.071471] udevadm used greatest stack depth: 3984 bytes left [ 13.172129] [ 13.172130] ======================================================= [ 13.172425] [ INFO: possible circular locking dependency detected ] [ 13.172650] 2.6.39-rc6-mmotm0506 #1 [ 13.172773] ------------------------------------------------------- [ 13.172997] blkid/267 is trying to acquire lock: [ 13.173009] (&p->pi_lock){-.-.-.}, at: [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa [ 13.173009] [ 13.173009] but task is already holding lock: [ 13.173009] (rcu_node_level_0){..-...}, at: [<ffffffff810901cc>] rcu_cpu_kthread_timer+0x27/0x58 [ 13.173009] [ 13.173009] which lock already depends on the new lock. [ 13.173009] [ 13.173009] [ 13.173009] the existing dependency chain (in reverse order) is: [ 13.173009] [ 13.173009] -> #2 (rcu_node_level_0){..-...}: [ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104 [ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab [ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2 [ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c [ 13.173009] [<ffffffff815697f1>] _raw_spin_lock+0x36/0x45 [ 13.173009] [<ffffffff81090794>] rcu_read_unlock_special+0x8c/0x1d5 [ 13.173009] [<ffffffff8109092c>] __rcu_read_unlock+0x4f/0xd7 [ 13.173009] [<ffffffff81027bd3>] rcu_read_unlock+0x21/0x23 [ 13.173009] [<ffffffff8102cc34>] cpuacct_charge+0x6c/0x75 [ 13.173009] [<ffffffff81030cc6>] update_curr+0x101/0x12e [ 13.173009] [<ffffffff810311d0>] check_preempt_wakeup+0xf7/0x23b [ 13.173009] [<ffffffff8102acb3>] check_preempt_curr+0x2b/0x68 [ 13.173009] [<ffffffff81031d40>] ttwu_do_wakeup+0x76/0x128 [ 13.173009] [<ffffffff81031e49>] ttwu_do_activate.constprop.63+0x57/0x5c [ 13.173009] [<ffffffff81031e96>] scheduler_ipi+0x48/0x5d [ 13.173009] [<ffffffff810177d5>] smp_reschedule_interrupt+0x16/0x18 [ 13.173009] [<ffffffff815710f3>] reschedule_interrupt+0x13/0x20 [ 13.173009] [<ffffffff810b66d1>] rcu_read_unlock+0x21/0x23 [ 13.173009] [<ffffffff810b739c>] find_get_page+0xa9/0xb9 [ 13.173009] [<ffffffff810b8b48>] filemap_fault+0x6a/0x34d [ 13.173009] [<ffffffff810d1a25>] __do_fault+0x54/0x3e6 [ 13.173009] [<ffffffff810d447a>] handle_pte_fault+0x12c/0x1ed [ 13.173009] [<ffffffff810d48f7>] handle_mm_fault+0x1cd/0x1e0 [ 13.173009] [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de [ 13.173009] [<ffffffff8156a75f>] page_fault+0x1f/0x30 [ 13.173009] [ 13.173009] -> #1 (&rq->lock){-.-.-.}: [ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104 [ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab [ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2 [ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c [ 13.173009] [<ffffffff815697f1>] _raw_spin_lock+0x36/0x45 [ 13.173009] [<ffffffff81027e19>] __task_rq_lock+0x8b/0xd3 [ 13.173009] [<ffffffff81032f7f>] wake_up_new_task+0x41/0x108 [ 13.173009] [<ffffffff810376c3>] do_fork+0x265/0x33f [ 13.173009] [<ffffffff81007d02>] kernel_thread+0x6b/0x6d [ 13.173009] [<ffffffff8153a9dd>] rest_init+0x21/0xd2 [ 13.173009] [<ffffffff81b1db4f>] start_kernel+0x3bb/0x3c6 [ 13.173009] [<ffffffff81b1d29f>] x86_64_start_reservations+0xaf/0xb3 [ 13.173009] [<ffffffff81b1d393>] x86_64_start_kernel+0xf0/0xf7 [ 13.173009] [ 13.173009] -> #0 (&p->pi_lock){-.-.-.}: [ 13.173009] [<ffffffff81067788>] check_prev_add+0x68/0x20e [ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104 [ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab [ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2 [ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c [ 13.173009] [<ffffffff815698ea>] _raw_spin_lock_irqsave+0x44/0x57 [ 13.173009] [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa [ 13.173009] [<ffffffff81032f3c>] wake_up_process+0x10/0x12 [ 13.173009] [<ffffffff810901e9>] rcu_cpu_kthread_timer+0x44/0x58 [ 13.173009] [<ffffffff81045286>] call_timer_fn+0xac/0x1e9 [ 13.173009] [<ffffffff8104556d>] run_timer_softirq+0x1aa/0x1f2 [ 13.173009] [<ffffffff8103e487>] __do_softirq+0x109/0x26a [ 13.173009] [<ffffffff8157144c>] call_softirq+0x1c/0x30 [ 13.173009] [<ffffffff81003207>] do_softirq+0x44/0xf1 [ 13.173009] [<ffffffff8103e8b9>] irq_exit+0x58/0xc8 [ 13.173009] [<ffffffff81017f5a>] smp_apic_timer_interrupt+0x79/0x87 [ 13.173009] [<ffffffff81570fd3>] apic_timer_interrupt+0x13/0x20 [ 13.173009] [<ffffffff810bd51a>] get_page_from_freelist+0x2aa/0x310 [ 13.173009] [<ffffffff810bdf03>] __alloc_pages_nodemask+0x178/0x243 [ 13.173009] [<ffffffff8101fe2f>] pte_alloc_one+0x1e/0x3a [ 13.173009] [<ffffffff810d27fe>] __pte_alloc+0x22/0x14b [ 13.173009] [<ffffffff810d48a8>] handle_mm_fault+0x17e/0x1e0 [ 13.173009] [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de [ 13.173009] [<ffffffff8156a75f>] page_fault+0x1f/0x30 [ 13.173009] [ 13.173009] other info that might help us debug this: [ 13.173009] [ 13.173009] Chain exists of: [ 13.173009] &p->pi_lock --> &rq->lock --> rcu_node_level_0 [ 13.173009] [ 13.173009] Possible unsafe locking scenario: [ 13.173009] [ 13.173009] CPU0 CPU1 [ 13.173009] ---- ---- [ 13.173009] lock(rcu_node_level_0); [ 13.173009] lock(&rq->lock); [ 13.173009] lock(rcu_node_level_0); [ 13.173009] lock(&p->pi_lock); [ 13.173009] [ 13.173009] *** DEADLOCK *** [ 13.173009] [ 13.173009] 3 locks held by blkid/267: [ 13.173009] #0: (&mm->mmap_sem){++++++}, at: [<ffffffff8156cdb4>] do_page_fault+0x1f3/0x5de [ 13.173009] #1: (&yield_timer){+.-...}, at: [<ffffffff810451da>] call_timer_fn+0x0/0x1e9 [ 13.173009] #2: (rcu_node_level_0){..-...}, at: [<ffffffff810901cc>] rcu_cpu_kthread_timer+0x27/0x58 [ 13.173009] [ 13.173009] stack backtrace: [ 13.173009] Pid: 267, comm: blkid Not tainted 2.6.39-rc6-mmotm0506 #1 [ 13.173009] Call Trace: [ 13.173009] <IRQ> [<ffffffff8154a529>] print_circular_bug+0xc8/0xd9 [ 13.173009] [<ffffffff81067788>] check_prev_add+0x68/0x20e [ 13.173009] [<ffffffff8100c861>] ? save_stack_trace+0x28/0x46 [ 13.173009] [<ffffffff810679b9>] check_prevs_add+0x8b/0x104 [ 13.173009] [<ffffffff81067da1>] validate_chain+0x36f/0x3ab [ 13.173009] [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2 [ 13.173009] [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa [ 13.173009] [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c [ 13.173009] [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa [ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [<ffffffff815698ea>] _raw_spin_lock_irqsave+0x44/0x57 [ 13.173009] [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa [ 13.173009] [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa [ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [<ffffffff81032f3c>] wake_up_process+0x10/0x12 [ 13.173009] [<ffffffff810901e9>] rcu_cpu_kthread_timer+0x44/0x58 [ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [<ffffffff81045286>] call_timer_fn+0xac/0x1e9 [ 13.173009] [<ffffffff810451da>] ? del_timer+0x75/0x75 [ 13.173009] [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [<ffffffff8104556d>] run_timer_softirq+0x1aa/0x1f2 [ 13.173009] [<ffffffff8103e487>] __do_softirq+0x109/0x26a [ 13.173009] [<ffffffff8106365f>] ? tick_dev_program_event+0x37/0xf6 [ 13.173009] [<ffffffff810a0e4a>] ? time_hardirqs_off+0x1b/0x2f [ 13.173009] [<ffffffff8157144c>] call_softirq+0x1c/0x30 [ 13.173009] [<ffffffff81003207>] do_softirq+0x44/0xf1 [ 13.173009] [<ffffffff8103e8b9>] irq_exit+0x58/0xc8 [ 13.173009] [<ffffffff81017f5a>] smp_apic_timer_interrupt+0x79/0x87 [ 13.173009] [<ffffffff81570fd3>] apic_timer_interrupt+0x13/0x20 [ 13.173009] <EOI> [<ffffffff810bd384>] ? get_page_from_freelist+0x114/0x310 [ 13.173009] [<ffffffff810bd51a>] ? get_page_from_freelist+0x2aa/0x310 [ 13.173009] [<ffffffff812220e7>] ? clear_page_c+0x7/0x10 [ 13.173009] [<ffffffff810bd1ef>] ? prep_new_page+0x14c/0x1cd [ 13.173009] [<ffffffff810bd51a>] get_page_from_freelist+0x2aa/0x310 [ 13.173009] [<ffffffff810bdf03>] __alloc_pages_nodemask+0x178/0x243 [ 13.173009] [<ffffffff810d46b9>] ? __pmd_alloc+0x87/0x99 [ 13.173009] [<ffffffff8101fe2f>] pte_alloc_one+0x1e/0x3a [ 13.173009] [<ffffffff810d46b9>] ? __pmd_alloc+0x87/0x99 [ 13.173009] [<ffffffff810d27fe>] __pte_alloc+0x22/0x14b [ 13.173009] [<ffffffff810d48a8>] handle_mm_fault+0x17e/0x1e0 [ 13.173009] [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de [ 13.173009] [<ffffffff810d915f>] ? sys_brk+0x32/0x10c [ 13.173009] [<ffffffff810a0e4a>] ? time_hardirqs_off+0x1b/0x2f [ 13.173009] [<ffffffff81065c4f>] ? trace_hardirqs_off_caller+0x3f/0x9c [ 13.173009] [<ffffffff812235dd>] ? trace_hardirqs_off_thunk+0x3a/0x3c [ 13.173009] [<ffffffff8156a75f>] page_fault+0x1f/0x30 [ 14.010075] usb 5-1: new full speed USB device number 2 using uhci_hcd Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-11 20:41:41 +08:00
atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
/* Since this has meaning only for leaf */
/* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
int grphi; /* highest-numbered CPU or group here. */
u8 grpnum; /* CPU/group number for next level up. */
u8 level; /* root is at level 0. */
struct rcu_node *parent;
struct list_head blkd_tasks;
/* Tasks blocked in RCU read-side critical */
/* section. Tasks are placed at the head */
/* of this list and age towards the tail. */
struct list_head *gp_tasks;
/* Pointer to the first task blocking the */
/* current grace period, or NULL if there */
/* is no such task. */
struct list_head *exp_tasks;
/* Pointer to the first task blocking the */
/* current expedited grace period, or NULL */
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there can cannot be any such task. */
#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
/* boosting is needed for this rcu_node */
/* structure. If there are no tasks */
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
unsigned long n_tasks_boosted;
/* Total number of tasks boosted. */
unsigned long n_exp_boosts;
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
/* Number of tasks boosted for normal GP. */
unsigned long n_balk_blkd_tasks;
/* Refused to boost: no blocked tasks. */
unsigned long n_balk_exp_gp_tasks;
/* Refused to boost: nothing blocking GP. */
unsigned long n_balk_boost_tasks;
/* Refused to boost: already boosting. */
unsigned long n_balk_notblocked;
/* Refused to boost: RCU RS CS still running. */
unsigned long n_balk_notyet;
/* Refused to boost: not yet time. */
unsigned long n_balk_nos;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
struct task_struct *node_kthread_task;
/* kthread that takes care of this rcu_node */
/* structure, for example, awakening the */
/* per-CPU kthreads as needed. */
unsigned int node_kthread_status;
/* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
/*
* Do a full breadth-first scan of the rcu_node structures for the
* specified rcu_state structure.
*/
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
(rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the
* specified rcu_state structure. Note that if there is a singleton
* rcu_node tree with but one rcu_node structure, this loop is a no-op.
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
(rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
* structure. Note that if there is a singleton rcu_node tree with but
* one rcu_node structure, this loop -will- visit the rcu_node structure.
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
(rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
/* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
#define RCU_NEXT_TAIL 3
#define RCU_NEXT_SIZE 4
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
unsigned long completed; /* Track rsp->completed gp number */
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
rcu: Simplify quiescent-state accounting There is often a delay between the time that a CPU passes through a quiescent state and the time that this quiescent state is reported to the RCU core. It is quite possible that the grace period ended before the quiescent state could be reported, for example, some other CPU might have deduced that this CPU passed through dyntick-idle mode. It is critically important that quiescent state be counted only against the grace period that was in effect at the time that the quiescent state was detected. Previously, this was handled by recording the number of the last grace period to complete when passing through a quiescent state. The RCU core then checks this number against the current value, and rejects the quiescent state if there is a mismatch. However, one additional possibility must be accounted for, namely that the quiescent state was recorded after the prior grace period completed but before the current grace period started. In this case, the RCU core must reject the quiescent state, but the recorded number will match. This is handled when the CPU becomes aware of a new grace period -- at that point, it invalidates any prior quiescent state. This works, but is a bit indirect. The new approach records the current grace period, and the RCU core checks to see (1) that this is still the current grace period and (2) that this grace period has not yet ended. This approach simplifies reasoning about correctness, and this commit changes over to this new approach. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-06-27 15:17:43 +08:00
unsigned long passed_quiesce_gpnum;
/* gpnum at time of quiescent state. */
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
/* 2) batch handling */
/*
* If nxtlist is not NULL, it is partitioned as follows.
* Any of the partitions might be empty, in which case the
* pointer to that partition will be equal to the pointer for
* the following partition. When the list is empty, all of
* the nxttail elements point to the ->nxtlist pointer itself,
* which in that case is NULL.
*
* [nxtlist, *nxttail[RCU_DONE_TAIL]):
* Entries that batch # <= ->completed
* The grace period for these entries has completed, and
* the other grace-period-completed entries may be moved
* here temporarily in rcu_process_callbacks().
* [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
* Entries that batch # <= ->completed - 1: waiting for current GP
* [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
* Entries known to have arrived before current GP ended
* [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
* Entries that might have arrived after current GP ended
* Note that the value of *nxttail[RCU_NEXT_TAIL] will
* always be NULL, as this is the end of the list.
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
long qlen; /* # of queued callbacks */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
/* 3) dynticks interface. */
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
unsigned long offline_fqs; /* Kicked due to being offline. */
unsigned long resched_ipi; /* Sent a resched IPI. */
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
unsigned long n_rp_qs_pending;
unsigned long n_rp_report_qs;
unsigned long n_rp_cb_ready;
unsigned long n_rp_cpu_needs_gp;
unsigned long n_rp_gp_completed;
unsigned long n_rp_gp_started;
unsigned long n_rp_need_fqs;
unsigned long n_rp_need_nothing;
int cpu;
struct rcu_state *rsp;
};
/* Values for fqs_state field in struct rcu_state. */
rcu: Fix long-grace-period race between forcing and initialization Very long RCU read-side critical sections (50 milliseconds or so) can cause a race between force_quiescent_state() and rcu_start_gp() as follows on kernel builds with multi-level rcu_node hierarchies: 1. CPU 0 calls force_quiescent_state(), sees that there is a grace period in progress, and acquires ->fsqlock. 2. CPU 1 detects the end of the grace period, and so cpu_quiet_msk_finish() sets rsp->completed to rsp->gpnum. This operation is carried out under the root rnp->lock, but CPU 0 has not yet acquired that lock. Note that rsp->signaled is still RCU_SAVE_DYNTICK from the last grace period. 3. CPU 1 calls rcu_start_gp(), but no one wants a new grace period, so it drops the root rnp->lock and returns. 4. CPU 0 acquires the root rnp->lock and picks up rsp->completed and rsp->signaled, then drops rnp->lock. It then enters the RCU_SAVE_DYNTICK leg of the switch statement. 5. CPU 2 invokes call_rcu(), and now needs a new grace period. It calls rcu_start_gp(), which acquires the root rnp->lock, sets rsp->signaled to RCU_GP_INIT (too bad that CPU 0 is already in the RCU_SAVE_DYNTICK leg of the switch statement!) and starts initializing the rcu_node hierarchy. If there are multiple levels to the hierarchy, it will drop the root rnp->lock and initialize the lower levels of the hierarchy. 6. CPU 0 notes that rsp->completed has not changed, which permits both CPU 2 and CPU 0 to try updating it concurrently. If CPU 0's update prevails, later calls to force_quiescent_state() can count old quiescent states against the new grace period, which can in turn result in premature ending of grace periods. Not good. This patch adds an RCU_GP_IDLE state for rsp->signaled that is set initially at boot time and any time a grace period ends. This prevents CPU 0 from getting into the workings of force_quiescent_state() in step 4. Additional locking and checks prevent the concurrent update of rsp->signaled in step 6. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1256742889199-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-28 23:14:49 +08:00
#define RCU_GP_IDLE 0 /* No grace period in progress. */
#define RCU_GP_INIT 1 /* Grace period being initialized. */
#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
#else
#define RCU_STALL_DELAY_DELTA 0
#endif
#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
RCU_STALL_DELAY_DELTA)
/* for rsp->jiffies_stall */
#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
/* for rsp->jiffies_stall */
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
/* to take at least one */
/* scheduling clock irq */
/* before ratting on them. */
#define rcu_wait(cond) \
do { \
for (;;) { \
set_current_state(TASK_INTERRUPTIBLE); \
if (cond) \
break; \
schedule(); \
} \
__set_current_state(TASK_RUNNING); \
} while (0)
/*
* RCU global state, including node hierarchy. This hierarchy is
* represented in "heap" form in a dense array. The root (first level)
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
* level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
* and the third level in ->node[m+1] and following (->node[m+1] referenced
* by ->level[2]). The number of levels is determined by the number of
* CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
* consisting of a single rcu_node.
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
/* The following fields are guarded by the root rcu_node's lock. */
u8 fqs_state ____cacheline_internodealigned_in_smp;
/* Force QS state. */
u8 fqs_active; /* force_quiescent_state() */
/* is running. */
u8 fqs_need_gp; /* A CPU was prevented from */
/* starting a new grace */
/* period because */
/* force_quiescent_state() */
/* was running. */
u8 boost; /* Subject to priority boost. */
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
/* End of fields guarded by root rcu_node's lock. */
raw_spinlock_t onofflock; /* exclude on/offline and */
/* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs; /* Number of calls to */
/* force_quiescent_state(). */
unsigned long n_force_qs_lh; /* ~Number of calls leaving */
/* due to lock unavailable. */
unsigned long n_force_qs_ngp; /* Number of calls leaving */
/* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
char *name; /* Name of structure. */
};
/* Return values for rcu_preempt_offline_tasks(). */
#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
/* GP were moved to root. */
#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
/* GP were moved to root. */
/*
* RCU implementation internal declarations:
*/
extern struct rcu_state rcu_sched_state;
DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
#ifdef CONFIG_TREE_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DECLARE_PER_CPU(char, rcu_cpu_has_work);
#endif /* #ifdef CONFIG_RCU_BOOST */
#ifndef RCU_TREE_NONCORE
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
rcu: Fix grace-period-stall bug on large systems with CPU hotplug When the last CPU of a given leaf rcu_node structure goes offline, all of the tasks queued on that leaf rcu_node structure (due to having blocked in their current RCU read-side critical sections) are requeued onto the root rcu_node structure. This requeuing is carried out by rcu_preempt_offline_tasks(). However, it is possible that these queued tasks are the only thing preventing the leaf rcu_node structure from reporting a quiescent state up the rcu_node hierarchy. Unfortunately, the old code would fail to do this reporting, resulting in a grace-period stall given the following sequence of events: 1. Kernel built for more than 32 CPUs on 32-bit systems or for more than 64 CPUs on 64-bit systems, so that there is more than one rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set to a number smaller than CONFIG_NR_CPUS.) 2. The kernel is built with CONFIG_TREE_PREEMPT_RCU. 3. A task running on a CPU associated with a given leaf rcu_node structure blocks while in an RCU read-side critical section -and- that CPU has not yet passed through a quiescent state for the current RCU grace period. This will cause the task to be queued on the leaf rcu_node's blocked_tasks[] array, in particular, on the element of this array corresponding to the current grace period. 4. Each of the remaining CPUs corresponding to this same leaf rcu_node structure pass through a quiescent state. However, the task is still in its RCU read-side critical section, so these quiescent states cannot be reported further up the rcu_node hierarchy. Nevertheless, all bits in the leaf rcu_node structure's ->qsmask field are now zero. 5. Each of the remaining CPUs go offline. (The events in step #4 and #5 can happen in any order as long as each CPU passes through a quiescent state before going offline.) 6. When the last CPU goes offline, __rcu_offline_cpu() will invoke rcu_preempt_offline_tasks(), which will move the task to the root rcu_node structure, but without reporting a quiescent state up the rcu_node hierarchy (and this failure to report a quiescent state is the bug). But because this leaf rcu_node structure's ->qsmask field is already zero and its ->block_tasks[] entries are all empty, force_quiescent_state() will skip this rcu_node structure. Therefore, grace periods are now hung. This patch abstracts some code out of rcu_read_unlock_special(), calling the result task_quiet() by analogy with cpu_quiet(), and invokes task_quiet() from both rcu_read_lock_special() and __rcu_offline_cpu(). Invoking task_quiet() from __rcu_offline_cpu() reports the quiescent state up the rcu_node hierarchy, fixing the bug. This ends up requiring a separate lock_class_key per level of the rcu_node hierarchy, which this patch also provides. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12589088301770-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-23 00:53:48 +08:00
#ifdef CONFIG_HOTPLUG_CPU
rcu: Rename "quiet" functions The number of "quiet" functions has grown recently, and the names are no longer very descriptive. The point of all of these functions is to do some portion of the task of reporting a quiescent state, so rename them accordingly: o cpu_quiet() becomes rcu_report_qs_rdp(), which reports a quiescent state to the per-CPU rcu_data structure. If this turns out to be a new quiescent state for this grace period, then rcu_report_qs_rnp() will be invoked to propagate the quiescent state up the rcu_node hierarchy. o cpu_quiet_msk() becomes rcu_report_qs_rnp(), which reports a quiescent state for a given CPU (or possibly a set of CPUs) up the rcu_node hierarchy. o cpu_quiet_msk_finish() becomes rcu_report_qs_rsp(), which reports a full set of quiescent states to the global rcu_state structure. o task_quiet() becomes rcu_report_unblock_qs_rnp(), which reports a quiescent state due to a task exiting an RCU read-side critical section that had previously blocked in that same critical section. As indicated by the new name, this type of quiescent state is reported up the rcu_node hierarchy (using rcu_report_qs_rnp() to do so). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Josh Triplett <josh@joshtriplett.org> Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12597846163698-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-03 04:10:13 +08:00
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
static void rcu_stop_cpu_kthread(int cpu);
rcu: Fix grace-period-stall bug on large systems with CPU hotplug When the last CPU of a given leaf rcu_node structure goes offline, all of the tasks queued on that leaf rcu_node structure (due to having blocked in their current RCU read-side critical sections) are requeued onto the root rcu_node structure. This requeuing is carried out by rcu_preempt_offline_tasks(). However, it is possible that these queued tasks are the only thing preventing the leaf rcu_node structure from reporting a quiescent state up the rcu_node hierarchy. Unfortunately, the old code would fail to do this reporting, resulting in a grace-period stall given the following sequence of events: 1. Kernel built for more than 32 CPUs on 32-bit systems or for more than 64 CPUs on 64-bit systems, so that there is more than one rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set to a number smaller than CONFIG_NR_CPUS.) 2. The kernel is built with CONFIG_TREE_PREEMPT_RCU. 3. A task running on a CPU associated with a given leaf rcu_node structure blocks while in an RCU read-side critical section -and- that CPU has not yet passed through a quiescent state for the current RCU grace period. This will cause the task to be queued on the leaf rcu_node's blocked_tasks[] array, in particular, on the element of this array corresponding to the current grace period. 4. Each of the remaining CPUs corresponding to this same leaf rcu_node structure pass through a quiescent state. However, the task is still in its RCU read-side critical section, so these quiescent states cannot be reported further up the rcu_node hierarchy. Nevertheless, all bits in the leaf rcu_node structure's ->qsmask field are now zero. 5. Each of the remaining CPUs go offline. (The events in step #4 and #5 can happen in any order as long as each CPU passes through a quiescent state before going offline.) 6. When the last CPU goes offline, __rcu_offline_cpu() will invoke rcu_preempt_offline_tasks(), which will move the task to the root rcu_node structure, but without reporting a quiescent state up the rcu_node hierarchy (and this failure to report a quiescent state is the bug). But because this leaf rcu_node structure's ->qsmask field is already zero and its ->block_tasks[] entries are all empty, force_quiescent_state() will skip this rcu_node structure. Therefore, grace periods are now hung. This patch abstracts some code out of rcu_read_unlock_special(), calling the result task_quiet() by analogy with cpu_quiet(), and invokes task_quiet() from both rcu_read_lock_special() and __rcu_offline_cpu(). Invoking task_quiet() from __rcu_offline_cpu() reports the quiescent state up the rcu_node hierarchy, fixing the bug. This ends up requiring a separate lock_class_key per level of the rcu_node hierarchy, which this patch also provides. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12589088301770-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-23 00:53:48 +08:00
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_stall_reset(void);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang If the following sequence of events occurs, then TREE_PREEMPT_RCU will hang waiting for a grace period to complete, eventually OOMing the system: o A TREE_PREEMPT_RCU build of the kernel is booted on a system with more than 64 physical CPUs present (32 on a 32-bit system). Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted with RCU_FANOUT set to a sufficiently small value that the physical CPUs populate two or more leaf rcu_node structures. o A task is preempted in an RCU read-side critical section while running on a CPU corresponding to a given leaf rcu_node structure. o All CPUs corresponding to this same leaf rcu_node structure record quiescent states for the current grace period. o All of these same CPUs go offline (hence the need for enough physical CPUs to populate more than one leaf rcu_node structure). This causes the preempted task to be moved to the root rcu_node structure. At this point, there is nothing left to cause the quiescent state to be propagated up the rcu_node tree, so the current grace period never completes. The simplest fix, especially after considering the deadlock possibilities, is to detect this situation when the last CPU is offlined, and to set that CPU's ->qsmask bit in its leaf rcu_node structure. This will cause the next invocation of force_quiescent_state() to end the grace period. Without this fix, this hang can be triggered in an hour or so on some machines with rcutorture and random CPU onlining/offlining. With this fix, these same machines pass a full 10 hours of this sort of abuse. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-16 00:26:14 +08:00
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp);
static void rcu_preempt_offline_cpu(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_preempt_check_callbacks(int cpu);
static void rcu_preempt_process_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
cpumask_var_t cm);
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp,
int rnp_index);
static void invoke_rcu_node_kthread(struct rcu_node *rnp);
static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
#endif /* #ifdef CONFIG_RCU_BOOST */
static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
#endif /* #ifndef RCU_TREE_NONCORE */