rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock()
Commitbb73c52bad
("rcu: Don't disable preemption for Tiny and Tree RCU readers") removed the barrier() calls from rcu_read_lock() and rcu_write_lock() in CONFIG_PREEMPT=n&&CONFIG_PREEMPT_COUNT=n kernels. Within RCU, this commit was OK, but it failed to account for things like get_user() that can pagefault and that can be reordered by the compiler. Lack of the barrier() calls in rcu_read_lock() and rcu_read_unlock() can cause these page faults to migrate into RCU read-side critical sections, which in CONFIG_PREEMPT=n kernels could result in too-short grace periods and arbitrary misbehavior. Please see commit386afc9114
("spinlocks and preemption points need to be at least compiler barriers") and Linus's commit66be4e66a7
("rcu: locking and unlocking need to always be at least barriers"), this last of which restores the barrier() call to both rcu_read_lock() and rcu_read_unlock(). This commit removes barrier() calls that are no longer needed given that the addition of them in Linus's commit noted above. The combination of this commit and Linus's commit effectively reverts commitbb73c52bad
("rcu: Don't disable preemption for Tiny and Tree RCU readers"). Reported-by: Herbert Xu <herbert@gondor.apana.org.au> Reported-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com> [ paulmck: Fix embarrassing typo located by Alan Stern. ]
This commit is contained in:
parent
b55bd58555
commit
1f3ebc8253
|
@ -2129,6 +2129,8 @@ Some of the relevant points of interest are as follows:
|
||||||
<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
|
<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
|
||||||
<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
|
<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
|
||||||
<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
|
<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
|
||||||
|
<li> <a href="#Accesses to User Memory and RCU">
|
||||||
|
Accesses to User Memory and RCU</a>.
|
||||||
<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
|
<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
|
||||||
<li> <a href="#Scheduling-Clock Interrupts and RCU">
|
<li> <a href="#Scheduling-Clock Interrupts and RCU">
|
||||||
Scheduling-Clock Interrupts and RCU</a>.
|
Scheduling-Clock Interrupts and RCU</a>.
|
||||||
|
@ -2521,6 +2523,75 @@ cannot be used.
|
||||||
The tracing folks both located the requirement and provided the
|
The tracing folks both located the requirement and provided the
|
||||||
needed fix, so this surprise requirement was relatively painless.
|
needed fix, so this surprise requirement was relatively painless.
|
||||||
|
|
||||||
|
<h3><a name="Accesses to User Memory and RCU">
|
||||||
|
Accesses to User Memory and RCU</a></h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The kernel needs to access user-space memory, for example, to access
|
||||||
|
data referenced by system-call parameters.
|
||||||
|
The <tt>get_user()</tt> macro does this job.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
However, user-space memory might well be paged out, which means
|
||||||
|
that <tt>get_user()</tt> might well page-fault and thus block while
|
||||||
|
waiting for the resulting I/O to complete.
|
||||||
|
It would be a very bad thing for the compiler to reorder
|
||||||
|
a <tt>get_user()</tt> invocation into an RCU read-side critical
|
||||||
|
section.
|
||||||
|
For example, suppose that the source code looked like this:
|
||||||
|
|
||||||
|
<blockquote>
|
||||||
|
<pre>
|
||||||
|
1 rcu_read_lock();
|
||||||
|
2 p = rcu_dereference(gp);
|
||||||
|
3 v = p->value;
|
||||||
|
4 rcu_read_unlock();
|
||||||
|
5 get_user(user_v, user_p);
|
||||||
|
6 do_something_with(v, user_v);
|
||||||
|
</pre>
|
||||||
|
</blockquote>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The compiler must not be permitted to transform this source code into
|
||||||
|
the following:
|
||||||
|
|
||||||
|
<blockquote>
|
||||||
|
<pre>
|
||||||
|
1 rcu_read_lock();
|
||||||
|
2 p = rcu_dereference(gp);
|
||||||
|
3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!!
|
||||||
|
4 v = p->value;
|
||||||
|
5 rcu_read_unlock();
|
||||||
|
6 do_something_with(v, user_v);
|
||||||
|
</pre>
|
||||||
|
</blockquote>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If the compiler did make this transformation in a
|
||||||
|
<tt>CONFIG_PREEMPT=n</tt> kernel build, and if <tt>get_user()</tt> did
|
||||||
|
page fault, the result would be a quiescent state in the middle
|
||||||
|
of an RCU read-side critical section.
|
||||||
|
This misplaced quiescent state could result in line 4 being
|
||||||
|
a use-after-free access, which could be bad for your kernel's
|
||||||
|
actuarial statistics.
|
||||||
|
Similar examples can be constructed with the call to <tt>get_user()</tt>
|
||||||
|
preceding the <tt>rcu_read_lock()</tt>.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Unfortunately, <tt>get_user()</tt> doesn't have any particular
|
||||||
|
ordering properties, and in some architectures the underlying <tt>asm</tt>
|
||||||
|
isn't even marked <tt>volatile</tt>.
|
||||||
|
And even if it was marked <tt>volatile</tt>, the above access to
|
||||||
|
<tt>p->value</tt> is not volatile, so the compiler would not have any
|
||||||
|
reason to keep those two accesses in order.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Therefore, the Linux-kernel definitions of <tt>rcu_read_lock()</tt>
|
||||||
|
and <tt>rcu_read_unlock()</tt> must act as compiler barriers,
|
||||||
|
at least for outermost instances of <tt>rcu_read_lock()</tt> and
|
||||||
|
<tt>rcu_read_unlock()</tt> within a nested set of RCU read-side critical
|
||||||
|
sections.
|
||||||
|
|
||||||
<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
|
<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt)
|
||||||
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
|
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
|
||||||
struct rcu_node *rnp;
|
struct rcu_node *rnp;
|
||||||
|
|
||||||
barrier(); /* Avoid RCU read-side critical sections leaking down. */
|
|
||||||
trace_rcu_utilization(TPS("Start context switch"));
|
trace_rcu_utilization(TPS("Start context switch"));
|
||||||
lockdep_assert_irqs_disabled();
|
lockdep_assert_irqs_disabled();
|
||||||
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
|
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
|
||||||
|
@ -340,7 +339,6 @@ void rcu_note_context_switch(bool preempt)
|
||||||
if (rdp->exp_deferred_qs)
|
if (rdp->exp_deferred_qs)
|
||||||
rcu_report_exp_rdp(rdp);
|
rcu_report_exp_rdp(rdp);
|
||||||
trace_rcu_utilization(TPS("End context switch"));
|
trace_rcu_utilization(TPS("End context switch"));
|
||||||
barrier(); /* Avoid RCU read-side critical sections leaking up. */
|
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
|
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
|
||||||
|
|
||||||
|
@ -828,11 +826,6 @@ static void rcu_qs(void)
|
||||||
* dyntick-idle quiescent state visible to other CPUs, which will in
|
* dyntick-idle quiescent state visible to other CPUs, which will in
|
||||||
* some cases serve for expedited as well as normal grace periods.
|
* some cases serve for expedited as well as normal grace periods.
|
||||||
* Either way, register a lightweight quiescent state.
|
* Either way, register a lightweight quiescent state.
|
||||||
*
|
|
||||||
* The barrier() calls are redundant in the common case when this is
|
|
||||||
* called externally, but just in case this is called from within this
|
|
||||||
* file.
|
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
void rcu_all_qs(void)
|
void rcu_all_qs(void)
|
||||||
{
|
{
|
||||||
|
@ -847,14 +840,12 @@ void rcu_all_qs(void)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
this_cpu_write(rcu_data.rcu_urgent_qs, false);
|
this_cpu_write(rcu_data.rcu_urgent_qs, false);
|
||||||
barrier(); /* Avoid RCU read-side critical sections leaking down. */
|
|
||||||
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
|
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
|
||||||
local_irq_save(flags);
|
local_irq_save(flags);
|
||||||
rcu_momentary_dyntick_idle();
|
rcu_momentary_dyntick_idle();
|
||||||
local_irq_restore(flags);
|
local_irq_restore(flags);
|
||||||
}
|
}
|
||||||
rcu_qs();
|
rcu_qs();
|
||||||
barrier(); /* Avoid RCU read-side critical sections leaking up. */
|
|
||||||
preempt_enable();
|
preempt_enable();
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(rcu_all_qs);
|
EXPORT_SYMBOL_GPL(rcu_all_qs);
|
||||||
|
@ -864,7 +855,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs);
|
||||||
*/
|
*/
|
||||||
void rcu_note_context_switch(bool preempt)
|
void rcu_note_context_switch(bool preempt)
|
||||||
{
|
{
|
||||||
barrier(); /* Avoid RCU read-side critical sections leaking down. */
|
|
||||||
trace_rcu_utilization(TPS("Start context switch"));
|
trace_rcu_utilization(TPS("Start context switch"));
|
||||||
rcu_qs();
|
rcu_qs();
|
||||||
/* Load rcu_urgent_qs before other flags. */
|
/* Load rcu_urgent_qs before other flags. */
|
||||||
|
@ -877,7 +867,6 @@ void rcu_note_context_switch(bool preempt)
|
||||||
rcu_tasks_qs(current);
|
rcu_tasks_qs(current);
|
||||||
out:
|
out:
|
||||||
trace_rcu_utilization(TPS("End context switch"));
|
trace_rcu_utilization(TPS("End context switch"));
|
||||||
barrier(); /* Avoid RCU read-side critical sections leaking up. */
|
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
|
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue