Merge branches 'doc.2019.10.29a', 'fixes.2019.10.30a', 'nohz.2019.10.28a', 'replace.2019.10.30a', 'torture.2019.10.05a' and 'lkmm.2019.10.05a' into HEAD

doc.2019.10.29a: RCU documentation updates.
fixes.2019.10.30a: RCU miscellaneous fixes.
nohz.2019.10.28a: RCU NO_HZ and NO_HZ_FULL updates.
replace.2019.10.30a: Replace rcu_swap_protected() with rcu_replace().
torture.2019.10.05a: RCU torture-test updates.

lkmm.2019.10.05a: Linux kernel memory model updates.
This commit is contained in:
Paul E. McKenney 2019-10-30 08:47:13 -07:00
50 changed files with 785 additions and 207 deletions

View File

@ -416,8 +416,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
*filter = tmp;
mutex_lock(&kvm->lock);
rcu_swap_protected(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
mutex_unlock(&kvm->lock);
synchronize_srcu_expedited(&kvm->srcu);

View File

@ -1629,7 +1629,7 @@ set_engines(struct i915_gem_context *ctx,
i915_gem_context_set_user_engines(ctx);
else
i915_gem_context_clear_user_engines(ctx);
rcu_swap_protected(ctx->engines, set.engines, 1);
set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
mutex_unlock(&ctx->engines_mutex);
call_rcu(&set.engines->rcu, free_engines_rcu);

View File

@ -434,8 +434,8 @@ static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page,
return;
mutex_lock(&sdev->inquiry_mutex);
rcu_swap_protected(*sdev_vpd_buf, vpd_buf,
lockdep_is_held(&sdev->inquiry_mutex));
vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf,
lockdep_is_held(&sdev->inquiry_mutex));
mutex_unlock(&sdev->inquiry_mutex);
if (vpd_buf)

View File

@ -466,10 +466,10 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
sdev->request_queue = NULL;
mutex_lock(&sdev->inquiry_mutex);
rcu_swap_protected(sdev->vpd_pg80, vpd_pg80,
lockdep_is_held(&sdev->inquiry_mutex));
rcu_swap_protected(sdev->vpd_pg83, vpd_pg83,
lockdep_is_held(&sdev->inquiry_mutex));
vpd_pg80 = rcu_replace_pointer(sdev->vpd_pg80, vpd_pg80,
lockdep_is_held(&sdev->inquiry_mutex));
vpd_pg83 = rcu_replace_pointer(sdev->vpd_pg83, vpd_pg83,
lockdep_is_held(&sdev->inquiry_mutex));
mutex_unlock(&sdev->inquiry_mutex);
if (vpd_pg83)

View File

@ -279,8 +279,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
struct afs_addr_list *old = addrs;
write_lock(&server->lock);
rcu_swap_protected(server->addresses, old,
lockdep_is_held(&server->lock));
old = rcu_replace_pointer(server->addresses, old,
lockdep_is_held(&server->lock));
write_unlock(&server->lock);
afs_put_addrlist(old);
}

View File

@ -24,34 +24,6 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
}
/**
* hlist_bl_del_init_rcu - deletes entry from hash list with re-initialization
* @n: the element to delete from the hash list.
*
* Note: hlist_bl_unhashed() on the node returns true after this. It is
* useful for RCU based read lockfree traversal if the writer side
* must know if the list entry is still hashed or already unhashed.
*
* In particular, it means that we can not poison the forward pointers
* that may still be used for walking the hash list and we can only
* zero the pprev pointer so list_unhashed() will return true after
* this.
*
* The caller must take whatever precautions are necessary (such as
* holding appropriate locks) to avoid racing with another
* list-mutation primitive, such as hlist_bl_add_head_rcu() or
* hlist_bl_del_rcu(), running on this same list. However, it is
* perfectly legal to run concurrently with the _rcu list-traversal
* primitives, such as hlist_bl_for_each_entry_rcu().
*/
static inline void hlist_bl_del_init_rcu(struct hlist_bl_node *n)
{
if (!hlist_bl_unhashed(n)) {
__hlist_bl_del(n);
n->pprev = NULL;
}
}
/**
* hlist_bl_del_rcu - deletes entry from hash list without re-initialization
* @n: the element to delete from the hash list.

View File

@ -382,6 +382,24 @@ do { \
smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)
/**
* rcu_replace_pointer() - replace an RCU pointer, returning its old value
* @rcu_ptr: RCU pointer, whose old value is returned
* @ptr: regular pointer
* @c: the lockdep conditions under which the dereference will take place
*
* Perform a replacement, where @rcu_ptr is an RCU-annotated
* pointer and @c is the lockdep argument that is passed to the
* rcu_dereference_protected() call used to read that pointer. The old
* value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
*/
#define rcu_replace_pointer(rcu_ptr, ptr, c) \
({ \
typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \
rcu_assign_pointer((rcu_ptr), (ptr)); \
__tmp; \
})
/**
* rcu_swap_protected() - swap an RCU and a regular pointer
* @rcu_ptr: RCU pointer

View File

@ -84,6 +84,7 @@ static inline void rcu_scheduler_starting(void) { }
#endif /* #else #ifndef CONFIG_SRCU */
static inline void rcu_end_inkernel_boot(void) { }
static inline bool rcu_is_watching(void) { return true; }
static inline void rcu_momentary_dyntick_idle(void) { }
/* Avoid RCU read-side critical sections leaking across. */
static inline void rcu_all_qs(void) { barrier(); }

View File

@ -37,6 +37,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier(void);
bool rcu_eqs_special_set(int cpu);
void rcu_momentary_dyntick_idle(void);
unsigned long get_state_synchronize_rcu(void);
void cond_synchronize_rcu(unsigned long oldstate);

View File

@ -108,7 +108,8 @@ enum tick_dep_bits {
TICK_DEP_BIT_POSIX_TIMER = 0,
TICK_DEP_BIT_PERF_EVENTS = 1,
TICK_DEP_BIT_SCHED = 2,
TICK_DEP_BIT_CLOCK_UNSTABLE = 3
TICK_DEP_BIT_CLOCK_UNSTABLE = 3,
TICK_DEP_BIT_RCU = 4
};
#define TICK_DEP_MASK_NONE 0
@ -116,6 +117,7 @@ enum tick_dep_bits {
#define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS)
#define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED)
#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
#define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU)
#ifdef CONFIG_NO_HZ_COMMON
extern bool tick_nohz_enabled;
@ -268,6 +270,9 @@ static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
static inline void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { }
static inline void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { }
static inline void tick_dep_set(enum tick_dep_bits bit) { }
static inline void tick_dep_clear(enum tick_dep_bits bit) { }
static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { }

View File

@ -93,16 +93,16 @@ TRACE_EVENT_RCU(rcu_grace_period,
* the data from the rcu_node structure, other than rcuname, which comes
* from the rcu_state structure, and event, which is one of the following:
*
* "Startleaf": Request a grace period based on leaf-node data.
* "Cleanup": Clean up rcu_node structure after previous GP.
* "CleanupMore": Clean up, and another GP is needed.
* "EndWait": Complete wait.
* "NoGPkthread": The RCU grace-period kthread has not yet started.
* "Prestarted": Someone beat us to the request
* "Startedleaf": Leaf node marked for future GP.
* "Startedleafroot": All nodes from leaf to root marked for future GP.
* "Startedroot": Requested a nocb grace period based on root-node data.
* "NoGPkthread": The RCU grace-period kthread has not yet started.
* "Startleaf": Request a grace period based on leaf-node data.
* "StartWait": Start waiting for the requested grace period.
* "EndWait": Complete wait.
* "Cleanup": Clean up rcu_node structure after previous GP.
* "CleanupMore": Clean up, and another GP is needed.
*/
TRACE_EVENT_RCU(rcu_future_grace_period,
@ -258,20 +258,27 @@ TRACE_EVENT_RCU(rcu_exp_funnel_lock,
* the number of the offloaded CPU are extracted. The third and final
* argument is a string as follows:
*
* "WakeEmpty": Wake rcuo kthread, first CB to empty list.
* "WakeEmptyIsDeferred": Wake rcuo kthread later, first CB to empty list.
* "WakeOvf": Wake rcuo kthread, CB list is huge.
* "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge.
* "WakeNot": Don't wake rcuo kthread.
* "WakeNotPoll": Don't wake rcuo kthread because it is polling.
* "DeferredWake": Carried out the "IsDeferred" wakeup.
* "Poll": Start of new polling cycle for rcu_nocb_poll.
* "Sleep": Sleep waiting for GP for !rcu_nocb_poll.
* "CBSleep": Sleep waiting for CBs for !rcu_nocb_poll.
* "WokeEmpty": rcuo kthread woke to find empty list.
* "WokeNonEmpty": rcuo kthread woke to find non-empty list.
* "WaitQueue": Enqueue partially done, timed wait for it to complete.
* "WokeQueue": Partial enqueue now complete.
* "AlreadyAwake": The to-be-awakened rcuo kthread is already awake.
* "Bypass": rcuo GP kthread sees non-empty ->nocb_bypass.
* "CBSleep": rcuo CB kthread sleeping waiting for CBs.
* "Check": rcuo GP kthread checking specified CPU for work.
* "DeferredWake": Timer expired or polled check, time to wake.
* "DoWake": The to-be-awakened rcuo kthread needs to be awakened.
* "EndSleep": Done waiting for GP for !rcu_nocb_poll.
* "FirstBQ": New CB to empty ->nocb_bypass (->cblist maybe non-empty).
* "FirstBQnoWake": FirstBQ plus rcuo kthread need not be awakened.
* "FirstBQwake": FirstBQ plus rcuo kthread must be awakened.
* "FirstQ": New CB to empty ->cblist (->nocb_bypass maybe non-empty).
* "NeedWaitGP": rcuo GP kthread must wait on a grace period.
* "Poll": Start of new polling cycle for rcu_nocb_poll.
* "Sleep": Sleep waiting for GP for !rcu_nocb_poll.
* "Timer": Deferred-wake timer expired.
* "WakeEmptyIsDeferred": Wake rcuo kthread later, first CB to empty list.
* "WakeEmpty": Wake rcuo kthread, first CB to empty list.
* "WakeNot": Don't wake rcuo kthread.
* "WakeNotPoll": Don't wake rcuo kthread because it is polling.
* "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge.
* "WokeEmpty": rcuo CB kthread woke to find empty list.
*/
TRACE_EVENT_RCU(rcu_nocb_wake,
@ -713,8 +720,6 @@ TRACE_EVENT_RCU(rcu_torture_read,
* "Begin": rcu_barrier() started.
* "EarlyExit": rcu_barrier() piggybacked, thus early exit.
* "Inc1": rcu_barrier() piggyback check counter incremented.
* "OfflineNoCB": rcu_barrier() found callback on never-online CPU
* "OnlineNoCB": rcu_barrier() found online no-CBs CPU.
* "OnlineQ": rcu_barrier() found online CPU with callbacks.
* "OnlineNQ": rcu_barrier() found online CPU, no callbacks.
* "IRQ": An rcu_barrier_callback() callback posted on remote CPU.

View File

@ -367,7 +367,8 @@ TRACE_EVENT(itimer_expire,
tick_dep_name(POSIX_TIMER) \
tick_dep_name(PERF_EVENTS) \
tick_dep_name(SCHED) \
tick_dep_name_end(CLOCK_UNSTABLE)
tick_dep_name(CLOCK_UNSTABLE) \
tick_dep_name_end(RCU)
#undef tick_dep_name
#undef tick_dep_mask_name

View File

@ -180,8 +180,8 @@ static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
struct bpf_prog_array *old_array)
{
rcu_swap_protected(cgrp->bpf.effective[type], old_array,
lockdep_is_held(&cgroup_mutex));
old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/

View File

@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/sched/rt.h>
#include <linux/spinlock.h>
#include <linux/rwlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/smp.h>
@ -889,16 +888,16 @@ static int __init lock_torture_init(void)
cxt.nrealwriters_stress = 2 * num_online_cpus();
#ifdef CONFIG_DEBUG_MUTEXES
if (strncmp(torture_type, "mutex", 5) == 0)
if (str_has_prefix(torture_type, "mutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_RT_MUTEXES
if (strncmp(torture_type, "rtmutex", 7) == 0)
if (str_has_prefix(torture_type, "rtmutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
if ((strncmp(torture_type, "spin", 4) == 0) ||
(strncmp(torture_type, "rw_lock", 7) == 0))
if ((str_has_prefix(torture_type, "spin")) ||
(str_has_prefix(torture_type, "rw_lock")))
cxt.debug_lock = true;
#endif

View File

@ -299,6 +299,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
for (i = 0; i < RCU_NUM_LVLS; i++)
levelspread[i] = INT_MIN;
if (rcu_fanout_exact) {
levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
@ -455,7 +457,6 @@ enum rcutorture_type {
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
void rcutorture_record_progress(unsigned long vernum);
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
unsigned long secs,
@ -468,7 +469,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*flags = 0;
*gp_seq = 0;
}
static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,

View File

@ -88,7 +88,7 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
}
/* Set the length of an rcu_segcblist structure. */
void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
atomic_long_set(&rsclp->len, v);
@ -104,7 +104,7 @@ void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
* This increase is fully ordered with respect to the callers accesses
* both before and after.
*/
void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
smp_mb__before_atomic(); /* Up to the caller! */
@ -134,7 +134,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
* with the actual number of callbacks on the structure. This exchange is
* fully ordered with respect to the callers accesses both before and after.
*/
long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
return atomic_long_xchg(&rsclp->len, v);

View File

@ -109,15 +109,6 @@ static unsigned long b_rcu_perf_writer_started;
static unsigned long b_rcu_perf_writer_finished;
static DEFINE_PER_CPU(atomic_t, n_async_inflight);
static int rcu_perf_writer_state;
#define RTWS_INIT 0
#define RTWS_ASYNC 1
#define RTWS_BARRIER 2
#define RTWS_EXP_SYNC 3
#define RTWS_SYNC 4
#define RTWS_IDLE 5
#define RTWS_STOPPING 6
#define MAX_MEAS 10000
#define MIN_MEAS 100
@ -404,25 +395,20 @@ rcu_perf_writer(void *arg)
if (!rhp)
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
rcu_perf_writer_state = RTWS_ASYNC;
atomic_inc(this_cpu_ptr(&n_async_inflight));
cur_ops->async(rhp, rcu_perf_async_cb);
rhp = NULL;
} else if (!kthread_should_stop()) {
rcu_perf_writer_state = RTWS_BARRIER;
cur_ops->gp_barrier();
goto retry;
} else {
kfree(rhp); /* Because we are stopping. */
}
} else if (gp_exp) {
rcu_perf_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
} else {
rcu_perf_writer_state = RTWS_SYNC;
cur_ops->sync();
}
rcu_perf_writer_state = RTWS_IDLE;
t = ktime_get_mono_fast_ns();
*wdp = t - *wdp;
i_max = i;
@ -463,10 +449,8 @@ rcu_perf_writer(void *arg)
rcu_perf_wait_shutdown();
} while (!torture_must_stop());
if (gp_async) {
rcu_perf_writer_state = RTWS_BARRIER;
cur_ops->gp_barrier();
}
rcu_perf_writer_state = RTWS_STOPPING;
writer_n_durations[me] = i_max;
torture_kthread_stopping("rcu_perf_writer");
return 0;

View File

@ -44,6 +44,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/oom.h>
#include <linux/tick.h>
#include "rcu.h"
@ -1363,15 +1364,15 @@ rcu_torture_reader(void *arg)
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
timer_setup_on_stack(&t, rcu_torture_timer, 0);
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
if (!rcu_torture_one_read(&rand))
if (!rcu_torture_one_read(&rand) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
if (time_after(jiffies, lastsleep)) {
if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
schedule_timeout_interruptible(1);
lastsleep = jiffies + 10;
}
@ -1383,6 +1384,7 @@ rcu_torture_reader(void *arg)
del_timer_sync(&t);
destroy_timer_on_stack(&t);
}
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
@ -1442,15 +1444,18 @@ rcu_torture_stats_print(void)
n_rcu_torture_barrier_error);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_barrier_error != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
n_rcu_torture_boost_failure != 0 ||
if (atomic_read(&n_rcu_torture_mberror) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(1);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed
WARN_ON_ONCE(i > 1); // Too-short grace period
}
pr_cont("Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@ -1729,10 +1734,10 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
// Real call_rcu() floods hit userspace, so emulate that.
if (need_resched() || (iter & 0xfff))
schedule();
} else {
// No userspace emulation: CB invocation throttles call_rcu()
cond_resched();
return;
}
// No userspace emulation: CB invocation throttles call_rcu()
cond_resched();
}
/*
@ -1759,6 +1764,11 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
kfree(rfcp);
freed++;
rcu_torture_fwd_prog_cond_resched(freed);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
}
return freed;
}
@ -1803,7 +1813,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
rcu_torture_fwd_prog_cond_resched(1);
cond_resched();
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
@ -1833,6 +1843,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
static void rcu_torture_fwd_prog_cr(void)
{
unsigned long cver;
unsigned long flags;
unsigned long gps;
int i;
long n_launders;
@ -1865,6 +1876,7 @@ static void rcu_torture_fwd_prog_cr(void)
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
rcu_launder_gp_seq_start = gps;
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@ -1891,6 +1903,11 @@ static void rcu_torture_fwd_prog_cr(void)
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
}
stoppedat = jiffies;
n_launders_cb_snap = READ_ONCE(n_launders_cb);
@ -1911,6 +1928,7 @@ static void rcu_torture_fwd_prog_cr(void)
rcu_torture_fwd_cb_hist();
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}

View File

@ -364,7 +364,7 @@ bool rcu_eqs_special_set(int cpu)
*
* The caller must have disabled interrupts and must not be idle.
*/
static void __maybe_unused rcu_momentary_dyntick_idle(void)
void rcu_momentary_dyntick_idle(void)
{
int special;
@ -375,6 +375,7 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
/**
* rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
@ -496,7 +497,7 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next
module_param(rcu_kick_kthreads, bool, 0644);
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
static int rcu_pending(void);
static int rcu_pending(int user);
/*
* Return the number of RCU GPs completed thus far for debug & stats.
@ -824,6 +825,11 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
rcu_cleanup_after_idle();
incby = 1;
} else if (tick_nohz_full_cpu(rdp->cpu) &&
rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
rdp->rcu_forced_tick = true;
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
@ -885,6 +891,21 @@ void rcu_irq_enter_irqson(void)
local_irq_restore(flags);
}
/*
* If any sort of urgency was applied to the current CPU (for example,
* the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
* to get to a quiescent state, disable it.
*/
static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
{
WRITE_ONCE(rdp->rcu_urgent_qs, false);
WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
rdp->rcu_forced_tick = false;
}
}
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
@ -1073,6 +1094,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
@ -1968,7 +1990,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
@ -1979,6 +2000,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
if (!offloaded)
needwake = rcu_accelerate_cbs(rnp, rdp);
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
if (needwake)
@ -2101,6 +2123,9 @@ int rcutree_dead_cpu(unsigned int cpu)
rcu_boost_kthread_setaffinity(rnp, -1);
/* Do any needed no-CB deferred wakeups from this CPU. */
do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
}
@ -2151,6 +2176,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
debug_rcu_head_unqueue(rhp);
@ -2217,6 +2243,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Re-invoke RCU core processing if there are callbacks remaining. */
if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
/*
@ -2241,7 +2268,7 @@ void rcu_sched_clock_irq(int user)
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
rcu_flavor_sched_clock_irq(user);
if (rcu_pending())
if (rcu_pending(user))
invoke_rcu_core();
trace_rcu_utilization(TPS("End scheduler-tick"));
@ -2259,6 +2286,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
int cpu;
unsigned long flags;
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp) {
@ -2283,8 +2311,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
if (f(per_cpu_ptr(&rcu_data, cpu)))
rdp = per_cpu_ptr(&rcu_data, cpu);
if (f(rdp)) {
mask |= bit;
rcu_disable_urgency_upon_qs(rdp);
}
}
}
if (mask != 0) {
@ -2312,7 +2343,7 @@ void rcu_force_quiescent_state(void)
rnp = __this_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret)
@ -2786,8 +2817,9 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
* CPU-local state are performed first. However, we must check for CPU
* stalls first, else we might not get a chance.
*/
static int rcu_pending(void)
static int rcu_pending(int user)
{
bool gp_in_progress;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
@ -2798,12 +2830,13 @@ static int rcu_pending(void)
if (rcu_nocb_need_deferred_wakeup(rdp))
return 1;
/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
if (rcu_nohz_full_cpu())
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
gp_in_progress = rcu_gp_in_progress();
if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
/* Does this CPU have callbacks ready to invoke? */
@ -2811,8 +2844,7 @@ static int rcu_pending(void)
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
if (!rcu_gp_in_progress() &&
rcu_segcblist_is_enabled(&rdp->cblist) &&
if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
!rcu_segcblist_is_offloaded(&rdp->cblist)) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
@ -2845,7 +2877,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
{
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
rcu_barrier_trace(TPS("LastCB"), -1,
rcu_state.barrier_sequence);
rcu_state.barrier_sequence);
complete(&rcu_state.barrier_completion);
} else {
rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
@ -2869,7 +2901,7 @@ static void rcu_barrier_func(void *unused)
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
rcu_barrier_trace(TPS("IRQNQ"), -1,
rcu_state.barrier_sequence);
rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
}
@ -2896,7 +2928,7 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
rcu_barrier_trace(TPS("EarlyExit"), -1,
rcu_state.barrier_sequence);
rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
@ -2928,11 +2960,11 @@ void rcu_barrier(void)
continue;
if (rcu_segcblist_n_cbs(&rdp->cblist)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
rcu_state.barrier_sequence);
rcu_state.barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
} else {
rcu_barrier_trace(TPS("OnlineNQ"), cpu,
rcu_state.barrier_sequence);
rcu_state.barrier_sequence);
}
}
put_online_cpus();
@ -3083,6 +3115,9 @@ int rcutree_online_cpu(unsigned int cpu)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
rcutree_affinity_setting(cpu, -1);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
}
@ -3103,6 +3138,9 @@ int rcutree_offline_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcutree_affinity_setting(cpu, cpu);
// nohz_full CPUs need the tick for stop-machine to work quickly
tick_dep_set(TICK_DEP_BIT_RCU);
return 0;
}
@ -3148,6 +3186,7 @@ void rcu_cpu_starting(unsigned int cpu)
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {

View File

@ -181,6 +181,7 @@ struct rcu_data {
atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */

View File

@ -1946,7 +1946,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
int __maybe_unused cpu = my_rdp->cpu;
unsigned long cur_gp_seq;
unsigned long flags;
bool gotcbs;
bool gotcbs = false;
unsigned long j = jiffies;
bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;

View File

@ -233,6 +233,7 @@ static int multi_cpu_stop(void *data)
*/
touch_nmi_watchdog();
}
rcu_momentary_dyntick_idle();
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);

View File

@ -172,6 +172,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
static bool check_tick_dependency(atomic_t *dep)
@ -198,6 +199,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
if (val & TICK_DEP_MASK_RCU) {
trace_tick_stop(0, TICK_DEP_MASK_RCU);
return true;
}
return false;
}
@ -324,6 +330,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
preempt_enable();
}
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
@ -331,6 +338,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
* Set a per-task tick dependency. Posix CPU timers need this in order to elapse
@ -344,11 +352,13 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
*/
tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
/*
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
@ -397,6 +407,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
}
EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{

View File

@ -364,11 +364,6 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex(wq) \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex), \
"RCU or wq->mutex should be held")
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex) && \
@ -425,9 +420,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* ignored.
*/
#define for_each_pwq(pwq, wq) \
list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
else
list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
lockdep_is_held(&(wq->mutex)))
#ifdef CONFIG_DEBUG_OBJECTS_WORK

View File

@ -1288,8 +1288,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
}
mutex_lock(&ifalias_mutex);
rcu_swap_protected(dev->ifalias, new_alias,
mutex_is_locked(&ifalias_mutex));
new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
mutex_is_locked(&ifalias_mutex));
mutex_unlock(&ifalias_mutex);
if (new_alias)

View File

@ -356,8 +356,8 @@ int reuseport_detach_prog(struct sock *sk)
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
rcu_swap_protected(reuse->prog, old_prog,
lockdep_is_held(&reuseport_lock));
old_prog = rcu_replace_pointer(reuse->prog, old_prog,
lockdep_is_held(&reuseport_lock));
spin_unlock_bh(&reuseport_lock);
if (!old_prog)

View File

@ -1461,8 +1461,9 @@ static void nft_chain_stats_replace(struct nft_trans *trans)
if (!nft_trans_chain_stats(trans))
return;
rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans),
lockdep_commit_lock_is_held(trans->ctx.net));
nft_trans_chain_stats(trans) =
rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans),
lockdep_commit_lock_is_held(trans->ctx.net));
if (!nft_trans_chain_stats(trans))
static_branch_inc(&nft_counters_enabled);

View File

@ -88,7 +88,7 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
struct tcf_chain *goto_chain)
{
a->tcfa_action = action;
rcu_swap_protected(a->goto_chain, goto_chain, 1);
goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1);
return goto_chain;
}
EXPORT_SYMBOL(tcf_action_set_ctrlact);

View File

@ -101,8 +101,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(p->params, params_new,
lockdep_is_held(&p->tcf_lock));
params_new = rcu_replace_pointer(p->params, params_new,
lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
if (goto_ch)

View File

@ -722,7 +722,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
params = rcu_replace_pointer(c->params, params,
lockdep_is_held(&c->tcf_lock));
spin_unlock_bh(&c->tcf_lock);
if (goto_ch)

View File

@ -257,8 +257,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&ci->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
rcu_swap_protected(ci->params, cp_new,
lockdep_is_held(&ci->tcf_lock));
cp_new = rcu_replace_pointer(ci->params, cp_new,
lockdep_is_held(&ci->tcf_lock));
spin_unlock_bh(&ci->tcf_lock);
if (goto_ch)

View File

@ -594,7 +594,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&ife->tcf_lock);
/* protected by tcf_lock when modifying existing action */
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(ife->params, p, 1);
p = rcu_replace_pointer(ife->params, p, 1);
if (exists)
spin_unlock_bh(&ife->tcf_lock);

View File

@ -178,8 +178,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
goto put_chain;
}
mac_header_xmit = dev_is_mac_header_xmit(dev);
rcu_swap_protected(m->tcfm_dev, dev,
lockdep_is_held(&m->tcf_lock));
dev = rcu_replace_pointer(m->tcfm_dev, dev,
lockdep_is_held(&m->tcf_lock));
if (dev)
dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;

View File

@ -258,7 +258,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&m->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
spin_unlock_bh(&m->tcf_lock);
if (goto_ch)

View File

@ -191,9 +191,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
police->tcfp_ptoks = new->tcfp_mtu_ptoks;
spin_unlock_bh(&police->tcfp_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(police->params,
new,
lockdep_is_held(&police->tcf_lock));
new = rcu_replace_pointer(police->params,
new,
lockdep_is_held(&police->tcf_lock));
spin_unlock_bh(&police->tcf_lock);
if (goto_ch)

View File

@ -102,8 +102,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
s->rate = rate;
s->psample_group_num = psample_group_num;
rcu_swap_protected(s->psample_group, psample_group,
lockdep_is_held(&s->tcf_lock));
psample_group = rcu_replace_pointer(s->psample_group, psample_group,
lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;

View File

@ -206,8 +206,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(d->params, params_new,
lockdep_is_held(&d->tcf_lock));
params_new = rcu_replace_pointer(d->params, params_new,
lockdep_is_held(&d->tcf_lock));
spin_unlock_bh(&d->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);

View File

@ -381,8 +381,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&t->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(t->params, params_new,
lockdep_is_held(&t->tcf_lock));
params_new = rcu_replace_pointer(t->params, params_new,
lockdep_is_held(&t->tcf_lock));
spin_unlock_bh(&t->tcf_lock);
tunnel_key_release_params(params_new);
if (goto_ch)

View File

@ -220,7 +220,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
spin_unlock_bh(&v->tcf_lock);
if (goto_ch)

View File

@ -179,8 +179,8 @@ static ssize_t handle_policy_update(struct file *file,
* doesn't currently exist, just use a spinlock for now.
*/
mutex_lock(&policy_update_lock);
rcu_swap_protected(safesetid_setuid_rules, pol,
lockdep_is_held(&policy_update_lock));
pol = rcu_replace_pointer(safesetid_setuid_rules, pol,
lockdep_is_held(&policy_update_lock));
mutex_unlock(&policy_update_lock);
err = len;

View File

@ -27,9 +27,10 @@ Explanation of the Linux-Kernel Memory Consistency Model
19. AND THEN THERE WAS ALPHA
20. THE HAPPENS-BEFORE RELATION: hb
21. THE PROPAGATES-BEFORE RELATION: pb
22. RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-fence, and rb
22. RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-order, rcu-fence, and rb
23. LOCKING
24. ODDS AND ENDS
24. PLAIN ACCESSES AND DATA RACES
25. ODDS AND ENDS
@ -42,8 +43,7 @@ linux-kernel.bell and linux-kernel.cat files that make up the formal
version of the model; they are extremely terse and their meanings are
far from clear.
This document describes the ideas underlying the LKMM, but excluding
the modeling of bare C (or plain) shared memory accesses. It is meant
This document describes the ideas underlying the LKMM. It is meant
for people who want to understand how the model was designed. It does
not go into the details of the code in the .bell and .cat files;
rather, it explains in English what the code expresses symbolically.
@ -206,7 +206,7 @@ goes like this:
P0 stores 1 to buf before storing 1 to flag, since it executes
its instructions in order.
Since an instruction (in this case, P1's store to flag) cannot
Since an instruction (in this case, P0's store to flag) cannot
execute before itself, the specified outcome is impossible.
However, real computer hardware almost never follows the Sequential
@ -419,7 +419,7 @@ example:
The object code might call f(5) either before or after g(6); the
memory model cannot assume there is a fixed program order relation
between them. (In fact, if the functions are inlined then the
between them. (In fact, if the function calls are inlined then the
compiler might even interleave their object code.)
@ -499,7 +499,7 @@ different CPUs (external reads-from, or rfe).
For our purposes, a memory location's initial value is treated as
though it had been written there by an imaginary initial store that
executes on a separate CPU before the program runs.
executes on a separate CPU before the main program runs.
Usage of the rf relation implicitly assumes that loads will always
read from a single store. It doesn't apply properly in the presence
@ -857,7 +857,7 @@ outlined above. These restrictions involve the necessity of
maintaining cache coherence and the fact that a CPU can't operate on a
value before it knows what that value is, among other things.
The formal version of the LKMM is defined by five requirements, or
The formal version of the LKMM is defined by six requirements, or
axioms:
Sequential consistency per variable: This requires that the
@ -877,10 +877,14 @@ axioms:
grace periods obey the rules of RCU, in particular, the
Grace-Period Guarantee.
Plain-coherence: This requires that plain memory accesses
(those not using READ_ONCE(), WRITE_ONCE(), etc.) must obey
the operational model's rules regarding cache coherence.
The first and second are quite common; they can be found in many
memory models (such as those for C11/C++11). The "happens-before" and
"propagation" axioms have analogs in other memory models as well. The
"rcu" axiom is specific to the LKMM.
"rcu" and "plain-coherence" axioms are specific to the LKMM.
Each of these axioms is discussed below.
@ -955,7 +959,7 @@ atomic update. This is what the LKMM's "atomic" axiom says.
THE PRESERVED PROGRAM ORDER RELATION: ppo
-----------------------------------------
There are many situations where a CPU is obligated to execute two
There are many situations where a CPU is obliged to execute two
instructions in program order. We amalgamate them into the ppo (for
"preserved program order") relation, which links the po-earlier
instruction to the po-later instruction and is thus a sub-relation of
@ -1425,8 +1429,8 @@ they execute means that it cannot have cycles. This requirement is
the content of the LKMM's "propagation" axiom.
RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-fence, and rb
-------------------------------------------------------------
RCU RELATIONS: rcu-link, rcu-gp, rcu-rscsi, rcu-order, rcu-fence, and rb
------------------------------------------------------------------------
RCU (Read-Copy-Update) is a powerful synchronization mechanism. It
rests on two concepts: grace periods and read-side critical sections.
@ -1536,29 +1540,29 @@ Z's CPU before Z begins but doesn't propagate to some other CPU until
after X ends.) Similarly, X ->rcu-rscsi Y ->rcu-link Z says that X is
the end of a critical section which starts before Z begins.
The LKMM goes on to define the rcu-fence relation as a sequence of
The LKMM goes on to define the rcu-order relation as a sequence of
rcu-gp and rcu-rscsi links separated by rcu-link links, in which the
number of rcu-gp links is >= the number of rcu-rscsi links. For
example:
X ->rcu-gp Y ->rcu-link Z ->rcu-rscsi T ->rcu-link U ->rcu-gp V
would imply that X ->rcu-fence V, because this sequence contains two
would imply that X ->rcu-order V, because this sequence contains two
rcu-gp links and one rcu-rscsi link. (It also implies that
X ->rcu-fence T and Z ->rcu-fence V.) On the other hand:
X ->rcu-order T and Z ->rcu-order V.) On the other hand:
X ->rcu-rscsi Y ->rcu-link Z ->rcu-rscsi T ->rcu-link U ->rcu-gp V
does not imply X ->rcu-fence V, because the sequence contains only
does not imply X ->rcu-order V, because the sequence contains only
one rcu-gp link but two rcu-rscsi links.
The rcu-fence relation is important because the Grace Period Guarantee
means that rcu-fence acts kind of like a strong fence. In particular,
E ->rcu-fence F implies not only that E begins before F ends, but also
that any write po-before E will propagate to every CPU before any
instruction po-after F can execute. (However, it does not imply that
E must execute before F; in fact, each synchronize_rcu() fence event
is linked to itself by rcu-fence as a degenerate case.)
The rcu-order relation is important because the Grace Period Guarantee
means that rcu-order links act kind of like strong fences. In
particular, E ->rcu-order F implies not only that E begins before F
ends, but also that any write po-before E will propagate to every CPU
before any instruction po-after F can execute. (However, it does not
imply that E must execute before F; in fact, each synchronize_rcu()
fence event is linked to itself by rcu-order as a degenerate case.)
To prove this in full generality requires some intellectual effort.
We'll consider just a very simple case:
@ -1572,7 +1576,7 @@ and there are events X, Y and a read-side critical section C such that:
2. X comes "before" Y in some sense (including rfe, co and fr);
2. Y is po-before Z;
3. Y is po-before Z;
4. Z is the rcu_read_unlock() event marking the end of C;
@ -1585,7 +1589,26 @@ G's CPU before G starts must propagate to every CPU before C starts.
In particular, the write propagates to every CPU before F finishes
executing and hence before any instruction po-after F can execute.
This sort of reasoning can be extended to handle all the situations
covered by rcu-fence.
covered by rcu-order.
The rcu-fence relation is a simple extension of rcu-order. While
rcu-order only links certain fence events (calls to synchronize_rcu(),
rcu_read_lock(), or rcu_read_unlock()), rcu-fence links any events
that are separated by an rcu-order link. This is analogous to the way
the strong-fence relation links events that are separated by an
smp_mb() fence event (as mentioned above, rcu-order links act kind of
like strong fences). Written symbolically, X ->rcu-fence Y means
there are fence events E and F such that:
X ->po E ->rcu-order F ->po Y.
From the discussion above, we see this implies not only that X
executes before Y, but also (if X is a store) that X propagates to
every CPU before Y executes. Thus rcu-fence is sort of a
"super-strong" fence: Unlike the original strong fences (smp_mb() and
synchronize_rcu()), rcu-fence is able to link events on different
CPUs. (Perhaps this fact should lead us to say that rcu-fence isn't
really a fence at all!)
Finally, the LKMM defines the RCU-before (rb) relation in terms of
rcu-fence. This is done in essentially the same way as the pb
@ -1596,7 +1619,7 @@ before F, just as E ->pb F does (and for much the same reasons).
Putting this all together, the LKMM expresses the Grace Period
Guarantee by requiring that the rb relation does not contain a cycle.
Equivalently, this "rcu" axiom requires that there are no events E
and F with E ->rcu-link F ->rcu-fence E. Or to put it a third way,
and F with E ->rcu-link F ->rcu-order E. Or to put it a third way,
the axiom requires that there are no cycles consisting of rcu-gp and
rcu-rscsi alternating with rcu-link, where the number of rcu-gp links
is >= the number of rcu-rscsi links.
@ -1750,7 +1773,7 @@ addition to normal RCU. The ideas involved are much the same as
above, with new relations srcu-gp and srcu-rscsi added to represent
SRCU grace periods and read-side critical sections. There is a
restriction on the srcu-gp and srcu-rscsi links that can appear in an
rcu-fence sequence (the srcu-rscsi links must be paired with srcu-gp
rcu-order sequence (the srcu-rscsi links must be paired with srcu-gp
links having the same SRCU domain with proper nesting); the details
are relatively unimportant.
@ -1896,6 +1919,521 @@ architectures supported by the Linux kernel, albeit for various
differing reasons.
PLAIN ACCESSES AND DATA RACES
-----------------------------
In the LKMM, memory accesses such as READ_ONCE(x), atomic_inc(&y),
smp_load_acquire(&z), and so on are collectively referred to as
"marked" accesses, because they are all annotated with special
operations of one kind or another. Ordinary C-language memory
accesses such as x or y = 0 are simply called "plain" accesses.
Early versions of the LKMM had nothing to say about plain accesses.
The C standard allows compilers to assume that the variables affected
by plain accesses are not concurrently read or written by any other
threads or CPUs. This leaves compilers free to implement all manner
of transformations or optimizations of code containing plain accesses,
making such code very difficult for a memory model to handle.
Here is just one example of a possible pitfall:
int a = 6;
int *x = &a;
P0()
{
int *r1;
int r2 = 0;
r1 = x;
if (r1 != NULL)
r2 = READ_ONCE(*r1);
}
P1()
{
WRITE_ONCE(x, NULL);
}
On the face of it, one would expect that when this code runs, the only
possible final values for r2 are 6 and 0, depending on whether or not
P1's store to x propagates to P0 before P0's load from x executes.
But since P0's load from x is a plain access, the compiler may decide
to carry out the load twice (for the comparison against NULL, then again
for the READ_ONCE()) and eliminate the temporary variable r1. The
object code generated for P0 could therefore end up looking rather
like this:
P0()
{
int r2 = 0;
if (x != NULL)
r2 = READ_ONCE(*x);
}
And now it is obvious that this code runs the risk of dereferencing a
NULL pointer, because P1's store to x might propagate to P0 after the
test against NULL has been made but before the READ_ONCE() executes.
If the original code had said "r1 = READ_ONCE(x)" instead of "r1 = x",
the compiler would not have performed this optimization and there
would be no possibility of a NULL-pointer dereference.
Given the possibility of transformations like this one, the LKMM
doesn't try to predict all possible outcomes of code containing plain
accesses. It is instead content to determine whether the code
violates the compiler's assumptions, which would render the ultimate
outcome undefined.
In technical terms, the compiler is allowed to assume that when the
program executes, there will not be any data races. A "data race"
occurs when two conflicting memory accesses execute concurrently;
two memory accesses "conflict" if:
they access the same location,
they occur on different CPUs (or in different threads on the
same CPU),
at least one of them is a plain access,
and at least one of them is a store.
The LKMM tries to determine whether a program contains two conflicting
accesses which may execute concurrently; if it does then the LKMM says
there is a potential data race and makes no predictions about the
program's outcome.
Determining whether two accesses conflict is easy; you can see that
all the concepts involved in the definition above are already part of
the memory model. The hard part is telling whether they may execute
concurrently. The LKMM takes a conservative attitude, assuming that
accesses may be concurrent unless it can prove they cannot.
If two memory accesses aren't concurrent then one must execute before
the other. Therefore the LKMM decides two accesses aren't concurrent
if they can be connected by a sequence of hb, pb, and rb links
(together referred to as xb, for "executes before"). However, there
are two complicating factors.
If X is a load and X executes before a store Y, then indeed there is
no danger of X and Y being concurrent. After all, Y can't have any
effect on the value obtained by X until the memory subsystem has
propagated Y from its own CPU to X's CPU, which won't happen until
some time after Y executes and thus after X executes. But if X is a
store, then even if X executes before Y it is still possible that X
will propagate to Y's CPU just as Y is executing. In such a case X
could very well interfere somehow with Y, and we would have to
consider X and Y to be concurrent.
Therefore when X is a store, for X and Y to be non-concurrent the LKMM
requires not only that X must execute before Y but also that X must
propagate to Y's CPU before Y executes. (Or vice versa, of course, if
Y executes before X -- then Y must propagate to X's CPU before X
executes if Y is a store.) This is expressed by the visibility
relation (vis), where X ->vis Y is defined to hold if there is an
intermediate event Z such that:
X is connected to Z by a possibly empty sequence of
cumul-fence links followed by an optional rfe link (if none of
these links are present, X and Z are the same event),
and either:
Z is connected to Y by a strong-fence link followed by a
possibly empty sequence of xb links,
or:
Z is on the same CPU as Y and is connected to Y by a possibly
empty sequence of xb links (again, if the sequence is empty it
means Z and Y are the same event).
The motivations behind this definition are straightforward:
cumul-fence memory barriers force stores that are po-before
the barrier to propagate to other CPUs before stores that are
po-after the barrier.
An rfe link from an event W to an event R says that R reads
from W, which certainly means that W must have propagated to
R's CPU before R executed.
strong-fence memory barriers force stores that are po-before
the barrier, or that propagate to the barrier's CPU before the
barrier executes, to propagate to all CPUs before any events
po-after the barrier can execute.
To see how this works out in practice, consider our old friend, the MP
pattern (with fences and statement labels, but without the conditional
test):
int buf = 0, flag = 0;
P0()
{
X: WRITE_ONCE(buf, 1);
smp_wmb();
W: WRITE_ONCE(flag, 1);
}
P1()
{
int r1;
int r2 = 0;
Z: r1 = READ_ONCE(flag);
smp_rmb();
Y: r2 = READ_ONCE(buf);
}
The smp_wmb() memory barrier gives a cumul-fence link from X to W, and
assuming r1 = 1 at the end, there is an rfe link from W to Z. This
means that the store to buf must propagate from P0 to P1 before Z
executes. Next, Z and Y are on the same CPU and the smp_rmb() fence
provides an xb link from Z to Y (i.e., it forces Z to execute before
Y). Therefore we have X ->vis Y: X must propagate to Y's CPU before Y
executes.
The second complicating factor mentioned above arises from the fact
that when we are considering data races, some of the memory accesses
are plain. Now, although we have not said so explicitly, up to this
point most of the relations defined by the LKMM (ppo, hb, prop,
cumul-fence, pb, and so on -- including vis) apply only to marked
accesses.
There are good reasons for this restriction. The compiler is not
allowed to apply fancy transformations to marked accesses, and
consequently each such access in the source code corresponds more or
less directly to a single machine instruction in the object code. But
plain accesses are a different story; the compiler may combine them,
split them up, duplicate them, eliminate them, invent new ones, and
who knows what else. Seeing a plain access in the source code tells
you almost nothing about what machine instructions will end up in the
object code.
Fortunately, the compiler isn't completely free; it is subject to some
limitations. For one, it is not allowed to introduce a data race into
the object code if the source code does not already contain a data
race (if it could, memory models would be useless and no multithreaded
code would be safe!). For another, it cannot move a plain access past
a compiler barrier.
A compiler barrier is a kind of fence, but as the name implies, it
only affects the compiler; it does not necessarily have any effect on
how instructions are executed by the CPU. In Linux kernel source
code, the barrier() function is a compiler barrier. It doesn't give
rise directly to any machine instructions in the object code; rather,
it affects how the compiler generates the rest of the object code.
Given source code like this:
... some memory accesses ...
barrier();
... some other memory accesses ...
the barrier() function ensures that the machine instructions
corresponding to the first group of accesses will all end po-before
any machine instructions corresponding to the second group of accesses
-- even if some of the accesses are plain. (Of course, the CPU may
then execute some of those accesses out of program order, but we
already know how to deal with such issues.) Without the barrier()
there would be no such guarantee; the two groups of accesses could be
intermingled or even reversed in the object code.
The LKMM doesn't say much about the barrier() function, but it does
require that all fences are also compiler barriers. In addition, it
requires that the ordering properties of memory barriers such as
smp_rmb() or smp_store_release() apply to plain accesses as well as to
marked accesses.
This is the key to analyzing data races. Consider the MP pattern
again, now using plain accesses for buf:
int buf = 0, flag = 0;
P0()
{
U: buf = 1;
smp_wmb();
X: WRITE_ONCE(flag, 1);
}
P1()
{
int r1;
int r2 = 0;
Y: r1 = READ_ONCE(flag);
if (r1) {
smp_rmb();
V: r2 = buf;
}
}
This program does not contain a data race. Although the U and V
accesses conflict, the LKMM can prove they are not concurrent as
follows:
The smp_wmb() fence in P0 is both a compiler barrier and a
cumul-fence. It guarantees that no matter what hash of
machine instructions the compiler generates for the plain
access U, all those instructions will be po-before the fence.
Consequently U's store to buf, no matter how it is carried out
at the machine level, must propagate to P1 before X's store to
flag does.
X and Y are both marked accesses. Hence an rfe link from X to
Y is a valid indicator that X propagated to P1 before Y
executed, i.e., X ->vis Y. (And if there is no rfe link then
r1 will be 0, so V will not be executed and ipso facto won't
race with U.)
The smp_rmb() fence in P1 is a compiler barrier as well as a
fence. It guarantees that all the machine-level instructions
corresponding to the access V will be po-after the fence, and
therefore any loads among those instructions will execute
after the fence does and hence after Y does.
Thus U's store to buf is forced to propagate to P1 before V's load
executes (assuming V does execute), ruling out the possibility of a
data race between them.
This analysis illustrates how the LKMM deals with plain accesses in
general. Suppose R is a plain load and we want to show that R
executes before some marked access E. We can do this by finding a
marked access X such that R and X are ordered by a suitable fence and
X ->xb* E. If E was also a plain access, we would also look for a
marked access Y such that X ->xb* Y, and Y and E are ordered by a
fence. We describe this arrangement by saying that R is
"post-bounded" by X and E is "pre-bounded" by Y.
In fact, we go one step further: Since R is a read, we say that R is
"r-post-bounded" by X. Similarly, E would be "r-pre-bounded" or
"w-pre-bounded" by Y, depending on whether E was a store or a load.
This distinction is needed because some fences affect only loads
(i.e., smp_rmb()) and some affect only stores (smp_wmb()); otherwise
the two types of bounds are the same. And as a degenerate case, we
say that a marked access pre-bounds and post-bounds itself (e.g., if R
above were a marked load then X could simply be taken to be R itself.)
The need to distinguish between r- and w-bounding raises yet another
issue. When the source code contains a plain store, the compiler is
allowed to put plain loads of the same location into the object code.
For example, given the source code:
x = 1;
the compiler is theoretically allowed to generate object code that
looks like:
if (x != 1)
x = 1;
thereby adding a load (and possibly replacing the store entirely).
For this reason, whenever the LKMM requires a plain store to be
w-pre-bounded or w-post-bounded by a marked access, it also requires
the store to be r-pre-bounded or r-post-bounded, so as to handle cases
where the compiler adds a load.
(This may be overly cautious. We don't know of any examples where a
compiler has augmented a store with a load in this fashion, and the
Linux kernel developers would probably fight pretty hard to change a
compiler if it ever did this. Still, better safe than sorry.)
Incidentally, the other tranformation -- augmenting a plain load by
adding in a store to the same location -- is not allowed. This is
because the compiler cannot know whether any other CPUs might perform
a concurrent load from that location. Two concurrent loads don't
constitute a race (they can't interfere with each other), but a store
does race with a concurrent load. Thus adding a store might create a
data race where one was not already present in the source code,
something the compiler is forbidden to do. Augmenting a store with a
load, on the other hand, is acceptable because doing so won't create a
data race unless one already existed.
The LKMM includes a second way to pre-bound plain accesses, in
addition to fences: an address dependency from a marked load. That
is, in the sequence:
p = READ_ONCE(ptr);
r = *p;
the LKMM says that the marked load of ptr pre-bounds the plain load of
*p; the marked load must execute before any of the machine
instructions corresponding to the plain load. This is a reasonable
stipulation, since after all, the CPU can't perform the load of *p
until it knows what value p will hold. Furthermore, without some
assumption like this one, some usages typical of RCU would count as
data races. For example:
int a = 1, b;
int *ptr = &a;
P0()
{
b = 2;
rcu_assign_pointer(ptr, &b);
}
P1()
{
int *p;
int r;
rcu_read_lock();
p = rcu_dereference(ptr);
r = *p;
rcu_read_unlock();
}
(In this example the rcu_read_lock() and rcu_read_unlock() calls don't
really do anything, because there aren't any grace periods. They are
included merely for the sake of good form; typically P0 would call
synchronize_rcu() somewhere after the rcu_assign_pointer().)
rcu_assign_pointer() performs a store-release, so the plain store to b
is definitely w-post-bounded before the store to ptr, and the two
stores will propagate to P1 in that order. However, rcu_dereference()
is only equivalent to READ_ONCE(). While it is a marked access, it is
not a fence or compiler barrier. Hence the only guarantee we have
that the load of ptr in P1 is r-pre-bounded before the load of *p
(thus avoiding a race) is the assumption about address dependencies.
This is a situation where the compiler can undermine the memory model,
and a certain amount of care is required when programming constructs
like this one. In particular, comparisons between the pointer and
other known addresses can cause trouble. If you have something like:
p = rcu_dereference(ptr);
if (p == &x)
r = *p;
then the compiler just might generate object code resembling:
p = rcu_dereference(ptr);
if (p == &x)
r = x;
or even:
rtemp = x;
p = rcu_dereference(ptr);
if (p == &x)
r = rtemp;
which would invalidate the memory model's assumption, since the CPU
could now perform the load of x before the load of ptr (there might be
a control dependency but no address dependency at the machine level).
Finally, it turns out there is a situation in which a plain write does
not need to be w-post-bounded: when it is separated from the
conflicting access by a fence. At first glance this may seem
impossible. After all, to be conflicting the second access has to be
on a different CPU from the first, and fences don't link events on
different CPUs. Well, normal fences don't -- but rcu-fence can!
Here's an example:
int x, y;
P0()
{
WRITE_ONCE(x, 1);
synchronize_rcu();
y = 3;
}
P1()
{
rcu_read_lock();
if (READ_ONCE(x) == 0)
y = 2;
rcu_read_unlock();
}
Do the plain stores to y race? Clearly not if P1 reads a non-zero
value for x, so let's assume the READ_ONCE(x) does obtain 0. This
means that the read-side critical section in P1 must finish executing
before the grace period in P0 does, because RCU's Grace-Period
Guarantee says that otherwise P0's store to x would have propagated to
P1 before the critical section started and so would have been visible
to the READ_ONCE(). (Another way of putting it is that the fre link
from the READ_ONCE() to the WRITE_ONCE() gives rise to an rcu-link
between those two events.)
This means there is an rcu-fence link from P1's "y = 2" store to P0's
"y = 3" store, and consequently the first must propagate from P1 to P0
before the second can execute. Therefore the two stores cannot be
concurrent and there is no race, even though P1's plain store to y
isn't w-post-bounded by any marked accesses.
Putting all this material together yields the following picture. For
two conflicting stores W and W', where W ->co W', the LKMM says the
stores don't race if W can be linked to W' by a
w-post-bounded ; vis ; w-pre-bounded
sequence. If W is plain then they also have to be linked by an
r-post-bounded ; xb* ; w-pre-bounded
sequence, and if W' is plain then they also have to be linked by a
w-post-bounded ; vis ; r-pre-bounded
sequence. For a conflicting load R and store W, the LKMM says the two
accesses don't race if R can be linked to W by an
r-post-bounded ; xb* ; w-pre-bounded
sequence or if W can be linked to R by a
w-post-bounded ; vis ; r-pre-bounded
sequence. For the cases involving a vis link, the LKMM also accepts
sequences in which W is linked to W' or R by a
strong-fence ; xb* ; {w and/or r}-pre-bounded
sequence with no post-bounding, and in every case the LKMM also allows
the link simply to be a fence with no bounding at all. If no sequence
of the appropriate sort exists, the LKMM says that the accesses race.
There is one more part of the LKMM related to plain accesses (although
not to data races) we should discuss. Recall that many relations such
as hb are limited to marked accesses only. As a result, the
happens-before, propagates-before, and rcu axioms (which state that
various relation must not contain a cycle) doesn't apply to plain
accesses. Nevertheless, we do want to rule out such cycles, because
they don't make sense even for plain accesses.
To this end, the LKMM imposes three extra restrictions, together
called the "plain-coherence" axiom because of their resemblance to the
rules used by the operational model to ensure cache coherence (that
is, the rules governing the memory subsystem's choice of a store to
satisfy a load request and its determination of where a store will
fall in the coherence order):
If R and W conflict and it is possible to link R to W by one
of the xb* sequences listed above, then W ->rfe R is not
allowed (i.e., a load cannot read from a store that it
executes before, even if one or both is plain).
If W and R conflict and it is possible to link W to R by one
of the vis sequences listed above, then R ->fre W is not
allowed (i.e., if a store is visible to a load then the load
must read from that store or one coherence-after it).
If W and W' conflict and it is possible to link W to W' by one
of the vis sequences listed above, then W' ->co W is not
allowed (i.e., if one store is visible to a second then the
second must come after the first in the coherence order).
This is the extent to which the LKMM deals with plain accesses.
Perhaps it could say more (for example, plain accesses might
contribute to the ppo relation), but at the moment it seems that this
minimal, conservative approach is good enough.
ODDS AND ENDS
-------------
@ -1943,6 +2481,16 @@ treated as READ_ONCE() and rcu_assign_pointer() is treated as
smp_store_release() -- which is basically how the Linux kernel treats
them.
Although we said that plain accesses are not linked by the ppo
relation, they do contribute to it indirectly. Namely, when there is
an address dependency from a marked load R to a plain store W,
followed by smp_wmb() and then a marked store W', the LKMM creates a
ppo link from R to W'. The reasoning behind this is perhaps a little
shaky, but essentially it says there is no way to generate object code
for this source code in which W' could execute before R. Just as with
pre-bounding by address dependencies, it is possible for the compiler
to undermine this relation if sufficient care is not taken.
There are a few oddball fences which need special treatment:
smp_mb__before_atomic(), smp_mb__after_atomic(), and
smp_mb__after_spinlock(). The LKMM uses fence events with special

View File

@ -197,7 +197,7 @@ empty (wr-incoh | rw-incoh | ww-incoh) as plain-coherence
(* Actual races *)
let ww-nonrace = ww-vis & ((Marked * W) | rw-xbstar) & ((W * Marked) | wr-vis)
let ww-race = (pre-race & co) \ ww-nonrace
let wr-race = (pre-race & (co? ; rf)) \ wr-vis
let wr-race = (pre-race & (co? ; rf)) \ wr-vis \ rw-xbstar^-1
let rw-race = (pre-race & fr) \ rw-xbstar
flag ~empty (ww-race | wr-race | rw-race) as data-race

View File

@ -1,8 +1,5 @@
CONFIG_SMP=y
CONFIG_NR_CPUS=2
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_PREEMPT_NONE=n
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=y

View File

@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_RCU_TRACE=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=3
CONFIG_RCU_FANOUT_LEAF=3
CONFIG_RCU_NOCB_CPU=n

View File

@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=y
CONFIG_RCU_FAST_NO_HZ=y
CONFIG_RCU_TRACE=y
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=4
CONFIG_RCU_FANOUT_LEAF=3
CONFIG_DEBUG_LOCK_ALLOC=n

View File

@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_RCU_TRACE=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=6
CONFIG_RCU_FANOUT_LEAF=6
CONFIG_RCU_NOCB_CPU=n

View File

@ -9,9 +9,6 @@ CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_RCU_TRACE=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=3
CONFIG_RCU_FANOUT_LEAF=2
CONFIG_RCU_NOCB_CPU=y

View File

@ -8,9 +8,6 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_TRACE=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_RCU_BOOST=n

View File

@ -6,9 +6,6 @@ CONFIG_PREEMPT=n
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y

View File

@ -6,7 +6,6 @@ Kconfig Parameters:
CONFIG_DEBUG_LOCK_ALLOC -- Do three, covering CONFIG_PROVE_LOCKING & not.
CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one.
CONFIG_HOTPLUG_CPU -- Do half. (Every second.)
CONFIG_HZ_PERIODIC -- Do one.
CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.