From 215e262f2aeba378aa192da07c30770f9925a4bf Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Fri, 31 May 2013 15:26:45 -0700
Subject: [PATCH 1/8] percpu: implement generic percpu refcounting

This implements a refcount with similar semantics to
atomic_inc()/atomic_dec_and_test() - but percpu.

It also implements two stage shutdown, as we need it to tear down the
percpu counts. Before dropping the initial refcount, you must call
percpu_ref_kill(); this puts the refcount in "shutting down mode" and
switches back to a single atomic refcount with the appropriate barriers
(synchronize_rcu()).

It's also safe to call percpu_ref_kill() multiple times - only the
first call takes effect and later calls trigger a warning, so callers
don't have to reimplement shutdown synchronization.

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: coding-style tweak]
Signed-off-by: Kent Overstreet
Cc: Zach Brown
Cc: Felipe Balbi
Cc: Greg Kroah-Hartman
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Rusty Russell
Cc: Jens Axboe
Cc: Asai Thambi S P
Cc: Selvan Mani
Cc: Sam Bradshaw
Cc: Jeff Moyer
Cc: Al Viro
Cc: Benjamin LaHaise
Cc: Tejun Heo
Cc: Oleg Nesterov
Cc: Christoph Lameter
Cc: Ingo Molnar
Reviewed-by: "Theodore Ts'o"
Signed-off-by: Tejun Heo
---
 include/linux/percpu-refcount.h | 122 ++++++++++++++++++++++++++++++
 lib/Makefile                    |   2 +-
 lib/percpu-refcount.c           | 128 ++++++++++++++++++++++++++++++++
 3 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/percpu-refcount.h
 create mode 100644 lib/percpu-refcount.c

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 000000000000..24b31ef15932
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,122 @@
+/*
+ * Percpu refcounts:
+ * (C) 2012 Google, Inc.
+ * Author: Kent Overstreet
+ *
+ * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
+ * atomic_dec_and_test() - but percpu.
+ *
+ * There's one important difference between percpu refs and normal atomic_t
+ * refcounts; you have to keep track of your initial refcount, and then when you
+ * start shutting down you call percpu_ref_kill() _before_ dropping the initial
+ * refcount.
+ *
+ * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
+ * than an atomic_t - this is because of the way shutdown works, see
+ * percpu_ref_kill()/PCPU_COUNT_BIAS.
+ *
+ * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
+ * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
+ * puts the ref back in single atomic_t mode, collecting the per cpu refs and
+ * issuing the appropriate barriers, and then marks the ref as shutting down so
+ * that percpu_ref_put() will check for the ref hitting 0. After it returns,
+ * it's safe to drop the initial ref.
+ *
+ * USAGE:
+ *
+ * See fs/aio.c for some example usage; it's used there for struct kioctx, which
+ * is created when userspace calls io_setup(), and destroyed when userspace
+ * calls io_destroy() or the process exits.
+ *
+ * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
+ * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
+ * the kioctx from the process's list of kioctxs - after that, there can't be
+ * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
+ * the initial ref with percpu_ref_put().
+ *
+ * Code that does a two stage shutdown like this often needs some kind of
+ * explicit synchronization to ensure the initial refcount can only be dropped
+ * once - percpu_ref_kill() flags this for you, warning if it's called more
+ * than once. The aio code relies on this, but it's not necessary if the code
+ * has some other mechanism to synchronize teardown.
+ */
+
+#ifndef _LINUX_PERCPU_REFCOUNT_H
+#define _LINUX_PERCPU_REFCOUNT_H
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+struct percpu_ref;
+typedef void (percpu_ref_release)(struct percpu_ref *);
+
+struct percpu_ref {
+	atomic_t		count;
+	/*
+	 * The low bit of the pointer indicates whether the ref is in percpu
+	 * mode; if set, then get/put will manipulate the atomic_t (this is a
+	 * hack because we need to keep the pointer around for
+	 * percpu_ref_kill_rcu())
+	 */
+	unsigned __percpu	*pcpu_count;
+	percpu_ref_release	*release;
+	struct rcu_head		rcu;
+};
+
+int percpu_ref_init(struct percpu_ref *, percpu_ref_release *);
+void percpu_ref_kill(struct percpu_ref *ref);
+
+#define PCPU_STATUS_BITS	2
+#define PCPU_STATUS_MASK	((1 << PCPU_STATUS_BITS) - 1)
+#define PCPU_REF_PTR		0
+#define PCPU_REF_DEAD		1
+
+#define REF_STATUS(count)	(((unsigned long) count) & PCPU_STATUS_MASK)
+
+/**
+ * percpu_ref_get - increment a percpu refcount
+ *
+ * Analogous to atomic_inc().
+ */
+static inline void percpu_ref_get(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count;
+
+	preempt_disable();
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
+		__this_cpu_inc(*pcpu_count);
+	else
+		atomic_inc(&ref->count);
+
+	preempt_enable();
+}
+
+/**
+ * percpu_ref_put - decrement a percpu refcount
+ *
+ * Decrement the refcount, and if 0, call the release function (which was passed
+ * to percpu_ref_init())
+ */
+static inline void percpu_ref_put(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count;
+
+	preempt_disable();
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
+		__this_cpu_dec(*pcpu_count);
+	else if (unlikely(atomic_dec_and_test(&ref->count)))
+		ref->release(ref);
+
+	preempt_enable();
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index c55a037a354e..386db4bbc265 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o
+	 earlycpio.o percpu-refcount.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
new file mode 100644
index 000000000000..6f0ffd702a09
--- /dev/null
+++ b/lib/percpu-refcount.c
@@ -0,0 +1,128 @@
+#define pr_fmt(fmt) "%s: " fmt "\n", __func__
+
+#include <linux/kernel.h>
+#include <linux/percpu-refcount.h>
+
+/*
+ * Initially, a percpu refcount is just a set of percpu counters; we don't try
+ * to detect the ref hitting 0, which means that get/put can just increment or
+ * decrement the local counter.
+ * Note that the counter on a particular cpu can (and will) wrap - this is
+ * fine; when we go to shut down, the percpu counters will all sum to the
+ * correct value.
+ *
+ * (More precisely: because modular arithmetic is commutative, the sum of all
+ * the pcpu_count vars will be equal to what it would have been if all the gets
+ * and puts were done to a single integer, even if some of the percpu integers
+ * overflow or underflow).
+ *
+ * The real trick to implementing percpu refcounts is shutdown. We can't detect
+ * the ref hitting 0 on every put - this would require global synchronization
+ * and defeat the whole purpose of using percpu refs.
+ *
+ * What we do is require the user to keep track of the initial refcount; we know
+ * the ref can't hit 0 before the user drops the initial ref, so as long as we
+ * convert to non percpu mode before the initial ref is dropped everything
+ * works.
+ *
+ * Converting to non percpu mode is done with some RCUish stuff in
+ * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t
+ * can't hit 0 before we've added up all the percpu refs.
+ */
+
+#define PCPU_COUNT_BIAS		(1U << 31)
+
+/**
+ * percpu_ref_init - initialize a percpu refcount
+ * @ref: ref to initialize
+ * @release: function which will be called when refcount hits 0
+ *
+ * Initializes the refcount in single atomic counter mode with a refcount of 1;
+ * analogous to atomic_set(ref, 1).
+ *
+ * Note that @release must not sleep - it may potentially be called from RCU
+ * callback context by percpu_ref_kill().
+ */
+int percpu_ref_init(struct percpu_ref *ref, percpu_ref_release *release)
+{
+	atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS);
+
+	ref->pcpu_count = alloc_percpu(unsigned);
+	if (!ref->pcpu_count)
+		return -ENOMEM;
+
+	ref->release = release;
+	return 0;
+}
+
+static void percpu_ref_kill_rcu(struct rcu_head *rcu)
+{
+	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
+	unsigned __percpu *pcpu_count;
+	unsigned count = 0;
+	int cpu;
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	/* Mask out PCPU_REF_DEAD */
+	pcpu_count = (unsigned __percpu *)
+		(((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK);
+
+	for_each_possible_cpu(cpu)
+		count += *per_cpu_ptr(pcpu_count, cpu);
+
+	free_percpu(pcpu_count);
+
+	pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count);
+
+	/*
+	 * It's crucial that we sum the percpu counters _before_ adding the sum
+	 * to &ref->count; since gets could be happening on one cpu while puts
+	 * happen on another, adding a single cpu's count could cause
+	 * @ref->count to hit 0 before we've got a consistent value - but the
+	 * sum of all the counts will be consistent and correct.
+	 *
+	 * Subtracting the bias value then has to happen _after_ adding count to
+	 * &ref->count; we need the bias value to prevent &ref->count from
+	 * reaching 0 before we add the percpu counts. But doing it at the same
+	 * time is equivalent and saves us atomic operations:
+	 */
+
+	atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);
+
+	/*
+	 * Now we're in single atomic_t mode with a consistent refcount, so it's
+	 * safe to drop our initial ref:
+	 */
+	percpu_ref_put(ref);
+}
+
+/**
+ * percpu_ref_kill - safely drop initial ref
+ *
+ * Must be used to drop the initial ref on a percpu refcount; must be called
+ * precisely once before shutdown.
+ *
+ * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
+ * percpu counters and dropping the initial ref.
+ */
+void percpu_ref_kill(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count, *old, *new;
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	do {
+		if (REF_STATUS(pcpu_count) == PCPU_REF_DEAD) {
+			WARN(1, "percpu_ref_kill() called more than once!\n");
+			return;
+		}
+
+		old = pcpu_count;
+		new = (unsigned __percpu *)
+			(((unsigned long) pcpu_count)|PCPU_REF_DEAD);
+
+		pcpu_count = cmpxchg(&ref->pcpu_count, old, new);
+	} while (pcpu_count != old);
+
+	call_rcu(&ref->rcu, percpu_ref_kill_rcu);
+}

From c1ae6e9b4db00023b9caed72af49a93abad46452 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 3 Jun 2013 16:02:29 -0700
Subject: [PATCH 2/8] percpu-refcount: Don't use silly cmpxchg()

The cmpxchg() was just to ensure the debug check didn't race, which was
a bit excessive. The caller is supposed to do the appropriate
synchronization, which means percpu_ref_kill() can just do a simple
store.

Signed-off-by: Kent Overstreet
Signed-off-by: Tejun Heo
---
 lib/percpu-refcount.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 6f0ffd702a09..1a17399fc7db 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -107,22 +107,11 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
  */
 void percpu_ref_kill(struct percpu_ref *ref)
 {
-	unsigned __percpu *pcpu_count, *old, *new;
-
-	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
-
-	do {
-		if (REF_STATUS(pcpu_count) == PCPU_REF_DEAD) {
-			WARN(1, "percpu_ref_kill() called more than once!\n");
-			return;
-		}
-
-		old = pcpu_count;
-		new = (unsigned __percpu *)
-			(((unsigned long) pcpu_count)|PCPU_REF_DEAD);
-
-		pcpu_count = cmpxchg(&ref->pcpu_count, old, new);
-	} while (pcpu_count != old);
+	WARN_ONCE(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD,
+		  "percpu_ref_kill() called more than once!\n");
+
+	ref->pcpu_count = (unsigned __percpu *)
+		(((unsigned long) ref->pcpu_count)|PCPU_REF_DEAD);
 
 	call_rcu(&ref->rcu, percpu_ref_kill_rcu);
 }

From 6a24474da83ea7c8b7d32f05f858b1259994067a Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 12 Jun 2013 20:43:06 -0700
Subject: [PATCH 3/8] percpu-refcount: consistently use plain (non-sched) RCU

percpu_ref_get/put() are using preempt_disable/enable() while
percpu_ref_kill() is using plain call_rcu() instead of
call_rcu_sched(). This is buggy as the grace periods of the two may
not match. Fix it by using plain RCU in percpu_ref_get/put().

(I suggested using sched RCU in the first place but there's no actual
benefit in doing so unless we're gonna introduce different variants of
get/put to be called while preemption is already disabled, which we
definitely shouldn't.)
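To make the lifecycle concrete, here is a minimal sketch of a percpu_ref
user as the API stands after these three patches; struct my_obj and its
helpers are hypothetical, only the percpu_ref calls come from the series:

/* Illustrative sketch only - struct my_obj and its helpers are made up. */
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_obj {
	struct percpu_ref	ref;
	/* ... */
};

/* May run from RCU callback context via percpu_ref_kill(): must not sleep. */
static void my_obj_release(struct percpu_ref *ref)
{
	kfree(container_of(ref, struct my_obj, ref));
}

static struct my_obj *my_obj_create(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;
	if (percpu_ref_init(&obj->ref, my_obj_release)) {
		kfree(obj);
		return NULL;
	}
	return obj;			/* returned holding the initial ref */
}

static void my_obj_access(struct my_obj *obj)
{
	percpu_ref_get(&obj->ref);	/* percpu fast path, no shared cacheline */
	/* ... use obj ... */
	percpu_ref_put(&obj->ref);
}

static void my_obj_shutdown(struct my_obj *obj)
{
	/*
	 * Switches @obj->ref to atomic mode and, after a grace period has
	 * collected the percpu counts, drops the initial ref;
	 * my_obj_release() runs when the count hits 0.
	 */
	percpu_ref_kill(&obj->ref);
}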
Signed-off-by: Tejun Heo
Reported-by: Rusty Russell
Acked-by: Kent Overstreet
---
 include/linux/percpu-refcount.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 24b31ef15932..abe141172d96 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -85,7 +85,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count;
 
-	preempt_disable();
+	rcu_read_lock();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -94,7 +94,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 	else
 		atomic_inc(&ref->count);
 
-	preempt_enable();
+	rcu_read_unlock();
 }
 
 /**
@@ -107,7 +107,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count;
 
-	preempt_disable();
+	rcu_read_lock();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -116,7 +116,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
 	else if (unlikely(atomic_dec_and_test(&ref->count)))
 		ref->release(ref);
 
-	preempt_enable();
+	rcu_read_unlock();
 }
 
 #endif

From ac899061a93250c28562f05ad94d5c74603415bc Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 12 Jun 2013 20:43:06 -0700
Subject: [PATCH 4/8] percpu-refcount: cosmetic updates

* s/percpu_ref_release/percpu_ref_func_t/ as it's customary to have a
  _t suffix for types and the type is gonna be used for a different
  type of callback too.

* Add @ARG to function comments.

* Drop unnecessary and unaligned indentation from percpu_ref_init()
  function comment.

Signed-off-by: Tejun Heo
Acked-by: Kent Overstreet
---
 include/linux/percpu-refcount.h | 8 +++++---
 lib/percpu-refcount.c           | 7 ++++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index abe141172d96..b61bd6f23985 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -51,7 +51,7 @@
 #include <linux/rcupdate.h>
 
 struct percpu_ref;
-typedef void (percpu_ref_release)(struct percpu_ref *);
+typedef void (percpu_ref_func_t)(struct percpu_ref *);
 
 struct percpu_ref {
 	atomic_t		count;
@@ -62,11 +62,11 @@ struct percpu_ref {
 	 * percpu_ref_kill_rcu())
 	 */
 	unsigned __percpu	*pcpu_count;
-	percpu_ref_release	*release;
+	percpu_ref_func_t	*release;
 	struct rcu_head		rcu;
 };
 
-int percpu_ref_init(struct percpu_ref *, percpu_ref_release *);
+int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release);
 void percpu_ref_kill(struct percpu_ref *ref);
 
 #define PCPU_STATUS_BITS	2
@@ -78,6 +78,7 @@ void percpu_ref_kill(struct percpu_ref *ref);
 
 /**
  * percpu_ref_get - increment a percpu refcount
+ * @ref: percpu_ref to get
  *
  * Analogous to atomic_inc().
  */
@@ -99,6 +100,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 
 /**
  * percpu_ref_put - decrement a percpu refcount
+ * @ref: percpu_ref to put
  *
  * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 1a17399fc7db..9a78e55fa48f 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,8 +33,8 @@
 
 /**
  * percpu_ref_init - initialize a percpu refcount
- * @ref: ref to initialize
- * @release: function which will be called when refcount hits 0
+ * @ref: percpu_ref to initialize
+ * @release: function which will be called when refcount hits 0
  *
  * Initializes the refcount in single atomic counter mode with a refcount of 1;
  * analogous to atomic_set(ref, 1).
@@ -42,7 +42,7 @@
  * Note that @release must not sleep - it may potentially be called from RCU
  * callback context by percpu_ref_kill().
  */
-int percpu_ref_init(struct percpu_ref *ref, percpu_ref_release *release)
+int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
@@ -98,6 +98,7 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 
 /**
  * percpu_ref_kill - safely drop initial ref
+ * @ref: percpu_ref to kill
  *
  * Must be used to drop the initial ref on a percpu refcount; must be called
  * precisely once before shutdown.

From acac7883ee7bcc32476963bce7baf73d44574dd1 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 12 Jun 2013 20:52:01 -0700
Subject: [PATCH 5/8] percpu-refcount: add __must_check to percpu_ref_init()
 and don't use ACCESS_ONCE() in percpu_ref_kill_rcu()

Two small changes.

* Unlike most init functions, percpu_ref_init() allocates memory and
  may fail. Let's mark it with __must_check in case the caller
  forgets.

* percpu_ref_kill_rcu() is unnecessarily using ACCESS_ONCE() to
  dereference @ref->pcpu_count, which can be misleading. The pointer
  is guaranteed to be valid and visible and can't change underneath
  the function. Drop ACCESS_ONCE().

Signed-off-by: Tejun Heo
---
 include/linux/percpu-refcount.h | 3 ++-
 lib/percpu-refcount.c           | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index b61bd6f23985..8146aa9cd89e 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -66,7 +66,8 @@ struct percpu_ref {
 	struct rcu_head		rcu;
 };
 
-int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release);
+int __must_check percpu_ref_init(struct percpu_ref *ref,
+				 percpu_ref_func_t *release);
 void percpu_ref_kill(struct percpu_ref *ref);
 
 #define PCPU_STATUS_BITS	2
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9a78e55fa48f..b35eaac2954f 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -57,12 +57,10 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
 	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
-	unsigned __percpu *pcpu_count;
+	unsigned __percpu *pcpu_count = ref->pcpu_count;
 	unsigned count = 0;
 	int cpu;
 
-	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
-
 	/* Mask out PCPU_REF_DEAD */
 	pcpu_count = (unsigned __percpu *)
 		(((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK);

From bc497bd33b2d6a6f07bc8574b4764edbd7fdffa8 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 12 Jun 2013 20:52:35 -0700
Subject: [PATCH 6/8] percpu-refcount: implement percpu_ref_cancel_init()

Normally, percpu_ref_init() initializes and percpu_ref_kill()
initiates destruction which completes asynchronously. The asynchronous
destruction can be problematic in init failure path where the caller
wants to destroy a half-constructed object - distinguishing
half-constructed objects from the usual release method can be painful
for complex objects.

This patch implements percpu_ref_cancel_init() which synchronously
destroys the percpu_ref without invoking release. To avoid
unintentional misuses, the function requires the ref to have finished
percpu_ref_init() but never been used, and triggers WARN otherwise.

v2: Explain the weird name and usage restriction in the function
    comment.
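A sketch of the init-failure path this is meant for, reusing the
hypothetical my_obj/my_obj_release names from the earlier sketch; the
my_obj_setup() step is also made up:

static struct my_obj *my_obj_create(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;

	if (percpu_ref_init(&obj->ref, my_obj_release))
		goto err_free;

	if (my_obj_setup(obj))		/* hypothetical later init step */
		goto err_ref;

	return obj;

err_ref:
	/*
	 * The ref was initialized but never used as a refcount: tear it
	 * down synchronously, without invoking my_obj_release().
	 */
	percpu_ref_cancel_init(&obj->ref);
err_free:
	kfree(obj);
	return NULL;
}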
Signed-off-by: Tejun Heo
Acked-by: Kent Overstreet
---
 include/linux/percpu-refcount.h |  1 +
 lib/percpu-refcount.c           | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 8146aa9cd89e..6d843d60690d 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -68,6 +68,7 @@ struct percpu_ref {
 
 int __must_check percpu_ref_init(struct percpu_ref *ref,
 				 percpu_ref_func_t *release);
+void percpu_ref_cancel_init(struct percpu_ref *ref);
 void percpu_ref_kill(struct percpu_ref *ref);
 
 #define PCPU_STATUS_BITS	2
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index b35eaac2954f..ebeaac274cb9 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -54,6 +54,37 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 	return 0;
 }
 
+/**
+ * percpu_ref_cancel_init - cancel percpu_ref_init()
+ * @ref: percpu_ref to cancel init for
+ *
+ * Once a percpu_ref is initialized, its destruction is initiated by
+ * percpu_ref_kill() and completes asynchronously, which can be painful to
+ * do when destroying a half-constructed object in init failure path.
+ *
+ * This function destroys @ref without invoking @ref->release and the
+ * memory area containing it can be freed immediately on return. To
+ * prevent accidental misuse, it's required that @ref has finished
+ * percpu_ref_init(), whether successful or not, but never used.
+ *
+ * The weird name and usage restriction are to prevent people from using
+ * this function by mistake for normal shutdown instead of
+ * percpu_ref_kill().
+ */
+void percpu_ref_cancel_init(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count = ref->pcpu_count;
+	int cpu;
+
+	WARN_ON_ONCE(atomic_read(&ref->count) != 1 + PCPU_COUNT_BIAS);
+
+	if (pcpu_count) {
+		for_each_possible_cpu(cpu)
+			WARN_ON_ONCE(*per_cpu_ptr(pcpu_count, cpu));
+		free_percpu(ref->pcpu_count);
+	}
+}
+
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
 	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);

From dbece3a0f1ef0b19aff1cc6ed0942fec9ab98de1 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 13 Jun 2013 19:23:53 -0700
Subject: [PATCH 7/8] percpu-refcount: implement percpu_ref_tryget() along
 with percpu_ref_kill_and_confirm()

Implement percpu_ref_tryget() which stops giving out references once
the percpu_ref is visible as killed. Because the refcnt is per-cpu,
different CPUs will start to see a refcnt as killed at different
points in time and tryget() may continue to succeed on a subset of
cpus for a while after percpu_ref_kill() returns.

For use cases where it's necessary to know when all CPUs start to see
the refcnt as dead, percpu_ref_kill_and_confirm() is added. The new
function takes an extra argument @confirm_kill which is invoked when
the refcnt is guaranteed to be viewed as killed on all CPUs.

While this isn't the prettiest interface, it doesn't force synchronous
wait and is much safer than requiring the caller to do its own
call_rcu().

v2: Patch description rephrased to emphasize that tryget() may
    continue to succeed on some CPUs after kill() returns as suggested
    by Kent.

v3: Function comment in percpu_ref_kill_and_confirm() updated warning
    people to not depend on the implied RCU grace period from the
    confirm callback as it's an implementation detail.
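A sketch of the intended usage, extending the hypothetical my_obj from
the earlier sketches with a struct completion member; the global_obj
lookup source is also made up, only the percpu_ref calls come from the
patch:

struct my_obj {
	struct percpu_ref	ref;
	struct completion	confirm_done;
};

static struct my_obj __rcu *global_obj;	/* hypothetical lookup source */

static struct my_obj *my_obj_lookup(void)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(global_obj);
	if (obj && !percpu_ref_tryget(&obj->ref))
		obj = NULL;			/* killed; refuse new users */
	rcu_read_unlock();

	return obj;
}

/* Called from RCU callback context: must not block. */
static void my_obj_confirm_kill(struct percpu_ref *ref)
{
	struct my_obj *obj = container_of(ref, struct my_obj, ref);

	complete(&obj->confirm_done);
}

static void my_obj_shutdown(struct my_obj *obj)
{
	init_completion(&obj->confirm_done);
	RCU_INIT_POINTER(global_obj, NULL);	/* unpublish before killing */
	percpu_ref_kill_and_confirm(&obj->ref, my_obj_confirm_kill);
	wait_for_completion(&obj->confirm_done);
	/* no CPU can succeed in percpu_ref_tryget(&obj->ref) from here on */
}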
Signed-off-by: Tejun Heo
Slightly-Grumpily-Acked-by: Kent Overstreet
---
 include/linux/percpu-refcount.h | 50 ++++++++++++++++++++++++++++++++-
 lib/percpu-refcount.c           | 23 +++++++++++----
 2 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 6d843d60690d..dd2a08600453 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -63,13 +63,30 @@ struct percpu_ref {
 	 */
 	unsigned __percpu	*pcpu_count;
 	percpu_ref_func_t	*release;
+	percpu_ref_func_t	*confirm_kill;
 	struct rcu_head		rcu;
 };
 
 int __must_check percpu_ref_init(struct percpu_ref *ref,
 				 percpu_ref_func_t *release);
 void percpu_ref_cancel_init(struct percpu_ref *ref);
-void percpu_ref_kill(struct percpu_ref *ref);
+void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
+				 percpu_ref_func_t *confirm_kill);
+
+/**
+ * percpu_ref_kill - drop the initial ref
+ * @ref: percpu_ref to kill
+ *
+ * Must be used to drop the initial ref on a percpu refcount; must be called
+ * precisely once before shutdown.
+ *
+ * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
+ * percpu counters and dropping the initial ref.
+ */
+static inline void percpu_ref_kill(struct percpu_ref *ref)
+{
+	return percpu_ref_kill_and_confirm(ref, NULL);
+}
 
 #define PCPU_STATUS_BITS	2
 #define PCPU_STATUS_MASK	((1 << PCPU_STATUS_BITS) - 1)
@@ -100,6 +117,37 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 	rcu_read_unlock();
 }
 
+/**
+ * percpu_ref_tryget - try to increment a percpu refcount
+ * @ref: percpu_ref to try-get
+ *
+ * Increment a percpu refcount unless it has already been killed. Returns
+ * %true on success; %false on failure.
+ *
+ * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget
+ * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be
+ * used. After the confirm_kill callback is invoked, it's guaranteed that
+ * no new reference will be given out by percpu_ref_tryget().
+ */
+static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count;
+	int ret = false;
+
+	rcu_read_lock();
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) {
+		__this_cpu_inc(*pcpu_count);
+		ret = true;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /**
  * percpu_ref_put - decrement a percpu refcount
  * @ref: percpu_ref to put
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index ebeaac274cb9..8bf9e719cca0 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -118,6 +118,10 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 
 	atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);
 
+	/* @ref is viewed as dead on all CPUs, send out kill confirmation */
+	if (ref->confirm_kill)
+		ref->confirm_kill(ref);
+
 	/*
 	 * Now we're in single atomic_t mode with a consistent refcount, so it's
 	 * safe to drop our initial ref:
@@ -126,22 +130,29 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 }
 
 /**
- * percpu_ref_kill - safely drop initial ref
+ * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
  * @ref: percpu_ref to kill
+ * @confirm_kill: optional confirmation callback
  *
- * Must be used to drop the initial ref on a percpu refcount; must be called
- * precisely once before shutdown.
+ * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
+ * @confirm_kill is not NULL.
+ * @confirm_kill, which may not block, will be
+ * called after @ref is seen as dead from all CPUs - all further
+ * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget()
+ * for more details.
  *
- * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
- * percpu counters and dropping the initial ref.
+ * Due to the way percpu_ref is implemented, @confirm_kill will be called
+ * after at least one full RCU grace period has passed but this is an
+ * implementation detail and callers must not depend on it.
  */
-void percpu_ref_kill(struct percpu_ref *ref)
+void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
+				 percpu_ref_func_t *confirm_kill)
 {
 	WARN_ONCE(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD,
 		  "percpu_ref_kill() called more than once!\n");
 
 	ref->pcpu_count = (unsigned __percpu *)
 		(((unsigned long) ref->pcpu_count)|PCPU_REF_DEAD);
+	ref->confirm_kill = confirm_kill;
 
 	call_rcu(&ref->rcu, percpu_ref_kill_rcu);
 }

From a4244454df1296e90cc961c1b636b1176ef0d9a0 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sun, 16 Jun 2013 16:12:26 -0700
Subject: [PATCH 8/8] percpu-refcount: use RCU-sched instead of normal RCU

percpu-refcount was incorrectly using preempt_disable/enable() for RCU
critical sections against call_rcu(). 6a24474da8 ("percpu-refcount:
consistently use plain (non-sched) RCU") fixed it by replacing the
preemption operations with rcu_read_[un]lock(), citing that there isn't
any advantage in using sched-RCU over using the usual one; however,
rcu_read_[un]lock() for the preemptible RCU implementation -
CONFIG_TREE_PREEMPT_RCU, chosen when CONFIG_PREEMPT is set - are
slightly more expensive than preempt_disable/enable().

In a contrived microbench which repeats the following,

 - percpu_ref_get()
 - copy 32 bytes of data into percpu buffer
 - percpu_ref_put()
 - copy 32 bytes of data into percpu buffer

rcu_read_[un]lock() used in percpu_ref_get/put() makes it go slower by
about 15% when compared to using sched-RCU. As the RCU critical
sections are extremely short, using sched-RCU shouldn't have any
latency implications. Convert to RCU-sched.

Signed-off-by: Tejun Heo
Acked-by: Kent Overstreet
McKenney" Cc: Michal Hocko Cc: Rusty Russell --- include/linux/percpu-refcount.h | 12 ++++++------ lib/percpu-refcount.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index dd2a08600453..95961f0bf62d 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -105,7 +105,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) { unsigned __percpu *pcpu_count; - rcu_read_lock(); + rcu_read_lock_sched(); pcpu_count = ACCESS_ONCE(ref->pcpu_count); @@ -114,7 +114,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) else atomic_inc(&ref->count); - rcu_read_unlock(); + rcu_read_unlock_sched(); } /** @@ -134,7 +134,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) unsigned __percpu *pcpu_count; int ret = false; - rcu_read_lock(); + rcu_read_lock_sched(); pcpu_count = ACCESS_ONCE(ref->pcpu_count); @@ -143,7 +143,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) ret = true; } - rcu_read_unlock(); + rcu_read_unlock_sched(); return ret; } @@ -159,7 +159,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref) { unsigned __percpu *pcpu_count; - rcu_read_lock(); + rcu_read_lock_sched(); pcpu_count = ACCESS_ONCE(ref->pcpu_count); @@ -168,7 +168,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref) else if (unlikely(atomic_dec_and_test(&ref->count))) ref->release(ref); - rcu_read_unlock(); + rcu_read_unlock_sched(); } #endif diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8bf9e719cca0..7deeb6297a48 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -154,5 +154,5 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, (((unsigned long) ref->pcpu_count)|PCPU_REF_DEAD); ref->confirm_kill = confirm_kill; - call_rcu(&ref->rcu, percpu_ref_kill_rcu); + call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); }