From da521626ac620d8719d674a48b8ec3620eefd42a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Aug 2021 09:20:04 -0600 Subject: [PATCH 1/8] bio: optimize initialization of a bio The memset() used is measurably slower in targeted benchmarks, wasting about 1% of the total runtime, or 50% of the (later) hot path cached bio alloc. Get rid of it and fill in the bio manually. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1fab762e079b..6fa5c653283b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -246,12 +246,40 @@ static void bio_free(struct bio *bio) void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs) { - memset(bio, 0, sizeof(*bio)); + bio->bi_next = NULL; + bio->bi_bdev = NULL; + bio->bi_opf = 0; + bio->bi_flags = 0; + bio->bi_ioprio = 0; + bio->bi_write_hint = 0; + bio->bi_status = 0; + bio->bi_iter.bi_sector = 0; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_end_io = NULL; + bio->bi_private = NULL; +#ifdef CONFIG_BLK_CGROUP + bio->bi_blkg = NULL; + bio->bi_issue.value = 0; +#ifdef CONFIG_BLK_CGROUP_IOCOST + bio->bi_iocost_cost = 0; +#endif +#endif +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + bio->bi_crypt_context = NULL; +#endif +#ifdef CONFIG_BLK_DEV_INTEGRITY + bio->bi_integrity = NULL; +#endif + bio->bi_vcnt = 0; + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); - bio->bi_io_vec = table; bio->bi_max_vecs = max_vecs; + bio->bi_io_vec = table; + bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); From 6c7ef543df909dbdcd8cb24ef30627cba62a4e91 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 09:29:55 -0600 Subject: [PATCH 2/8] fs: add kiocb alloc cache flag If this kiocb can safely use the polled bio allocation cache, then this flag must be set. 
Generally this can be set for polled IO, where we will not see IRQ completions of the request. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..0dcc5de779c9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -319,6 +319,8 @@ enum rw_hint { /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) +/* can use bio alloc cache */ +#define IOCB_ALLOC_CACHE (1 << 21) struct kiocb { struct file *ki_filp; From be4d234d7aebbfe0c233bc20b9cdef7ab3408ff4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Mar 2021 11:37:47 -0700 Subject: [PATCH 3/8] bio: add allocation cache abstraction Add a per-cpu bio_set cache for bio allocations, enabling us to quickly recycle them instead of going through the slab allocator. This cache isn't IRQ safe, and hence is only really suitable for polled IO. Very simple - keeps a count of bio's in the cache, and maintains a max of 512 with a slack of 64. If we get above max + slack, we drop slack number of bio's. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 134 +++++++++++++++++++++++++++++++++---- include/linux/bio.h | 13 ++++ include/linux/blk_types.h | 1 + include/linux/cpuhotplug.h | 1 + 4 files changed, 135 insertions(+), 14 deletions(-) diff --git a/block/bio.c b/block/bio.c index 6fa5c653283b..dbb0bc8e1ef7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,6 +25,11 @@ #include "blk.h" #include "blk-rq-qos.h" +struct bio_alloc_cache { + struct bio_list free_list; + unsigned int nr; +}; + static struct biovec_slab { int nr_vecs; char *name; @@ -619,6 +624,53 @@ void guard_bio_eod(struct bio *bio) bio_truncate(bio, maxsector << 9); } +#define ALLOC_CACHE_MAX 512 +#define ALLOC_CACHE_SLACK 64 + +static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + unsigned int i = 0; + struct bio *bio; + + while ((bio = bio_list_pop(&cache->free_list)) != NULL) { + cache->nr--; + bio_free(bio); + if (++i == nr) + break; + } +} + +static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct bio_set *bs; + + bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); + if (bs->cache) { + struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); + + bio_alloc_cache_prune(cache, -1U); + } + return 0; +} + +static void bio_alloc_cache_destroy(struct bio_set *bs) +{ + int cpu; + + if (!bs->cache) + return; + + cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + for_each_possible_cpu(cpu) { + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bs->cache, cpu); + bio_alloc_cache_prune(cache, -1U); + } + free_percpu(bs->cache); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -629,16 +681,23 @@ void guard_bio_eod(struct bio *bio) **/ void bio_put(struct bio *bio) { - if (!bio_flagged(bio, BIO_REFFED)) - bio_free(bio); - else { + if (unlikely(bio_flagged(bio, BIO_REFFED))) { BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + if 
(!atomic_dec_and_test(&bio->__bi_cnt)) + return; + } - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->__bi_cnt)) - bio_free(bio); + if (bio_flagged(bio, BIO_PERCPU_CACHE)) { + struct bio_alloc_cache *cache; + + bio_uninit(bio); + cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + bio_list_add_head(&cache->free_list, bio); + if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) + bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); + put_cpu(); + } else { + bio_free(bio); } } EXPORT_SYMBOL(bio_put); @@ -1530,6 +1589,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries) */ void bioset_exit(struct bio_set *bs) { + bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; @@ -1591,12 +1651,18 @@ int bioset_init(struct bio_set *bs, biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; - if (!(flags & BIOSET_NEED_RESCUER)) - return 0; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; + if (flags & BIOSET_NEED_RESCUER) { + bs->rescue_workqueue = alloc_workqueue("bioset", + WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + } + if (flags & BIOSET_PERCPU_CACHE) { + bs->cache = alloc_percpu(struct bio_alloc_cache); + if (!bs->cache) + goto bad; + cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + } return 0; bad: @@ -1623,6 +1689,43 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); +/** + * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb + * @kiocb: kiocb describing the IO + * @bs: bio_set to allocate from + * + * Description: + * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only + * used to check if we should dip into the per-cpu bio_set allocation + * cache. The allocation uses GFP_KERNEL internally. 
+ * + */ +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, + struct bio_set *bs) +{ + struct bio_alloc_cache *cache; + struct bio *bio; + + if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) + return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + + cache = per_cpu_ptr(bs->cache, get_cpu()); + bio = bio_list_pop(&cache->free_list); + if (bio) { + cache->nr--; + put_cpu(); + bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); + bio->bi_pool = bs; + bio_set_flag(bio, BIO_PERCPU_CACHE); + return bio; + } + put_cpu(); + bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + bio_set_flag(bio, BIO_PERCPU_CACHE); + return bio; +} +EXPORT_SYMBOL_GPL(bio_alloc_kiocb); + static int __init init_bio(void) { int i; @@ -1637,6 +1740,9 @@ static int __init init_bio(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } + cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, + bio_cpu_dead); + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); diff --git a/include/linux/bio.h b/include/linux/bio.h index 2203b686e1f0..89ad28213b1d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -401,6 +401,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, enum { BIOSET_NEED_BVECS = BIT(0), BIOSET_NEED_RESCUER = BIT(1), + BIOSET_PERCPU_CACHE = BIT(2), }; extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); extern void bioset_exit(struct bio_set *); @@ -409,6 +410,8 @@ extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, struct bio_set *bs); +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, + struct bio_set *bs); struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); @@ -699,6 +702,11 @@ struct bio_set { struct kmem_cache *bio_slab; unsigned int front_pad; + /* + * per-cpu bio 
alloc cache + */ + struct bio_alloc_cache __percpu *cache; + mempool_t bio_pool; mempool_t bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -715,6 +723,11 @@ struct bio_set { struct bio_list rescue_list; struct work_struct rescue_work; struct workqueue_struct *rescue_workqueue; + + /* + * Hot un-plug notifier for the per-cpu cache, if used + */ + struct hlist_node cpuhp_dead; }; static inline bool bioset_initialized(struct bio_set *bs) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 290f9061b29a..f68d4e8c775e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -301,6 +301,7 @@ enum { BIO_TRACKED, /* set if bio goes through the rq_qos path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ + BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */ BIO_FLAG_LAST }; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f39b34b13871..fe72c8d6c980 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -46,6 +46,7 @@ enum cpuhp_state { CPUHP_ARM_OMAP_WAKE_DEAD, CPUHP_IRQ_POLL_DEAD, CPUHP_BLOCK_SOFTIRQ_DEAD, + CPUHP_BIO_DEAD, CPUHP_ACPI_CPUDRV_DEAD, CPUHP_S390_PFAULT_DEAD, CPUHP_BLK_MQ_DEAD, From be863b9e4348a791e360d25611a1bdde2c9595ed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Aug 2021 10:19:06 -0600 Subject: [PATCH 4/8] block: clear BIO_PERCPU_CACHE flag if polling isn't supported The bio alloc cache relies on the fact that a polled bio will complete in process context, clear the cacheable flag if we disable polling for a given bio. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4f8449b29b21..0d4d6b1e5d25 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -832,8 +832,11 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) } } - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) { + /* can't support alloc cache if we turn off polling */ + bio_clear_flag(bio, BIO_PERCPU_CACHE); bio->bi_opf &= ~REQ_HIPRI; + } switch (bio_op(bio)) { case REQ_OP_DISCARD: From 394918ebb889f99d89db6843bcc93279b2b745f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Mar 2021 11:40:23 -0700 Subject: [PATCH 5/8] io_uring: enable use of bio alloc cache Mark polled IO as being safe for dipping into the bio allocation cache, in case the targeted bio_set has it enabled. This brings an IOPOLL gen2 Optane QD=128 workload from ~3.2M IOPS to ~3.5M IOPS. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4f5a00707644..504aede8ca47 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2737,7 +2737,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) !kiocb->ki_filp->f_op->iopoll) return -EOPNOTSUPP; - kiocb->ki_flags |= IOCB_HIPRI; + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; } else { From 01cfa28af486c9df3775232f10c3dd7ba2e88318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Aug 2021 09:05:52 -0600 Subject: [PATCH 6/8] block: use the percpu bio cache in __blkdev_direct_IO Use bio_alloc_kiocb to dip into the percpu cache of bios when the caller asks for it. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..3c7fb7106713 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -385,7 +385,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -513,7 +513,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); + return bioset_init(&blkdev_dio_pool, 4, + offsetof(struct blkdev_dio, bio), + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init); From 270a1c913ebd745ebee716af5f7215e1c2b30cc0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Aug 2021 11:42:53 -0600 Subject: [PATCH 7/8] block: provide bio_clear_hipri() helper Any case that turns off REQ_HIPRI must also clear BIO_PERCPU_CACHE, as non-polled IO may complete through hard/soft IRQ and hence isn't safe for our polled bio alloc cache. Provide a helper that does just that, and use it in the merging code as well if we split a bio and turn off polling. 
Fixes: be863b9e4348 ("block: clear BIO_PERCPU_CACHE flag if polling isn't supported") Reported-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++----- block/blk-merge.c | 2 +- block/blk.h | 7 +++++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0d4d6b1e5d25..f35d401e65f8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -832,11 +832,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) } } - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) { - /* can't support alloc cache if we turn off polling */ - bio_clear_flag(bio, BIO_PERCPU_CACHE); - bio->bi_opf &= ~REQ_HIPRI; - } + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + bio_clear_hipri(bio); switch (bio_op(bio)) { case REQ_OP_DISCARD: diff --git a/block/blk-merge.c b/block/blk-merge.c index a11b3b53717e..bc25ad409fc1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -285,7 +285,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * iopoll in direct IO routine. Given performance gain of iopoll for * big IO can be trival, disable iopoll when split needed. 
*/ - bio->bi_opf &= ~REQ_HIPRI; + bio_clear_hipri(bio); return bio_split(bio, sectors, GFP_NOIO, bs); } diff --git a/block/blk.h b/block/blk.h index cb01429c162c..5a4652a10931 100644 --- a/block/blk.h +++ b/block/blk.h @@ -364,4 +364,11 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; +static inline void bio_clear_hipri(struct bio *bio) +{ + /* can't support alloc cache if we turn off polling */ + bio_clear_flag(bio, BIO_PERCPU_CACHE); + bio->bi_opf &= ~REQ_HIPRI; +} + #endif /* BLK_INTERNAL_H */ From 3d5b3fbedad65088ec079a4c4d1a2f47e11ae1e7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Aug 2021 07:53:09 -0600 Subject: [PATCH 8/8] bio: improve kerneldoc documentation for bio_alloc_kiocb() We're missing a description for the 'nr_vecs' parameter. While in there, clarify that freeing a bio allocated through this function must be done from process context. Fixes: 1cbbd31c4ada ("bio: add allocation cache abstraction") Reported-by: Stephen Rothwell Signed-off-by: Jens Axboe --- block/bio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index dbb0bc8e1ef7..ef88fa3afe4c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1692,12 +1692,15 @@ EXPORT_SYMBOL(bioset_init_from_src); /** * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb * @kiocb: kiocb describing the IO + * @nr_vecs: number of iovecs to pre-allocate * @bs: bio_set to allocate from * * Description: * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only * used to check if we should dip into the per-cpu bio_set allocation - * cache. The allocation uses GFP_KERNEL internally. + * cache. The allocation uses GFP_KERNEL internally. On return, the + * bio is marked BIO_PERCPU_CACHE, and the final put of the bio + * MUST be done from process context, not hard/soft IRQ. 
* */ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,